{ "best_global_step": 1400, "best_metric": 0.5177596838523945, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 25, "global_step": 6045, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 275.25, "completions/min_length": 220.0, "epoch": 0.0008271298593879239, "grad_norm": 0.20617489516735077, "kl": 0.0, "learning_rate": 8.264462809917357e-08, "loss": -3.650784492492676e-07, "memory(GiB)": 36.96, "reward": 0.2268388867378235, "reward_std": 0.04408634826540947, "rewards/VisualizationJSONCombinedORM/mean": 0.2268388867378235, "rewards/VisualizationJSONCombinedORM/std": 0.10921228677034378, "step": 1, "train_speed(iter/s)": 0.009603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 264.0, "completions/min_length": 224.0, "epoch": 0.0016542597187758478, "grad_norm": 0.16278743743896484, "kl": 0.0, "learning_rate": 1.6528925619834713e-07, "loss": 1.9371509552001953e-07, "memory(GiB)": 38.07, "reward": 0.28399160504341125, "reward_std": 0.032923515886068344, "rewards/VisualizationJSONCombinedORM/mean": 0.28399160504341125, "rewards/VisualizationJSONCombinedORM/std": 0.05161440372467041, "step": 2, "train_speed(iter/s)": 0.015636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 282.3125, "completions/min_length": 219.0, "epoch": 0.0024813895781637717, "grad_norm": 0.17665378749370575, "kl": 0.0005412101745605469, "learning_rate": 2.4793388429752067e-07, "loss": 5.451962351799011e-06, "memory(GiB)": 38.33, "reward": 0.43657755851745605, "reward_std": 0.058388255536556244, "rewards/VisualizationJSONCombinedORM/mean": 0.43657755851745605, "rewards/VisualizationJSONCombinedORM/std": 0.23954744637012482, "step": 3, "train_speed(iter/s)": 0.019662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 263.8125, "completions/min_length": 217.0, "epoch": 0.0033085194375516956, "grad_norm": 0.17892757058143616, "kl": 0.0005612373352050781, "learning_rate": 3.3057851239669426e-07, "loss": 5.520880222320557e-06, "memory(GiB)": 38.33, "reward": 0.36957865953445435, "reward_std": 0.11263839900493622, "rewards/VisualizationJSONCombinedORM/mean": 0.36957865953445435, "rewards/VisualizationJSONCombinedORM/std": 0.22434937953948975, "step": 4, "train_speed(iter/s)": 0.022761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 275.5625, "completions/min_length": 218.0, "epoch": 0.0041356492969396195, "grad_norm": 0.17141889035701752, "kl": 0.0004563331604003906, "learning_rate": 4.132231404958678e-07, "loss": 4.366040229797363e-06, "memory(GiB)": 38.33, "reward": 0.2779262661933899, "reward_std": 0.053357817232608795, "rewards/VisualizationJSONCombinedORM/mean": 0.2779262661933899, "rewards/VisualizationJSONCombinedORM/std": 0.07058914005756378, "step": 5, "train_speed(iter/s)": 0.025205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 273.0625, "completions/min_length": 228.0, "epoch": 0.004962779156327543, "grad_norm": 0.23015904426574707, "kl": 0.0006504058837890625, "learning_rate": 4.958677685950413e-07, "loss": 6.355345249176025e-06, "memory(GiB)": 38.33, "reward": 0.16393393278121948, "reward_std": 0.017604481428861618, "rewards/VisualizationJSONCombinedORM/mean": 0.16393393278121948, "rewards/VisualizationJSONCombinedORM/std": 0.025659462437033653, "step": 6, "train_speed(iter/s)": 0.027418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 256.3125, "completions/min_length": 221.0, "epoch": 0.005789909015715467, "grad_norm": 0.18158496916294098, "kl": 0.00040912628173828125, "learning_rate": 5.78512396694215e-07, "loss": 3.986060619354248e-06, "memory(GiB)": 38.33, "reward": 0.18955692648887634, "reward_std": 0.038205139338970184, "rewards/VisualizationJSONCombinedORM/mean": 0.18955692648887634, "rewards/VisualizationJSONCombinedORM/std": 0.03833550214767456, "step": 7, "train_speed(iter/s)": 0.028924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 269.5, "completions/min_length": 231.0, "epoch": 0.006617038875103391, "grad_norm": 0.2121194303035736, "kl": 0.0006380081176757812, "learning_rate": 6.611570247933885e-07, "loss": 6.2426552176475525e-06, "memory(GiB)": 38.33, "reward": 0.3960614502429962, "reward_std": 0.06805281341075897, "rewards/VisualizationJSONCombinedORM/mean": 0.3960614502429962, "rewards/VisualizationJSONCombinedORM/std": 0.2947322130203247, "step": 8, "train_speed(iter/s)": 0.030511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 286.6875, "completions/min_length": 251.0, "epoch": 0.007444168734491315, "grad_norm": 0.18491621315479279, "kl": 0.0006732940673828125, "learning_rate": 7.438016528925621e-07, "loss": 6.549060344696045e-06, "memory(GiB)": 38.6, "reward": 0.3475692868232727, "reward_std": 0.08901970088481903, "rewards/VisualizationJSONCombinedORM/mean": 0.3475692868232727, "rewards/VisualizationJSONCombinedORM/std": 0.09262581914663315, "step": 9, "train_speed(iter/s)": 0.031115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 286.0, "completions/min_length": 236.0, "epoch": 0.008271298593879239, "grad_norm": 0.19188393652439117, "kl": 0.0005388259887695312, "learning_rate": 8.264462809917356e-07, "loss": 5.465000867843628e-06, "memory(GiB)": 38.6, "reward": 0.3478381037712097, "reward_std": 0.09898707270622253, "rewards/VisualizationJSONCombinedORM/mean": 0.3478381037712097, "rewards/VisualizationJSONCombinedORM/std": 0.2281455099582672, "step": 10, "train_speed(iter/s)": 0.032294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 252.875, "completions/min_length": 209.0, "epoch": 0.009098428453267164, "grad_norm": 0.1961890310049057, "kl": 0.0006208419799804688, "learning_rate": 9.090909090909091e-07, "loss": 6.601214408874512e-06, "memory(GiB)": 38.6, "reward": 0.2119571566581726, "reward_std": 0.028602758422493935, "rewards/VisualizationJSONCombinedORM/mean": 0.2119571566581726, "rewards/VisualizationJSONCombinedORM/std": 0.032155342400074005, "step": 11, "train_speed(iter/s)": 0.033339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 270.125, "completions/min_length": 228.0, "epoch": 0.009925558312655087, "grad_norm": 0.19100701808929443, "kl": 0.0006513595581054688, "learning_rate": 9.917355371900827e-07, "loss": 6.22868537902832e-06, "memory(GiB)": 38.6, "reward": 0.3015629053115845, "reward_std": 0.03434445336461067, "rewards/VisualizationJSONCombinedORM/mean": 0.3015629053115845, "rewards/VisualizationJSONCombinedORM/std": 0.07573263347148895, "step": 12, "train_speed(iter/s)": 0.0336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 259.6875, "completions/min_length": 221.0, "epoch": 0.010752688172043012, "grad_norm": 0.1881749927997589, "kl": 0.0005564689636230469, "learning_rate": 1.0743801652892562e-06, "loss": 5.081295967102051e-06, "memory(GiB)": 38.6, "reward": 0.17475785315036774, "reward_std": 0.021660517901182175, "rewards/VisualizationJSONCombinedORM/mean": 0.17475785315036774, "rewards/VisualizationJSONCombinedORM/std": 0.035188861191272736, "step": 13, "train_speed(iter/s)": 0.034039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 269.5625, "completions/min_length": 244.0, "epoch": 0.011579818031430935, "grad_norm": 0.20712636411190033, "kl": 0.0006303787231445312, "learning_rate": 1.15702479338843e-06, "loss": 6.251037120819092e-06, "memory(GiB)": 38.6, "reward": 0.4451269507408142, "reward_std": 0.09017834067344666, "rewards/VisualizationJSONCombinedORM/mean": 0.4451269507408142, "rewards/VisualizationJSONCombinedORM/std": 0.2310655415058136, "step": 14, "train_speed(iter/s)": 0.034495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 297.75, "completions/min_length": 250.0, "epoch": 0.01240694789081886, "grad_norm": 0.19225679337978363, "kl": 0.0009307861328125, "learning_rate": 1.2396694214876035e-06, "loss": 9.34302806854248e-06, "memory(GiB)": 38.6, "reward": 0.21202747523784637, "reward_std": 0.037484221160411835, "rewards/VisualizationJSONCombinedORM/mean": 0.21202747523784637, "rewards/VisualizationJSONCombinedORM/std": 0.043457869440317154, "step": 15, "train_speed(iter/s)": 0.034821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/mean_length": 249.8125, "completions/min_length": 218.0, "epoch": 0.013234077750206782, "grad_norm": 0.2092384546995163, "kl": 0.000629425048828125, "learning_rate": 1.322314049586777e-06, "loss": 6.15045428276062e-06, "memory(GiB)": 38.78, "reward": 0.5543199777603149, "reward_std": 0.16598990559577942, "rewards/VisualizationJSONCombinedORM/mean": 0.5543199777603149, "rewards/VisualizationJSONCombinedORM/std": 0.1618148386478424, "step": 16, "train_speed(iter/s)": 0.034785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 277.1875, "completions/min_length": 235.0, "epoch": 0.014061207609594707, "grad_norm": 0.1963672637939453, "kl": 0.0007390975952148438, "learning_rate": 1.4049586776859506e-06, "loss": 6.8247318267822266e-06, "memory(GiB)": 38.78, "reward": 0.5309390425682068, "reward_std": 0.05354722589254379, "rewards/VisualizationJSONCombinedORM/mean": 0.5309390425682068, "rewards/VisualizationJSONCombinedORM/std": 0.26435336470603943, "step": 17, "train_speed(iter/s)": 0.035032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 267.8125, "completions/min_length": 234.0, "epoch": 0.01488833746898263, "grad_norm": 0.20986706018447876, "kl": 0.0006914138793945312, "learning_rate": 1.4876033057851241e-06, "loss": 6.884336471557617e-06, "memory(GiB)": 38.78, "reward": 0.219796821475029, "reward_std": 0.046677421778440475, "rewards/VisualizationJSONCombinedORM/mean": 0.219796821475029, "rewards/VisualizationJSONCombinedORM/std": 0.0504375621676445, "step": 18, "train_speed(iter/s)": 0.035473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 283.5, "completions/min_length": 235.0, "epoch": 0.015715467328370553, "grad_norm": 0.20454968512058258, "kl": 0.0007762908935546875, "learning_rate": 1.5702479338842977e-06, "loss": 7.506459951400757e-06, "memory(GiB)": 38.78, "reward": 0.38683342933654785, "reward_std": 0.02932591550052166, "rewards/VisualizationJSONCombinedORM/mean": 0.38683342933654785, "rewards/VisualizationJSONCombinedORM/std": 0.16101324558258057, "step": 19, "train_speed(iter/s)": 0.035626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 282.0625, "completions/min_length": 220.0, "epoch": 0.016542597187758478, "grad_norm": 0.18690010905265808, "kl": 0.0007762908935546875, "learning_rate": 1.6528925619834712e-06, "loss": 8.07642936706543e-06, "memory(GiB)": 38.78, "reward": 0.271468847990036, "reward_std": 0.07573415338993073, "rewards/VisualizationJSONCombinedORM/mean": 0.271468847990036, "rewards/VisualizationJSONCombinedORM/std": 0.1866159588098526, "step": 20, "train_speed(iter/s)": 0.035653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 289.0, "completions/min_length": 242.0, "epoch": 0.017369727047146403, "grad_norm": 0.17983175814151764, "kl": 0.0006799697875976562, "learning_rate": 1.7355371900826448e-06, "loss": 6.917864084243774e-06, "memory(GiB)": 38.78, "reward": 0.5290386080741882, "reward_std": 0.06914424896240234, "rewards/VisualizationJSONCombinedORM/mean": 0.5290386080741882, "rewards/VisualizationJSONCombinedORM/std": 0.21853314340114594, "step": 21, "train_speed(iter/s)": 0.035778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 262.625, "completions/min_length": 233.0, "epoch": 0.018196856906534328, "grad_norm": 0.20821766555309296, "kl": 0.0007028579711914062, "learning_rate": 1.8181818181818183e-06, "loss": 7.02589750289917e-06, "memory(GiB)": 38.78, "reward": 0.44192224740982056, "reward_std": 0.07075139880180359, "rewards/VisualizationJSONCombinedORM/mean": 0.44192224740982056, "rewards/VisualizationJSONCombinedORM/std": 0.1982736438512802, "step": 22, "train_speed(iter/s)": 0.035911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 271.75, "completions/min_length": 225.0, "epoch": 0.01902398676592225, "grad_norm": 0.187339186668396, "kl": 0.0007386207580566406, "learning_rate": 1.900826446280992e-06, "loss": 7.934868335723877e-06, "memory(GiB)": 38.78, "reward": 0.2529742121696472, "reward_std": 0.033710841089487076, "rewards/VisualizationJSONCombinedORM/mean": 0.2529742121696472, "rewards/VisualizationJSONCombinedORM/std": 0.07466588169336319, "step": 23, "train_speed(iter/s)": 0.03608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 290.9375, "completions/min_length": 227.0, "epoch": 0.019851116625310174, "grad_norm": 0.19307929277420044, "kl": 0.0007381439208984375, "learning_rate": 1.9834710743801654e-06, "loss": 7.212162017822266e-06, "memory(GiB)": 38.78, "reward": 0.4380393326282501, "reward_std": 0.11039689928293228, "rewards/VisualizationJSONCombinedORM/mean": 0.4380393326282501, "rewards/VisualizationJSONCombinedORM/std": 0.24256671965122223, "step": 24, "train_speed(iter/s)": 0.036049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 291.25, "completions/min_length": 231.0, "epoch": 0.0206782464846981, "grad_norm": 0.17635561525821686, "kl": 0.0006780624389648438, "learning_rate": 2.066115702479339e-06, "loss": 6.8247318267822266e-06, "memory(GiB)": 38.78, "reward": 0.4731794595718384, "reward_std": 0.05948401987552643, "rewards/VisualizationJSONCombinedORM/mean": 0.4731794595718384, "rewards/VisualizationJSONCombinedORM/std": 0.29066339135169983, "step": 25, "train_speed(iter/s)": 0.035983 }, { "epoch": 0.0206782464846981, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 327.1666666666667, "eval_completions/mean_length": 281.640625, "eval_completions/min_length": 242.08333333333334, "eval_kl": 0.0006991227467854818, "eval_loss": 6.944561846466968e-06, "eval_reward": 0.28845673116544884, "eval_reward_std": 0.04784151667263359, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.28845673116544884, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04784151564429825, "eval_runtime": 287.8171, "eval_samples_per_second": 0.083, "eval_steps_per_second": 0.01, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 281.75, "completions/min_length": 244.0, "epoch": 0.021505376344086023, "grad_norm": 0.18464083969593048, "kl": 0.0006866455078125, "learning_rate": 2.1487603305785124e-06, "loss": 6.6980719566345215e-06, "memory(GiB)": 38.78, "reward": 0.46923601627349854, "reward_std": 0.11093136668205261, "rewards/VisualizationJSONCombinedORM/mean": 0.46923601627349854, "rewards/VisualizationJSONCombinedORM/std": 0.11688430607318878, "step": 26, "train_speed(iter/s)": 0.025917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 297.9375, "completions/min_length": 226.0, "epoch": 0.022332506203473945, "grad_norm": 0.1909973919391632, "kl": 0.0007228851318359375, "learning_rate": 2.231404958677686e-06, "loss": 7.119029760360718e-06, "memory(GiB)": 38.78, "reward": 0.41625693440437317, "reward_std": 0.03693682327866554, "rewards/VisualizationJSONCombinedORM/mean": 0.41625693440437317, "rewards/VisualizationJSONCombinedORM/std": 0.309472918510437, "step": 27, "train_speed(iter/s)": 0.026205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 273.0625, "completions/min_length": 227.0, "epoch": 0.02315963606286187, "grad_norm": 0.18573859333992004, "kl": 0.0007352828979492188, "learning_rate": 2.31404958677686e-06, "loss": 7.120892405509949e-06, "memory(GiB)": 38.78, "reward": 0.3249530792236328, "reward_std": 0.09077966213226318, "rewards/VisualizationJSONCombinedORM/mean": 0.3249530792236328, "rewards/VisualizationJSONCombinedORM/std": 0.2549353539943695, "step": 28, "train_speed(iter/s)": 0.026556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 316.4375, "completions/min_length": 235.0, "epoch": 0.023986765922249794, "grad_norm": 0.1797690987586975, "kl": 0.000614166259765625, "learning_rate": 2.3966942148760335e-06, "loss": 6.4820051193237305e-06, "memory(GiB)": 38.78, "reward": 0.13540390133857727, "reward_std": 0.013587342575192451, "rewards/VisualizationJSONCombinedORM/mean": 0.13540390133857727, "rewards/VisualizationJSONCombinedORM/std": 0.022643154487013817, "step": 29, "train_speed(iter/s)": 0.026699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 311.0625, "completions/min_length": 242.0, "epoch": 0.02481389578163772, "grad_norm": 0.1915312260389328, "kl": 0.0006341934204101562, "learning_rate": 2.479338842975207e-06, "loss": 6.5267086029052734e-06, "memory(GiB)": 38.78, "reward": 0.45092111825942993, "reward_std": 0.08471018075942993, "rewards/VisualizationJSONCombinedORM/mean": 0.45092111825942993, "rewards/VisualizationJSONCombinedORM/std": 0.19151414930820465, "step": 30, "train_speed(iter/s)": 0.027012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 264.0625, "completions/min_length": 222.0, "epoch": 0.02564102564102564, "grad_norm": 0.2060600072145462, "kl": 0.000995635986328125, "learning_rate": 2.56198347107438e-06, "loss": 9.94047150015831e-06, "memory(GiB)": 38.78, "reward": 0.45561057329177856, "reward_std": 0.08985652029514313, "rewards/VisualizationJSONCombinedORM/mean": 0.45561057329177856, "rewards/VisualizationJSONCombinedORM/std": 0.20273970067501068, "step": 31, "train_speed(iter/s)": 0.027326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 269.8125, "completions/min_length": 225.0, "epoch": 0.026468155500413565, "grad_norm": 0.20699529349803925, "kl": 0.000759124755859375, "learning_rate": 2.644628099173554e-06, "loss": 7.353723049163818e-06, "memory(GiB)": 38.78, "reward": 0.28461015224456787, "reward_std": 0.036962129175662994, "rewards/VisualizationJSONCombinedORM/mean": 0.28461015224456787, "rewards/VisualizationJSONCombinedORM/std": 0.05740722268819809, "step": 32, "train_speed(iter/s)": 0.027565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 285.8125, "completions/min_length": 215.0, "epoch": 0.02729528535980149, "grad_norm": 0.2014593929052353, "kl": 0.0010824203491210938, "learning_rate": 2.7272727272727272e-06, "loss": 1.084059476852417e-05, "memory(GiB)": 38.78, "reward": 0.318606436252594, "reward_std": 0.09085272252559662, "rewards/VisualizationJSONCombinedORM/mean": 0.318606436252594, "rewards/VisualizationJSONCombinedORM/std": 0.12434961646795273, "step": 33, "train_speed(iter/s)": 0.027833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 337.1875, "completions/min_length": 268.0, "epoch": 0.028122415219189414, "grad_norm": 0.1787831038236618, "kl": 0.00098419189453125, "learning_rate": 2.809917355371901e-06, "loss": 9.402632713317871e-06, "memory(GiB)": 38.78, "reward": 0.19949863851070404, "reward_std": 0.012751984409987926, "rewards/VisualizationJSONCombinedORM/mean": 0.19949863851070404, "rewards/VisualizationJSONCombinedORM/std": 0.03698711097240448, "step": 34, "train_speed(iter/s)": 0.028069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 276.25, "completions/min_length": 210.0, "epoch": 0.028949545078577336, "grad_norm": 0.2096458226442337, "kl": 0.001129150390625, "learning_rate": 2.8925619834710743e-06, "loss": 1.142546534538269e-05, "memory(GiB)": 38.78, "reward": 0.4666332006454468, "reward_std": 0.09783059358596802, "rewards/VisualizationJSONCombinedORM/mean": 0.4666332006454468, "rewards/VisualizationJSONCombinedORM/std": 0.17987503111362457, "step": 35, "train_speed(iter/s)": 0.028319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 275.0625, "completions/min_length": 230.0, "epoch": 0.02977667493796526, "grad_norm": 0.1825750172138214, "kl": 0.00098419189453125, "learning_rate": 2.9752066115702483e-06, "loss": 9.709969162940979e-06, "memory(GiB)": 38.78, "reward": 0.41184288263320923, "reward_std": 0.04293879121541977, "rewards/VisualizationJSONCombinedORM/mean": 0.41184288263320923, "rewards/VisualizationJSONCombinedORM/std": 0.2820163667201996, "step": 36, "train_speed(iter/s)": 0.028574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 271.125, "completions/min_length": 237.0, "epoch": 0.030603804797353185, "grad_norm": 0.17744269967079163, "kl": 0.001026153564453125, "learning_rate": 3.0578512396694214e-06, "loss": 1.0833144187927246e-05, "memory(GiB)": 38.78, "reward": 0.2871443033218384, "reward_std": 0.02447727881371975, "rewards/VisualizationJSONCombinedORM/mean": 0.2871443033218384, "rewards/VisualizationJSONCombinedORM/std": 0.10985644906759262, "step": 37, "train_speed(iter/s)": 0.028822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 280.9375, "completions/min_length": 230.0, "epoch": 0.03143093465674111, "grad_norm": 0.17119137942790985, "kl": 0.0013942718505859375, "learning_rate": 3.1404958677685953e-06, "loss": 1.4103949069976807e-05, "memory(GiB)": 38.78, "reward": 0.2506603002548218, "reward_std": 0.0768154114484787, "rewards/VisualizationJSONCombinedORM/mean": 0.2506603002548218, "rewards/VisualizationJSONCombinedORM/std": 0.1251477301120758, "step": 38, "train_speed(iter/s)": 0.029029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 269.625, "completions/min_length": 238.0, "epoch": 0.03225806451612903, "grad_norm": 0.20039072632789612, "kl": 0.0012969970703125, "learning_rate": 3.2231404958677685e-06, "loss": 1.2919306755065918e-05, "memory(GiB)": 38.78, "reward": 0.27569255232810974, "reward_std": 0.04444187134504318, "rewards/VisualizationJSONCombinedORM/mean": 0.27569255232810974, "rewards/VisualizationJSONCombinedORM/std": 0.045470573008060455, "step": 39, "train_speed(iter/s)": 0.029294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 261.625, "completions/min_length": 221.0, "epoch": 0.033085194375516956, "grad_norm": 0.16562272608280182, "kl": 0.0011243820190429688, "learning_rate": 3.3057851239669424e-06, "loss": 1.0885298252105713e-05, "memory(GiB)": 38.78, "reward": 0.19930918514728546, "reward_std": 0.03888659179210663, "rewards/VisualizationJSONCombinedORM/mean": 0.19930918514728546, "rewards/VisualizationJSONCombinedORM/std": 0.08251883089542389, "step": 40, "train_speed(iter/s)": 0.029454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 308.5625, "completions/min_length": 240.0, "epoch": 0.03391232423490488, "grad_norm": 0.20006558299064636, "kl": 0.001560211181640625, "learning_rate": 3.388429752066116e-06, "loss": 1.5579164028167725e-05, "memory(GiB)": 38.78, "reward": 0.23638151586055756, "reward_std": 0.03646443784236908, "rewards/VisualizationJSONCombinedORM/mean": 0.23638151586055756, "rewards/VisualizationJSONCombinedORM/std": 0.10036057978868484, "step": 41, "train_speed(iter/s)": 0.029681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 285.875, "completions/min_length": 223.0, "epoch": 0.034739454094292806, "grad_norm": 0.21244677901268005, "kl": 0.001987457275390625, "learning_rate": 3.4710743801652895e-06, "loss": 1.9945204257965088e-05, "memory(GiB)": 38.78, "reward": 0.49223023653030396, "reward_std": 0.05785026401281357, "rewards/VisualizationJSONCombinedORM/mean": 0.49223023653030396, "rewards/VisualizationJSONCombinedORM/std": 0.17838872969150543, "step": 42, "train_speed(iter/s)": 0.029838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 289.375, "completions/min_length": 221.0, "epoch": 0.03556658395368073, "grad_norm": 0.1934722661972046, "kl": 0.002933502197265625, "learning_rate": 3.553719008264463e-06, "loss": 2.9131770133972168e-05, "memory(GiB)": 38.78, "reward": 0.43273746967315674, "reward_std": 0.0938977599143982, "rewards/VisualizationJSONCombinedORM/mean": 0.43273746967315674, "rewards/VisualizationJSONCombinedORM/std": 0.19893603026866913, "step": 43, "train_speed(iter/s)": 0.029995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 257.125, "completions/min_length": 204.0, "epoch": 0.036393713813068655, "grad_norm": 0.16338427364826202, "kl": 0.0016918182373046875, "learning_rate": 3.6363636363636366e-06, "loss": 1.722201704978943e-05, "memory(GiB)": 38.78, "reward": 0.2409389615058899, "reward_std": 0.0227540023624897, "rewards/VisualizationJSONCombinedORM/mean": 0.2409389615058899, "rewards/VisualizationJSONCombinedORM/std": 0.10587932914495468, "step": 44, "train_speed(iter/s)": 0.030118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 294.4375, "completions/min_length": 243.0, "epoch": 0.03722084367245657, "grad_norm": 0.20093360543251038, "kl": 0.003307342529296875, "learning_rate": 3.71900826446281e-06, "loss": 3.291666507720947e-05, "memory(GiB)": 38.78, "reward": 0.34354880452156067, "reward_std": 0.06569940596818924, "rewards/VisualizationJSONCombinedORM/mean": 0.34354880452156067, "rewards/VisualizationJSONCombinedORM/std": 0.09633155912160873, "step": 45, "train_speed(iter/s)": 0.03028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 269.625, "completions/min_length": 221.0, "epoch": 0.0380479735318445, "grad_norm": 0.18240894377231598, "kl": 0.0030422210693359375, "learning_rate": 3.801652892561984e-06, "loss": 3.0644237995147705e-05, "memory(GiB)": 38.78, "reward": 0.3689318895339966, "reward_std": 0.05348683148622513, "rewards/VisualizationJSONCombinedORM/mean": 0.3689318895339966, "rewards/VisualizationJSONCombinedORM/std": 0.16262900829315186, "step": 46, "train_speed(iter/s)": 0.030482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 299.125, "completions/min_length": 243.0, "epoch": 0.03887510339123242, "grad_norm": 0.20712299644947052, "kl": 0.00476837158203125, "learning_rate": 3.884297520661157e-06, "loss": 4.772841930389404e-05, "memory(GiB)": 38.78, "reward": 0.5095909833908081, "reward_std": 0.11901424825191498, "rewards/VisualizationJSONCombinedORM/mean": 0.5095909833908081, "rewards/VisualizationJSONCombinedORM/std": 0.1323833018541336, "step": 47, "train_speed(iter/s)": 0.030594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 282.3125, "completions/min_length": 233.0, "epoch": 0.03970223325062035, "grad_norm": 0.21468938887119293, "kl": 0.005123138427734375, "learning_rate": 3.966942148760331e-06, "loss": 5.132704973220825e-05, "memory(GiB)": 38.78, "reward": 0.35828346014022827, "reward_std": 0.06582397222518921, "rewards/VisualizationJSONCombinedORM/mean": 0.35828346014022827, "rewards/VisualizationJSONCombinedORM/std": 0.15606869757175446, "step": 48, "train_speed(iter/s)": 0.030758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 305.8125, "completions/min_length": 233.0, "epoch": 0.04052936311000827, "grad_norm": 0.19921226799488068, "kl": 0.003383636474609375, "learning_rate": 4.049586776859504e-06, "loss": 3.363192081451416e-05, "memory(GiB)": 38.78, "reward": 0.17606136202812195, "reward_std": 0.03202737495303154, "rewards/VisualizationJSONCombinedORM/mean": 0.17606136202812195, "rewards/VisualizationJSONCombinedORM/std": 0.044644128531217575, "step": 49, "train_speed(iter/s)": 0.030782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 261.0625, "completions/min_length": 217.0, "epoch": 0.0413564929693962, "grad_norm": 0.1917077898979187, "kl": 0.008392333984375, "learning_rate": 4.132231404958678e-06, "loss": 8.423253893852234e-05, "memory(GiB)": 38.78, "reward": 0.1938277631998062, "reward_std": 0.029320189729332924, "rewards/VisualizationJSONCombinedORM/mean": 0.1938277631998062, "rewards/VisualizationJSONCombinedORM/std": 0.04847441986203194, "step": 50, "train_speed(iter/s)": 0.030832 }, { "epoch": 0.0413564929693962, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 328.875, "eval_completions/mean_length": 279.125, "eval_completions/min_length": 240.29166666666666, "eval_kl": 0.0070209503173828125, "eval_loss": 7.088357961038128e-05, "eval_reward": 0.3181192570676406, "eval_reward_std": 0.05552327272016555, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.3181192570676406, "eval_rewards/VisualizationJSONCombinedORM/std": 0.055523274621615805, "eval_runtime": 287.7334, "eval_samples_per_second": 0.083, "eval_steps_per_second": 0.01, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 261.6875, "completions/min_length": 229.0, "epoch": 0.04218362282878412, "grad_norm": 0.1804126799106598, "kl": 0.008758544921875, "learning_rate": 4.214876033057851e-06, "loss": 8.777529001235962e-05, "memory(GiB)": 38.78, "reward": 0.2152000069618225, "reward_std": 0.035984575748443604, "rewards/VisualizationJSONCombinedORM/mean": 0.2152000069618225, "rewards/VisualizationJSONCombinedORM/std": 0.05432786047458649, "step": 51, "train_speed(iter/s)": 0.026361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 273.4375, "completions/min_length": 238.0, "epoch": 0.043010752688172046, "grad_norm": 0.1935201734304428, "kl": 0.00485992431640625, "learning_rate": 4.297520661157025e-06, "loss": 4.897266626358032e-05, "memory(GiB)": 38.78, "reward": 0.27695050835609436, "reward_std": 0.027058597654104233, "rewards/VisualizationJSONCombinedORM/mean": 0.27695050835609436, "rewards/VisualizationJSONCombinedORM/std": 0.041098836809396744, "step": 52, "train_speed(iter/s)": 0.026551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 271.375, "completions/min_length": 224.0, "epoch": 0.043837882547559964, "grad_norm": 0.21191160380840302, "kl": 0.007411956787109375, "learning_rate": 4.3801652892561984e-06, "loss": 7.43865966796875e-05, "memory(GiB)": 38.78, "reward": 0.3905656337738037, "reward_std": 0.040254224091768265, "rewards/VisualizationJSONCombinedORM/mean": 0.3905656337738037, "rewards/VisualizationJSONCombinedORM/std": 0.039925042539834976, "step": 53, "train_speed(iter/s)": 0.026741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 279.125, "completions/min_length": 242.0, "epoch": 0.04466501240694789, "grad_norm": 0.23068571090698242, "kl": 0.0076446533203125, "learning_rate": 4.462809917355372e-06, "loss": 7.641315460205078e-05, "memory(GiB)": 38.78, "reward": 0.43826937675476074, "reward_std": 0.08402872085571289, "rewards/VisualizationJSONCombinedORM/mean": 0.43826937675476074, "rewards/VisualizationJSONCombinedORM/std": 0.2260580211877823, "step": 54, "train_speed(iter/s)": 0.026926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 264.875, "completions/min_length": 219.0, "epoch": 0.045492142266335814, "grad_norm": 0.21562765538692474, "kl": 0.0117950439453125, "learning_rate": 4.5454545454545455e-06, "loss": 0.00011871010065078735, "memory(GiB)": 38.78, "reward": 0.21879787743091583, "reward_std": 0.023357398808002472, "rewards/VisualizationJSONCombinedORM/mean": 0.21879787743091583, "rewards/VisualizationJSONCombinedORM/std": 0.09946725517511368, "step": 55, "train_speed(iter/s)": 0.027052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 271.5, "completions/min_length": 216.0, "epoch": 0.04631927212572374, "grad_norm": 0.21820183098316193, "kl": 0.0182037353515625, "learning_rate": 4.62809917355372e-06, "loss": 0.00018234923481941223, "memory(GiB)": 38.78, "reward": 0.2249087393283844, "reward_std": 0.03993435204029083, "rewards/VisualizationJSONCombinedORM/mean": 0.2249087393283844, "rewards/VisualizationJSONCombinedORM/std": 0.039893507957458496, "step": 56, "train_speed(iter/s)": 0.027226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 299.125, "completions/min_length": 235.0, "epoch": 0.04714640198511166, "grad_norm": 0.24649100005626678, "kl": 0.01714324951171875, "learning_rate": 4.710743801652893e-06, "loss": 0.00017169862985610962, "memory(GiB)": 38.93, "reward": 0.3344082236289978, "reward_std": 0.08885800093412399, "rewards/VisualizationJSONCombinedORM/mean": 0.3344082236289978, "rewards/VisualizationJSONCombinedORM/std": 0.09750257432460785, "step": 57, "train_speed(iter/s)": 0.027271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 288.0625, "completions/min_length": 220.0, "epoch": 0.04797353184449959, "grad_norm": 0.23450075089931488, "kl": 0.0341644287109375, "learning_rate": 4.793388429752067e-06, "loss": 0.00034170597791671753, "memory(GiB)": 38.93, "reward": 0.303808331489563, "reward_std": 0.07536155730485916, "rewards/VisualizationJSONCombinedORM/mean": 0.303808331489563, "rewards/VisualizationJSONCombinedORM/std": 0.08438864350318909, "step": 58, "train_speed(iter/s)": 0.027441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 275.1875, "completions/min_length": 223.0, "epoch": 0.04880066170388751, "grad_norm": 0.2562869191169739, "kl": 0.017486572265625, "learning_rate": 4.87603305785124e-06, "loss": 0.0001747235655784607, "memory(GiB)": 38.93, "reward": 0.4005550146102905, "reward_std": 0.13662131130695343, "rewards/VisualizationJSONCombinedORM/mean": 0.4005550146102905, "rewards/VisualizationJSONCombinedORM/std": 0.21129441261291504, "step": 59, "train_speed(iter/s)": 0.027655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 280.1875, "completions/min_length": 221.0, "epoch": 0.04962779156327544, "grad_norm": 0.22892075777053833, "kl": 0.01788330078125, "learning_rate": 4.958677685950414e-06, "loss": 0.00017904862761497498, "memory(GiB)": 38.93, "reward": 0.443825900554657, "reward_std": 0.057601917535066605, "rewards/VisualizationJSONCombinedORM/mean": 0.443825900554657, "rewards/VisualizationJSONCombinedORM/std": 0.06446675211191177, "step": 60, "train_speed(iter/s)": 0.027802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 281.125, "completions/min_length": 235.0, "epoch": 0.050454921422663356, "grad_norm": 0.25773656368255615, "kl": 0.03411865234375, "learning_rate": 5.041322314049587e-06, "loss": 0.0003403313457965851, "memory(GiB)": 38.93, "reward": 0.28145384788513184, "reward_std": 0.050304312258958817, "rewards/VisualizationJSONCombinedORM/mean": 0.28145384788513184, "rewards/VisualizationJSONCombinedORM/std": 0.19099754095077515, "step": 61, "train_speed(iter/s)": 0.027908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 301.5625, "completions/min_length": 247.0, "epoch": 0.05128205128205128, "grad_norm": 0.2067447155714035, "kl": 0.0726318359375, "learning_rate": 5.12396694214876e-06, "loss": 0.0007263496518135071, "memory(GiB)": 38.93, "reward": 0.3546355962753296, "reward_std": 0.04100070521235466, "rewards/VisualizationJSONCombinedORM/mean": 0.3546355962753296, "rewards/VisualizationJSONCombinedORM/std": 0.06367792934179306, "step": 62, "train_speed(iter/s)": 0.028048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 281.5625, "completions/min_length": 254.0, "epoch": 0.052109181141439205, "grad_norm": 0.21220125257968903, "kl": 0.04498291015625, "learning_rate": 5.206611570247935e-06, "loss": 0.00044986605644226074, "memory(GiB)": 38.93, "reward": 0.5331703424453735, "reward_std": 0.14817847311496735, "rewards/VisualizationJSONCombinedORM/mean": 0.5331703424453735, "rewards/VisualizationJSONCombinedORM/std": 0.17092272639274597, "step": 63, "train_speed(iter/s)": 0.028175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 301.3125, "completions/min_length": 244.0, "epoch": 0.05293631100082713, "grad_norm": 0.23428140580654144, "kl": 0.060333251953125, "learning_rate": 5.289256198347108e-06, "loss": 0.0006037577986717224, "memory(GiB)": 38.93, "reward": 0.49315452575683594, "reward_std": 0.08388897031545639, "rewards/VisualizationJSONCombinedORM/mean": 0.49315452575683594, "rewards/VisualizationJSONCombinedORM/std": 0.15487921237945557, "step": 64, "train_speed(iter/s)": 0.028305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 273.125, "completions/min_length": 234.0, "epoch": 0.053763440860215055, "grad_norm": 0.2085675746202469, "kl": 0.052276611328125, "learning_rate": 5.371900826446281e-06, "loss": 0.0005230642855167389, "memory(GiB)": 38.93, "reward": 0.42164620757102966, "reward_std": 0.11722993105649948, "rewards/VisualizationJSONCombinedORM/mean": 0.42164620757102966, "rewards/VisualizationJSONCombinedORM/std": 0.1306317001581192, "step": 65, "train_speed(iter/s)": 0.028445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 265.5, "completions/min_length": 234.0, "epoch": 0.05459057071960298, "grad_norm": 0.20163214206695557, "kl": 0.0911865234375, "learning_rate": 5.4545454545454545e-06, "loss": 0.000908873975276947, "memory(GiB)": 38.93, "reward": 0.5433167219161987, "reward_std": 0.1096176877617836, "rewards/VisualizationJSONCombinedORM/mean": 0.5433167219161987, "rewards/VisualizationJSONCombinedORM/std": 0.10660884529352188, "step": 66, "train_speed(iter/s)": 0.028525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 289.9375, "completions/min_length": 224.0, "epoch": 0.055417700578990904, "grad_norm": 0.18214698135852814, "kl": 0.07830810546875, "learning_rate": 5.537190082644629e-06, "loss": 0.0007839538156986237, "memory(GiB)": 38.93, "reward": 0.5796471238136292, "reward_std": 0.10588403046131134, "rewards/VisualizationJSONCombinedORM/mean": 0.5796471238136292, "rewards/VisualizationJSONCombinedORM/std": 0.24124516546726227, "step": 67, "train_speed(iter/s)": 0.028636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 311.625, "completions/min_length": 236.0, "epoch": 0.05624483043837883, "grad_norm": 0.20936839282512665, "kl": 0.1314697265625, "learning_rate": 5.619834710743802e-06, "loss": 0.0013168305158615112, "memory(GiB)": 38.93, "reward": 0.3958051800727844, "reward_std": 0.04035237058997154, "rewards/VisualizationJSONCombinedORM/mean": 0.3958051800727844, "rewards/VisualizationJSONCombinedORM/std": 0.14815300703048706, "step": 68, "train_speed(iter/s)": 0.028724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 313.75, "completions/min_length": 253.0, "epoch": 0.05707196029776675, "grad_norm": 0.22811086475849152, "kl": 0.1175537109375, "learning_rate": 5.702479338842976e-06, "loss": 0.0011732950806617737, "memory(GiB)": 38.93, "reward": 0.3512464463710785, "reward_std": 0.13584402203559875, "rewards/VisualizationJSONCombinedORM/mean": 0.3512464463710785, "rewards/VisualizationJSONCombinedORM/std": 0.16067063808441162, "step": 69, "train_speed(iter/s)": 0.028821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 280.25, "completions/min_length": 237.0, "epoch": 0.05789909015715467, "grad_norm": 0.21812494099140167, "kl": 0.1180419921875, "learning_rate": 5.785123966942149e-06, "loss": 0.001177661120891571, "memory(GiB)": 38.93, "reward": 0.7392556667327881, "reward_std": 0.11306892335414886, "rewards/VisualizationJSONCombinedORM/mean": 0.7392556667327881, "rewards/VisualizationJSONCombinedORM/std": 0.1921801120042801, "step": 70, "train_speed(iter/s)": 0.028925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 279.1875, "completions/min_length": 223.0, "epoch": 0.058726220016542596, "grad_norm": 0.2408645898103714, "kl": 0.0791015625, "learning_rate": 5.867768595041323e-06, "loss": 0.0007919464260339737, "memory(GiB)": 38.93, "reward": 0.4902362525463104, "reward_std": 0.10270573198795319, "rewards/VisualizationJSONCombinedORM/mean": 0.4902362525463104, "rewards/VisualizationJSONCombinedORM/std": 0.24504855275154114, "step": 71, "train_speed(iter/s)": 0.028962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 278.875, "completions/min_length": 236.0, "epoch": 0.05955334987593052, "grad_norm": 0.19977769255638123, "kl": 0.08331298828125, "learning_rate": 5.9504132231404965e-06, "loss": 0.0008344203233718872, "memory(GiB)": 38.93, "reward": 0.403807669878006, "reward_std": 0.056516848504543304, "rewards/VisualizationJSONCombinedORM/mean": 0.403807669878006, "rewards/VisualizationJSONCombinedORM/std": 0.07375326752662659, "step": 72, "train_speed(iter/s)": 0.029133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 292.5, "completions/min_length": 211.0, "epoch": 0.060380479735318446, "grad_norm": 0.27058470249176025, "kl": 0.2791748046875, "learning_rate": 6.03305785123967e-06, "loss": 0.002783462405204773, "memory(GiB)": 38.93, "reward": 0.4964684247970581, "reward_std": 0.11654908955097198, "rewards/VisualizationJSONCombinedORM/mean": 0.4964684247970581, "rewards/VisualizationJSONCombinedORM/std": 0.17020389437675476, "step": 73, "train_speed(iter/s)": 0.029202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 263.625, "completions/min_length": 226.0, "epoch": 0.06120760959470637, "grad_norm": 0.24398232996463776, "kl": 0.142822265625, "learning_rate": 6.115702479338843e-06, "loss": 0.0014266930520534515, "memory(GiB)": 38.93, "reward": 0.5433200597763062, "reward_std": 0.11267431080341339, "rewards/VisualizationJSONCombinedORM/mean": 0.5433200597763062, "rewards/VisualizationJSONCombinedORM/std": 0.19863057136535645, "step": 74, "train_speed(iter/s)": 0.02937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 309.1875, "completions/min_length": 262.0, "epoch": 0.062034739454094295, "grad_norm": 0.20811501145362854, "kl": 0.1240234375, "learning_rate": 6.198347107438017e-06, "loss": 0.0012429915368556976, "memory(GiB)": 38.93, "reward": 0.4416992962360382, "reward_std": 0.08745713531970978, "rewards/VisualizationJSONCombinedORM/mean": 0.4416992962360382, "rewards/VisualizationJSONCombinedORM/std": 0.12668398022651672, "step": 75, "train_speed(iter/s)": 0.029442 }, { "epoch": 0.062034739454094295, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 339.625, "eval_completions/mean_length": 286.3020833333333, "eval_completions/min_length": 243.83333333333334, "eval_kl": 0.456787109375, "eval_loss": 0.004414061550050974, "eval_reward": 0.3923199561734994, "eval_reward_std": 0.08498914815330257, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.3923199561734994, "eval_rewards/VisualizationJSONCombinedORM/std": 0.08498915168456733, "eval_runtime": 294.4894, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 310.9375, "completions/min_length": 254.0, "epoch": 0.06286186931348221, "grad_norm": 0.221737802028656, "kl": 0.10821533203125, "learning_rate": 6.280991735537191e-06, "loss": 0.001082099974155426, "memory(GiB)": 38.93, "reward": 0.4862482249736786, "reward_std": 0.07949072122573853, "rewards/VisualizationJSONCombinedORM/mean": 0.4862482249736786, "rewards/VisualizationJSONCombinedORM/std": 0.1567733734846115, "step": 76, "train_speed(iter/s)": 0.026466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 311.6875, "completions/min_length": 235.0, "epoch": 0.06368899917287014, "grad_norm": 0.21351268887519836, "kl": 0.13397216796875, "learning_rate": 6.363636363636364e-06, "loss": 0.0013405755162239075, "memory(GiB)": 38.93, "reward": 0.4444642961025238, "reward_std": 0.10467079281806946, "rewards/VisualizationJSONCombinedORM/mean": 0.4444642961025238, "rewards/VisualizationJSONCombinedORM/std": 0.15210101008415222, "step": 77, "train_speed(iter/s)": 0.026548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 287.3125, "completions/min_length": 255.0, "epoch": 0.06451612903225806, "grad_norm": 0.2147461175918579, "kl": 0.115234375, "learning_rate": 6.446280991735537e-06, "loss": 0.0011538490653038025, "memory(GiB)": 38.93, "reward": 0.21117082238197327, "reward_std": 0.026981903240084648, "rewards/VisualizationJSONCombinedORM/mean": 0.21117082238197327, "rewards/VisualizationJSONCombinedORM/std": 0.07492166757583618, "step": 78, "train_speed(iter/s)": 0.026676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 283.6875, "completions/min_length": 224.0, "epoch": 0.065343258891646, "grad_norm": 0.25654637813568115, "kl": 0.1458740234375, "learning_rate": 6.528925619834712e-06, "loss": 0.0014588069170713425, "memory(GiB)": 38.93, "reward": 0.6230406761169434, "reward_std": 0.10282319784164429, "rewards/VisualizationJSONCombinedORM/mean": 0.6230406761169434, "rewards/VisualizationJSONCombinedORM/std": 0.14112277328968048, "step": 79, "train_speed(iter/s)": 0.026796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 281.5, "completions/min_length": 211.0, "epoch": 0.06617038875103391, "grad_norm": 0.21146531403064728, "kl": 0.224365234375, "learning_rate": 6.611570247933885e-06, "loss": 0.002242486923933029, "memory(GiB)": 38.93, "reward": 0.6165254712104797, "reward_std": 0.11609330773353577, "rewards/VisualizationJSONCombinedORM/mean": 0.6165254712104797, "rewards/VisualizationJSONCombinedORM/std": 0.12766911089420319, "step": 80, "train_speed(iter/s)": 0.026921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 270.0625, "completions/min_length": 224.0, "epoch": 0.06699751861042183, "grad_norm": 0.23964668810367584, "kl": 0.27587890625, "learning_rate": 6.694214876033058e-06, "loss": 0.0027510225772857666, "memory(GiB)": 38.93, "reward": 0.48246726393699646, "reward_std": 0.12040196359157562, "rewards/VisualizationJSONCombinedORM/mean": 0.48246726393699646, "rewards/VisualizationJSONCombinedORM/std": 0.22918204963207245, "step": 81, "train_speed(iter/s)": 0.026993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 276.4375, "completions/min_length": 250.0, "epoch": 0.06782464846980976, "grad_norm": 0.2225945144891739, "kl": 0.2203369140625, "learning_rate": 6.776859504132232e-06, "loss": 0.0022095367312431335, "memory(GiB)": 38.93, "reward": 0.42607083916664124, "reward_std": 0.059713806957006454, "rewards/VisualizationJSONCombinedORM/mean": 0.42607083916664124, "rewards/VisualizationJSONCombinedORM/std": 0.07610607892274857, "step": 82, "train_speed(iter/s)": 0.027108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 268.5625, "completions/min_length": 225.0, "epoch": 0.06865177832919768, "grad_norm": 0.2170988917350769, "kl": 0.1046142578125, "learning_rate": 6.859504132231406e-06, "loss": 0.0010454729199409485, "memory(GiB)": 38.93, "reward": 0.43470263481140137, "reward_std": 0.07406627386808395, "rewards/VisualizationJSONCombinedORM/mean": 0.43470263481140137, "rewards/VisualizationJSONCombinedORM/std": 0.19906888902187347, "step": 83, "train_speed(iter/s)": 0.02722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 276.5, "completions/min_length": 220.0, "epoch": 0.06947890818858561, "grad_norm": 0.2307700365781784, "kl": 0.12158203125, "learning_rate": 6.942148760330579e-06, "loss": 0.0012170523405075073, "memory(GiB)": 38.93, "reward": 0.5303221940994263, "reward_std": 0.13360276818275452, "rewards/VisualizationJSONCombinedORM/mean": 0.5303221940994263, "rewards/VisualizationJSONCombinedORM/std": 0.20784853398799896, "step": 84, "train_speed(iter/s)": 0.027335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 276.5625, "completions/min_length": 225.0, "epoch": 0.07030603804797353, "grad_norm": 0.20596067607402802, "kl": 0.1171875, "learning_rate": 7.0247933884297525e-06, "loss": 0.0011730790138244629, "memory(GiB)": 38.93, "reward": 0.3876790404319763, "reward_std": 0.1182737648487091, "rewards/VisualizationJSONCombinedORM/mean": 0.3876790404319763, "rewards/VisualizationJSONCombinedORM/std": 0.16473643481731415, "step": 85, "train_speed(iter/s)": 0.027434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/mean_length": 240.3125, "completions/min_length": 209.0, "epoch": 0.07113316790736146, "grad_norm": 0.21293622255325317, "kl": 0.07159423828125, "learning_rate": 7.107438016528926e-06, "loss": 0.0007190071046352386, "memory(GiB)": 38.93, "reward": 0.3798253536224365, "reward_std": 0.06691616028547287, "rewards/VisualizationJSONCombinedORM/mean": 0.3798253536224365, "rewards/VisualizationJSONCombinedORM/std": 0.20603568851947784, "step": 86, "train_speed(iter/s)": 0.027501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 293.625, "completions/min_length": 211.0, "epoch": 0.07196029776674938, "grad_norm": 0.23166286945343018, "kl": 0.070068359375, "learning_rate": 7.1900826446281005e-06, "loss": 0.000700823962688446, "memory(GiB)": 38.93, "reward": 0.26609593629837036, "reward_std": 0.04909756779670715, "rewards/VisualizationJSONCombinedORM/mean": 0.26609593629837036, "rewards/VisualizationJSONCombinedORM/std": 0.06726846843957901, "step": 87, "train_speed(iter/s)": 0.027586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 284.0, "completions/min_length": 229.0, "epoch": 0.07278742762613731, "grad_norm": 0.2531713843345642, "kl": 0.18310546875, "learning_rate": 7.272727272727273e-06, "loss": 0.001827344298362732, "memory(GiB)": 38.93, "reward": 0.5128841400146484, "reward_std": 0.09153702855110168, "rewards/VisualizationJSONCombinedORM/mean": 0.5128841400146484, "rewards/VisualizationJSONCombinedORM/std": 0.16124016046524048, "step": 88, "train_speed(iter/s)": 0.027666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 280.3125, "completions/min_length": 240.0, "epoch": 0.07361455748552523, "grad_norm": 0.20203499495983124, "kl": 0.1317138671875, "learning_rate": 7.355371900826447e-06, "loss": 0.0013167411088943481, "memory(GiB)": 38.93, "reward": 0.24959136545658112, "reward_std": 0.05275476351380348, "rewards/VisualizationJSONCombinedORM/mean": 0.24959136545658112, "rewards/VisualizationJSONCombinedORM/std": 0.06231621652841568, "step": 89, "train_speed(iter/s)": 0.027787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 275.1875, "completions/min_length": 239.0, "epoch": 0.07444168734491315, "grad_norm": 0.24183164536952972, "kl": 0.128173828125, "learning_rate": 7.43801652892562e-06, "loss": 0.0012843385338783264, "memory(GiB)": 38.93, "reward": 0.298653781414032, "reward_std": 0.05939670652151108, "rewards/VisualizationJSONCombinedORM/mean": 0.298653781414032, "rewards/VisualizationJSONCombinedORM/std": 0.1808978170156479, "step": 90, "train_speed(iter/s)": 0.027859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 274.9375, "completions/min_length": 209.0, "epoch": 0.07526881720430108, "grad_norm": 0.34721335768699646, "kl": 0.3907470703125, "learning_rate": 7.520661157024795e-06, "loss": 0.003909014165401459, "memory(GiB)": 38.93, "reward": 0.3855639100074768, "reward_std": 0.07266607880592346, "rewards/VisualizationJSONCombinedORM/mean": 0.3855639100074768, "rewards/VisualizationJSONCombinedORM/std": 0.0741477981209755, "step": 91, "train_speed(iter/s)": 0.027968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 295.625, "completions/min_length": 250.0, "epoch": 0.076095947063689, "grad_norm": 0.20058870315551758, "kl": 0.1351318359375, "learning_rate": 7.603305785123968e-06, "loss": 0.0013503506779670715, "memory(GiB)": 38.93, "reward": 0.2952834963798523, "reward_std": 0.06390966475009918, "rewards/VisualizationJSONCombinedORM/mean": 0.2952834963798523, "rewards/VisualizationJSONCombinedORM/std": 0.08199948072433472, "step": 92, "train_speed(iter/s)": 0.028066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 277.625, "completions/min_length": 238.0, "epoch": 0.07692307692307693, "grad_norm": 0.20507648587226868, "kl": 0.094970703125, "learning_rate": 7.685950413223142e-06, "loss": 0.0009512640535831451, "memory(GiB)": 38.93, "reward": 0.5966698527336121, "reward_std": 0.04579680785536766, "rewards/VisualizationJSONCombinedORM/mean": 0.5966698527336121, "rewards/VisualizationJSONCombinedORM/std": 0.1763661652803421, "step": 93, "train_speed(iter/s)": 0.028158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 272.125, "completions/min_length": 226.0, "epoch": 0.07775020678246485, "grad_norm": 0.2195262312889099, "kl": 0.0592041015625, "learning_rate": 7.768595041322314e-06, "loss": 0.0005919672548770905, "memory(GiB)": 38.93, "reward": 0.5809385776519775, "reward_std": 0.11577644944190979, "rewards/VisualizationJSONCombinedORM/mean": 0.5809385776519775, "rewards/VisualizationJSONCombinedORM/std": 0.12786544859409332, "step": 94, "train_speed(iter/s)": 0.028245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 270.8125, "completions/min_length": 231.0, "epoch": 0.07857733664185278, "grad_norm": 0.21420803666114807, "kl": 0.10723876953125, "learning_rate": 7.851239669421489e-06, "loss": 0.0010726917535066605, "memory(GiB)": 38.93, "reward": 0.5671088695526123, "reward_std": 0.11949049681425095, "rewards/VisualizationJSONCombinedORM/mean": 0.5671088695526123, "rewards/VisualizationJSONCombinedORM/std": 0.14510852098464966, "step": 95, "train_speed(iter/s)": 0.028358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 293.9375, "completions/min_length": 255.0, "epoch": 0.0794044665012407, "grad_norm": 0.22135670483112335, "kl": 0.06658935546875, "learning_rate": 7.933884297520661e-06, "loss": 0.0006668306887149811, "memory(GiB)": 38.93, "reward": 0.46360746026039124, "reward_std": 0.12160421907901764, "rewards/VisualizationJSONCombinedORM/mean": 0.46360746026039124, "rewards/VisualizationJSONCombinedORM/std": 0.17894117534160614, "step": 96, "train_speed(iter/s)": 0.028448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 278.25, "completions/min_length": 233.0, "epoch": 0.08023159636062861, "grad_norm": 0.21384917199611664, "kl": 0.075927734375, "learning_rate": 8.016528925619836e-06, "loss": 0.0007597431540489197, "memory(GiB)": 38.95, "reward": 0.20013804733753204, "reward_std": 0.036122001707553864, "rewards/VisualizationJSONCombinedORM/mean": 0.20013804733753204, "rewards/VisualizationJSONCombinedORM/std": 0.0672534853219986, "step": 97, "train_speed(iter/s)": 0.028477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 270.1875, "completions/min_length": 221.0, "epoch": 0.08105872622001654, "grad_norm": 0.19491934776306152, "kl": 0.04534912109375, "learning_rate": 8.099173553719009e-06, "loss": 0.000453852117061615, "memory(GiB)": 38.95, "reward": 0.4873707890510559, "reward_std": 0.07341524958610535, "rewards/VisualizationJSONCombinedORM/mean": 0.4873707890510559, "rewards/VisualizationJSONCombinedORM/std": 0.21582207083702087, "step": 98, "train_speed(iter/s)": 0.028587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 289.0625, "completions/min_length": 238.0, "epoch": 0.08188585607940446, "grad_norm": 0.21813078224658966, "kl": 0.07958984375, "learning_rate": 8.181818181818183e-06, "loss": 0.0007968693971633911, "memory(GiB)": 38.95, "reward": 0.4041167199611664, "reward_std": 0.0660363957285881, "rewards/VisualizationJSONCombinedORM/mean": 0.4041167199611664, "rewards/VisualizationJSONCombinedORM/std": 0.15906895697116852, "step": 99, "train_speed(iter/s)": 0.028643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 258.0625, "completions/min_length": 220.0, "epoch": 0.0827129859387924, "grad_norm": 0.21885332465171814, "kl": 0.0670166015625, "learning_rate": 8.264462809917356e-06, "loss": 0.0006691664457321167, "memory(GiB)": 38.95, "reward": 0.2934790849685669, "reward_std": 0.06167110428214073, "rewards/VisualizationJSONCombinedORM/mean": 0.2934790849685669, "rewards/VisualizationJSONCombinedORM/std": 0.1108633354306221, "step": 100, "train_speed(iter/s)": 0.028728 }, { "epoch": 0.0827129859387924, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 341.125, "eval_completions/mean_length": 284.890625, "eval_completions/min_length": 240.54166666666666, "eval_kl": 0.153289794921875, "eval_loss": 0.0015416772803291678, "eval_reward": 0.4000083909680446, "eval_reward_std": 0.08463676211734612, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4000083909680446, "eval_rewards/VisualizationJSONCombinedORM/std": 0.08463676619188239, "eval_runtime": 295.6354, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 279.4375, "completions/min_length": 224.0, "epoch": 0.08354011579818031, "grad_norm": 0.24593444168567657, "kl": 0.0787353515625, "learning_rate": 8.34710743801653e-06, "loss": 0.0007861778140068054, "memory(GiB)": 38.95, "reward": 0.4759935140609741, "reward_std": 0.0961407870054245, "rewards/VisualizationJSONCombinedORM/mean": 0.4759935140609741, "rewards/VisualizationJSONCombinedORM/std": 0.10292033106088638, "step": 101, "train_speed(iter/s)": 0.026574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 272.8125, "completions/min_length": 219.0, "epoch": 0.08436724565756824, "grad_norm": 0.2598411440849304, "kl": 0.07464599609375, "learning_rate": 8.429752066115703e-06, "loss": 0.0007454454898834229, "memory(GiB)": 38.95, "reward": 0.38237565755844116, "reward_std": 0.0903061255812645, "rewards/VisualizationJSONCombinedORM/mean": 0.38237565755844116, "rewards/VisualizationJSONCombinedORM/std": 0.09251082688570023, "step": 102, "train_speed(iter/s)": 0.026641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 277.3125, "completions/min_length": 234.0, "epoch": 0.08519437551695616, "grad_norm": 0.22361420094966888, "kl": 0.07373046875, "learning_rate": 8.512396694214877e-06, "loss": 0.0007353629916906357, "memory(GiB)": 38.95, "reward": 0.6596471667289734, "reward_std": 0.1022479236125946, "rewards/VisualizationJSONCombinedORM/mean": 0.6596471667289734, "rewards/VisualizationJSONCombinedORM/std": 0.10037974268198013, "step": 103, "train_speed(iter/s)": 0.026715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 282.5, "completions/min_length": 234.0, "epoch": 0.08602150537634409, "grad_norm": 0.45501708984375, "kl": 0.1103515625, "learning_rate": 8.59504132231405e-06, "loss": 0.0011038482189178467, "memory(GiB)": 38.95, "reward": 0.5930397510528564, "reward_std": 0.13401177525520325, "rewards/VisualizationJSONCombinedORM/mean": 0.5930397510528564, "rewards/VisualizationJSONCombinedORM/std": 0.15752582252025604, "step": 104, "train_speed(iter/s)": 0.026805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 284.3125, "completions/min_length": 209.0, "epoch": 0.08684863523573201, "grad_norm": 0.2403556853532791, "kl": 0.179931640625, "learning_rate": 8.677685950413224e-06, "loss": 0.001801956444978714, "memory(GiB)": 38.95, "reward": 0.2545745372772217, "reward_std": 0.040673382580280304, "rewards/VisualizationJSONCombinedORM/mean": 0.2545745372772217, "rewards/VisualizationJSONCombinedORM/std": 0.044875141233205795, "step": 105, "train_speed(iter/s)": 0.026882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 271.375, "completions/min_length": 218.0, "epoch": 0.08767576509511993, "grad_norm": 0.2763100266456604, "kl": 0.095947265625, "learning_rate": 8.760330578512397e-06, "loss": 0.0009587779641151428, "memory(GiB)": 38.95, "reward": 0.31999462842941284, "reward_std": 0.10333455353975296, "rewards/VisualizationJSONCombinedORM/mean": 0.31999462842941284, "rewards/VisualizationJSONCombinedORM/std": 0.11494028568267822, "step": 106, "train_speed(iter/s)": 0.026944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 294.0, "completions/min_length": 250.0, "epoch": 0.08850289495450786, "grad_norm": 0.2615436315536499, "kl": 0.1689453125, "learning_rate": 8.842975206611571e-06, "loss": 0.0016932934522628784, "memory(GiB)": 38.95, "reward": 0.28296640515327454, "reward_std": 0.06513598561286926, "rewards/VisualizationJSONCombinedORM/mean": 0.28296640515327454, "rewards/VisualizationJSONCombinedORM/std": 0.08509799093008041, "step": 107, "train_speed(iter/s)": 0.027059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 283.25, "completions/min_length": 256.0, "epoch": 0.08933002481389578, "grad_norm": 0.24835233390331268, "kl": 0.141845703125, "learning_rate": 8.925619834710744e-06, "loss": 0.0014203041791915894, "memory(GiB)": 38.95, "reward": 0.6618316173553467, "reward_std": 0.12672200798988342, "rewards/VisualizationJSONCombinedORM/mean": 0.6618316173553467, "rewards/VisualizationJSONCombinedORM/std": 0.13362181186676025, "step": 108, "train_speed(iter/s)": 0.027148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 269.1875, "completions/min_length": 204.0, "epoch": 0.09015715467328371, "grad_norm": 0.20632304251194, "kl": 0.1171875, "learning_rate": 9.008264462809918e-06, "loss": 0.001173168420791626, "memory(GiB)": 38.95, "reward": 0.5624185800552368, "reward_std": 0.06625716388225555, "rewards/VisualizationJSONCombinedORM/mean": 0.5624185800552368, "rewards/VisualizationJSONCombinedORM/std": 0.10773315280675888, "step": 109, "train_speed(iter/s)": 0.027251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 293.125, "completions/min_length": 239.0, "epoch": 0.09098428453267163, "grad_norm": 0.20766191184520721, "kl": 0.1260986328125, "learning_rate": 9.090909090909091e-06, "loss": 0.0012617213651537895, "memory(GiB)": 38.95, "reward": 0.6006754040718079, "reward_std": 0.0820385217666626, "rewards/VisualizationJSONCombinedORM/mean": 0.6006754040718079, "rewards/VisualizationJSONCombinedORM/std": 0.169478639960289, "step": 110, "train_speed(iter/s)": 0.027285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 282.4375, "completions/min_length": 242.0, "epoch": 0.09181141439205956, "grad_norm": 0.2561264932155609, "kl": 0.1632080078125, "learning_rate": 9.173553719008265e-06, "loss": 0.0016342736780643463, "memory(GiB)": 38.95, "reward": 0.641480565071106, "reward_std": 0.1245025172829628, "rewards/VisualizationJSONCombinedORM/mean": 0.641480565071106, "rewards/VisualizationJSONCombinedORM/std": 0.13093848526477814, "step": 111, "train_speed(iter/s)": 0.027377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 265.5625, "completions/min_length": 233.0, "epoch": 0.09263854425144748, "grad_norm": 0.2369615137577057, "kl": 0.355712890625, "learning_rate": 9.25619834710744e-06, "loss": 0.003562742844223976, "memory(GiB)": 38.95, "reward": 0.4200859069824219, "reward_std": 0.10057148337364197, "rewards/VisualizationJSONCombinedORM/mean": 0.4200859069824219, "rewards/VisualizationJSONCombinedORM/std": 0.11224790662527084, "step": 112, "train_speed(iter/s)": 0.027446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 267.3125, "completions/min_length": 218.0, "epoch": 0.0934656741108354, "grad_norm": 0.2340521663427353, "kl": 0.168212890625, "learning_rate": 9.338842975206613e-06, "loss": 0.0016813650727272034, "memory(GiB)": 38.95, "reward": 0.5819565653800964, "reward_std": 0.09077343344688416, "rewards/VisualizationJSONCombinedORM/mean": 0.5819565653800964, "rewards/VisualizationJSONCombinedORM/std": 0.20328061282634735, "step": 113, "train_speed(iter/s)": 0.027538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 277.3125, "completions/min_length": 248.0, "epoch": 0.09429280397022333, "grad_norm": 0.24215435981750488, "kl": 0.1640625, "learning_rate": 9.421487603305785e-06, "loss": 0.0016377530992031097, "memory(GiB)": 38.95, "reward": 0.4037814736366272, "reward_std": 0.08825882524251938, "rewards/VisualizationJSONCombinedORM/mean": 0.4037814736366272, "rewards/VisualizationJSONCombinedORM/std": 0.1829083263874054, "step": 114, "train_speed(iter/s)": 0.027582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 291.0625, "completions/min_length": 227.0, "epoch": 0.09511993382961124, "grad_norm": 0.2562548518180847, "kl": 0.0986328125, "learning_rate": 9.50413223140496e-06, "loss": 0.0009869597852230072, "memory(GiB)": 38.95, "reward": 0.4638156294822693, "reward_std": 0.10151034593582153, "rewards/VisualizationJSONCombinedORM/mean": 0.4638156294822693, "rewards/VisualizationJSONCombinedORM/std": 0.19953373074531555, "step": 115, "train_speed(iter/s)": 0.027671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 274.75, "completions/min_length": 212.0, "epoch": 0.09594706368899918, "grad_norm": 0.26825106143951416, "kl": 0.1685791015625, "learning_rate": 9.586776859504134e-06, "loss": 0.0016848370432853699, "memory(GiB)": 38.95, "reward": 0.4485335648059845, "reward_std": 0.13360373675823212, "rewards/VisualizationJSONCombinedORM/mean": 0.4485335648059845, "rewards/VisualizationJSONCombinedORM/std": 0.17123237252235413, "step": 116, "train_speed(iter/s)": 0.027739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 291.0625, "completions/min_length": 235.0, "epoch": 0.0967741935483871, "grad_norm": 0.23021277785301208, "kl": 0.118408203125, "learning_rate": 9.669421487603307e-06, "loss": 0.0011815540492534637, "memory(GiB)": 38.95, "reward": 0.44447821378707886, "reward_std": 0.11126147955656052, "rewards/VisualizationJSONCombinedORM/mean": 0.44447821378707886, "rewards/VisualizationJSONCombinedORM/std": 0.15582036972045898, "step": 117, "train_speed(iter/s)": 0.027814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 283.875, "completions/min_length": 213.0, "epoch": 0.09760132340777503, "grad_norm": 0.23727023601531982, "kl": 0.1165771484375, "learning_rate": 9.75206611570248e-06, "loss": 0.0011668726801872253, "memory(GiB)": 38.95, "reward": 0.5131579637527466, "reward_std": 0.10168218612670898, "rewards/VisualizationJSONCombinedORM/mean": 0.5131579637527466, "rewards/VisualizationJSONCombinedORM/std": 0.19319340586662292, "step": 118, "train_speed(iter/s)": 0.027889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 298.3125, "completions/min_length": 235.0, "epoch": 0.09842845326716294, "grad_norm": 0.2593723237514496, "kl": 0.110595703125, "learning_rate": 9.834710743801654e-06, "loss": 0.0011043399572372437, "memory(GiB)": 38.95, "reward": 0.4555820822715759, "reward_std": 0.08387061208486557, "rewards/VisualizationJSONCombinedORM/mean": 0.4555820822715759, "rewards/VisualizationJSONCombinedORM/std": 0.2211308777332306, "step": 119, "train_speed(iter/s)": 0.027947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 290.0, "completions/min_length": 218.0, "epoch": 0.09925558312655088, "grad_norm": 0.25867336988449097, "kl": 0.158447265625, "learning_rate": 9.917355371900828e-06, "loss": 0.00158768892288208, "memory(GiB)": 38.95, "reward": 0.41314393281936646, "reward_std": 0.09253566712141037, "rewards/VisualizationJSONCombinedORM/mean": 0.41314393281936646, "rewards/VisualizationJSONCombinedORM/std": 0.11780789494514465, "step": 120, "train_speed(iter/s)": 0.027996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 274.0, "completions/min_length": 220.0, "epoch": 0.1000827129859388, "grad_norm": 0.21255667507648468, "kl": 0.17919921875, "learning_rate": 1e-05, "loss": 0.001790553331375122, "memory(GiB)": 38.95, "reward": 0.44084876775741577, "reward_std": 0.09307848662137985, "rewards/VisualizationJSONCombinedORM/mean": 0.44084876775741577, "rewards/VisualizationJSONCombinedORM/std": 0.14132054150104523, "step": 121, "train_speed(iter/s)": 0.028061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 289.125, "completions/min_length": 195.0, "epoch": 0.10090984284532671, "grad_norm": 0.2528976798057556, "kl": 0.1075439453125, "learning_rate": 9.999979155971343e-06, "loss": 0.0010759495198726654, "memory(GiB)": 38.95, "reward": 0.45832884311676025, "reward_std": 0.1157294288277626, "rewards/VisualizationJSONCombinedORM/mean": 0.45832884311676025, "rewards/VisualizationJSONCombinedORM/std": 0.11549737304449081, "step": 122, "train_speed(iter/s)": 0.028118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 281.0, "completions/min_length": 241.0, "epoch": 0.10173697270471464, "grad_norm": 0.22273407876491547, "kl": 0.091552734375, "learning_rate": 9.99991662405916e-06, "loss": 0.0009164623916149139, "memory(GiB)": 38.95, "reward": 0.3674558401107788, "reward_std": 0.09176706522703171, "rewards/VisualizationJSONCombinedORM/mean": 0.3674558401107788, "rewards/VisualizationJSONCombinedORM/std": 0.1769493967294693, "step": 123, "train_speed(iter/s)": 0.028191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 269.625, "completions/min_length": 227.0, "epoch": 0.10256410256410256, "grad_norm": 0.22321157157421112, "kl": 0.14306640625, "learning_rate": 9.999812404784818e-06, "loss": 0.0014292187988758087, "memory(GiB)": 38.95, "reward": 0.513483464717865, "reward_std": 0.12210549414157867, "rewards/VisualizationJSONCombinedORM/mean": 0.513483464717865, "rewards/VisualizationJSONCombinedORM/std": 0.17209231853485107, "step": 124, "train_speed(iter/s)": 0.02824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 283.0, "completions/min_length": 217.0, "epoch": 0.10339123242349049, "grad_norm": 0.24105572700500488, "kl": 0.05987548828125, "learning_rate": 9.999666499017257e-06, "loss": 0.0005991905927658081, "memory(GiB)": 38.95, "reward": 0.5068931579589844, "reward_std": 0.1268250048160553, "rewards/VisualizationJSONCombinedORM/mean": 0.5068931579589844, "rewards/VisualizationJSONCombinedORM/std": 0.20269986987113953, "step": 125, "train_speed(iter/s)": 0.028297 }, { "epoch": 0.10339123242349049, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 316.5833333333333, "eval_completions/mean_length": 274.3177083333333, "eval_completions/min_length": 237.75, "eval_kl": 0.14054361979166666, "eval_loss": 0.0014308914542198181, "eval_reward": 0.4583592265844345, "eval_reward_std": 0.08916706506473322, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4583592265844345, "eval_rewards/VisualizationJSONCombinedORM/std": 0.08916706743184477, "eval_runtime": 280.7979, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.011, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 256.5, "completions/min_length": 213.0, "epoch": 0.10421836228287841, "grad_norm": 0.21350523829460144, "kl": 0.2684326171875, "learning_rate": 9.999478907972981e-06, "loss": 0.0026857033371925354, "memory(GiB)": 38.95, "reward": 0.2489265501499176, "reward_std": 0.05208855867385864, "rewards/VisualizationJSONCombinedORM/mean": 0.2489265501499176, "rewards/VisualizationJSONCombinedORM/std": 0.07265554368495941, "step": 126, "train_speed(iter/s)": 0.026687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 281.3125, "completions/min_length": 231.0, "epoch": 0.10504549214226634, "grad_norm": 0.2556333541870117, "kl": 0.18798828125, "learning_rate": 9.999249633216054e-06, "loss": 0.0018769800662994385, "memory(GiB)": 38.95, "reward": 0.6208512783050537, "reward_std": 0.12552079558372498, "rewards/VisualizationJSONCombinedORM/mean": 0.6208512783050537, "rewards/VisualizationJSONCombinedORM/std": 0.1579023003578186, "step": 127, "train_speed(iter/s)": 0.026759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 288.875, "completions/min_length": 269.0, "epoch": 0.10587262200165426, "grad_norm": 0.21102263033390045, "kl": 0.08453369140625, "learning_rate": 9.99897867665808e-06, "loss": 0.0008447170257568359, "memory(GiB)": 38.95, "reward": 0.44335058331489563, "reward_std": 0.09253101050853729, "rewards/VisualizationJSONCombinedORM/mean": 0.44335058331489563, "rewards/VisualizationJSONCombinedORM/std": 0.20488595962524414, "step": 128, "train_speed(iter/s)": 0.02683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 269.25, "completions/min_length": 216.0, "epoch": 0.10669975186104218, "grad_norm": 0.26393699645996094, "kl": 0.0950927734375, "learning_rate": 9.998666040558187e-06, "loss": 0.0009530484676361084, "memory(GiB)": 38.95, "reward": 0.5608755946159363, "reward_std": 0.10240112245082855, "rewards/VisualizationJSONCombinedORM/mean": 0.5608755946159363, "rewards/VisualizationJSONCombinedORM/std": 0.15052682161331177, "step": 129, "train_speed(iter/s)": 0.026906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 263.625, "completions/min_length": 232.0, "epoch": 0.10752688172043011, "grad_norm": 0.21915896236896515, "kl": 0.098388671875, "learning_rate": 9.998311727523014e-06, "loss": 0.0009824465960264206, "memory(GiB)": 38.95, "reward": 0.393373966217041, "reward_std": 0.056983329355716705, "rewards/VisualizationJSONCombinedORM/mean": 0.393373966217041, "rewards/VisualizationJSONCombinedORM/std": 0.24498805403709412, "step": 130, "train_speed(iter/s)": 0.02697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 243.8125, "completions/min_length": 211.0, "epoch": 0.10835401157981803, "grad_norm": 0.21010519564151764, "kl": 0.10040283203125, "learning_rate": 9.997915740506688e-06, "loss": 0.0010029114782810211, "memory(GiB)": 38.95, "reward": 0.5689505934715271, "reward_std": 0.054452791810035706, "rewards/VisualizationJSONCombinedORM/mean": 0.5689505934715271, "rewards/VisualizationJSONCombinedORM/std": 0.1669481247663498, "step": 131, "train_speed(iter/s)": 0.027064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 263.875, "completions/min_length": 220.0, "epoch": 0.10918114143920596, "grad_norm": 0.23587076365947723, "kl": 0.18603515625, "learning_rate": 9.99747808281079e-06, "loss": 0.0018613804131746292, "memory(GiB)": 38.95, "reward": 0.5905872583389282, "reward_std": 0.133216992020607, "rewards/VisualizationJSONCombinedORM/mean": 0.5905872583389282, "rewards/VisualizationJSONCombinedORM/std": 0.13230037689208984, "step": 132, "train_speed(iter/s)": 0.027118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 266.375, "completions/min_length": 215.0, "epoch": 0.11000827129859388, "grad_norm": 0.29631659388542175, "kl": 0.288818359375, "learning_rate": 9.996998758084344e-06, "loss": 0.0028917789459228516, "memory(GiB)": 38.95, "reward": 0.4692475199699402, "reward_std": 0.12832632660865784, "rewards/VisualizationJSONCombinedORM/mean": 0.4692475199699402, "rewards/VisualizationJSONCombinedORM/std": 0.1303378790616989, "step": 133, "train_speed(iter/s)": 0.027173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 265.0, "completions/min_length": 230.0, "epoch": 0.11083540115798181, "grad_norm": 0.2622207701206207, "kl": 0.1474609375, "learning_rate": 9.996477770323772e-06, "loss": 0.001475553959608078, "memory(GiB)": 38.95, "reward": 0.46549704670906067, "reward_std": 0.1041446328163147, "rewards/VisualizationJSONCombinedORM/mean": 0.46549704670906067, "rewards/VisualizationJSONCombinedORM/std": 0.15795163810253143, "step": 134, "train_speed(iter/s)": 0.027222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 280.4375, "completions/min_length": 229.0, "epoch": 0.11166253101736973, "grad_norm": 0.2607896029949188, "kl": 0.1640625, "learning_rate": 9.995915123872866e-06, "loss": 0.001641005277633667, "memory(GiB)": 38.95, "reward": 0.31221649050712585, "reward_std": 0.07137951254844666, "rewards/VisualizationJSONCombinedORM/mean": 0.31221649050712585, "rewards/VisualizationJSONCombinedORM/std": 0.07195191085338593, "step": 135, "train_speed(iter/s)": 0.027307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 259.875, "completions/min_length": 210.0, "epoch": 0.11248966087675766, "grad_norm": 0.21442830562591553, "kl": 0.145263671875, "learning_rate": 9.995310823422756e-06, "loss": 0.0014544948935508728, "memory(GiB)": 38.95, "reward": 0.5850557088851929, "reward_std": 0.05490027368068695, "rewards/VisualizationJSONCombinedORM/mean": 0.5850557088851929, "rewards/VisualizationJSONCombinedORM/std": 0.1908618062734604, "step": 136, "train_speed(iter/s)": 0.027368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 260.625, "completions/min_length": 225.0, "epoch": 0.11331679073614558, "grad_norm": 0.2173696905374527, "kl": 0.0849609375, "learning_rate": 9.994664874011864e-06, "loss": 0.0008490718901157379, "memory(GiB)": 38.95, "reward": 0.5041723251342773, "reward_std": 0.09264437854290009, "rewards/VisualizationJSONCombinedORM/mean": 0.5041723251342773, "rewards/VisualizationJSONCombinedORM/std": 0.1421910524368286, "step": 137, "train_speed(iter/s)": 0.027454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 273.1875, "completions/min_length": 213.0, "epoch": 0.1141439205955335, "grad_norm": 0.23423154652118683, "kl": 0.1512451171875, "learning_rate": 9.993977281025862e-06, "loss": 0.0015169233083724976, "memory(GiB)": 38.95, "reward": 0.33051180839538574, "reward_std": 0.046408627182245255, "rewards/VisualizationJSONCombinedORM/mean": 0.33051180839538574, "rewards/VisualizationJSONCombinedORM/std": 0.085053451359272, "step": 138, "train_speed(iter/s)": 0.027512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 259.125, "completions/min_length": 195.0, "epoch": 0.11497105045492143, "grad_norm": 0.2407083362340927, "kl": 0.151611328125, "learning_rate": 9.993248050197638e-06, "loss": 0.001511797308921814, "memory(GiB)": 38.95, "reward": 0.35721075534820557, "reward_std": 0.09262184053659439, "rewards/VisualizationJSONCombinedORM/mean": 0.35721075534820557, "rewards/VisualizationJSONCombinedORM/std": 0.09594057500362396, "step": 139, "train_speed(iter/s)": 0.027597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/mean_length": 220.75, "completions/min_length": 206.0, "epoch": 0.11579818031430934, "grad_norm": 0.20646856725215912, "kl": 0.1011962890625, "learning_rate": 9.992477187607232e-06, "loss": 0.001012161374092102, "memory(GiB)": 38.95, "reward": 0.4821818470954895, "reward_std": 0.05951101332902908, "rewards/VisualizationJSONCombinedORM/mean": 0.4821818470954895, "rewards/VisualizationJSONCombinedORM/std": 0.22142145037651062, "step": 140, "train_speed(iter/s)": 0.02767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 265.25, "completions/min_length": 200.0, "epoch": 0.11662531017369727, "grad_norm": 0.23458649218082428, "kl": 0.1292724609375, "learning_rate": 9.9916646996818e-06, "loss": 0.0012924671173095703, "memory(GiB)": 38.95, "reward": 0.4041658341884613, "reward_std": 0.10921403020620346, "rewards/VisualizationJSONCombinedORM/mean": 0.4041658341884613, "rewards/VisualizationJSONCombinedORM/std": 0.12606096267700195, "step": 141, "train_speed(iter/s)": 0.027743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 241.875, "completions/min_length": 210.0, "epoch": 0.11745244003308519, "grad_norm": 0.24195422232151031, "kl": 0.15478515625, "learning_rate": 9.990810593195545e-06, "loss": 0.0015499144792556763, "memory(GiB)": 38.95, "reward": 0.5510006546974182, "reward_std": 0.12240107357501984, "rewards/VisualizationJSONCombinedORM/mean": 0.5510006546974182, "rewards/VisualizationJSONCombinedORM/std": 0.1436348408460617, "step": 142, "train_speed(iter/s)": 0.027798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 253.125, "completions/min_length": 227.0, "epoch": 0.11827956989247312, "grad_norm": 0.20207707583904266, "kl": 0.07086181640625, "learning_rate": 9.98991487526968e-06, "loss": 0.0007087141275405884, "memory(GiB)": 38.95, "reward": 0.44090279936790466, "reward_std": 0.08230552822351456, "rewards/VisualizationJSONCombinedORM/mean": 0.44090279936790466, "rewards/VisualizationJSONCombinedORM/std": 0.29991936683654785, "step": 143, "train_speed(iter/s)": 0.02784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 261.0, "completions/min_length": 208.0, "epoch": 0.11910669975186104, "grad_norm": 0.2274201214313507, "kl": 0.204345703125, "learning_rate": 9.988977553372353e-06, "loss": 0.002042394131422043, "memory(GiB)": 38.95, "reward": 0.5906664133071899, "reward_std": 0.07907424122095108, "rewards/VisualizationJSONCombinedORM/mean": 0.5906664133071899, "rewards/VisualizationJSONCombinedORM/std": 0.18051598966121674, "step": 144, "train_speed(iter/s)": 0.02789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 254.375, "completions/min_length": 226.0, "epoch": 0.11993382961124896, "grad_norm": 0.183087557554245, "kl": 0.20733642578125, "learning_rate": 9.987998635318586e-06, "loss": 0.0020719245076179504, "memory(GiB)": 38.95, "reward": 0.704560399055481, "reward_std": 0.1033347100019455, "rewards/VisualizationJSONCombinedORM/mean": 0.704560399055481, "rewards/VisualizationJSONCombinedORM/std": 0.10329759120941162, "step": 145, "train_speed(iter/s)": 0.02795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 249.5625, "completions/min_length": 205.0, "epoch": 0.12076095947063689, "grad_norm": 0.19842606782913208, "kl": 0.07489013671875, "learning_rate": 9.986978129270219e-06, "loss": 0.0007489509880542755, "memory(GiB)": 38.95, "reward": 0.4583089351654053, "reward_std": 0.06281284987926483, "rewards/VisualizationJSONCombinedORM/mean": 0.4583089351654053, "rewards/VisualizationJSONCombinedORM/std": 0.21779765188694, "step": 146, "train_speed(iter/s)": 0.028021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/mean_length": 242.375, "completions/min_length": 204.0, "epoch": 0.12158808933002481, "grad_norm": 0.13216261565685272, "kl": 0.06231689453125, "learning_rate": 9.985916043735839e-06, "loss": 0.0006223655072972178, "memory(GiB)": 38.95, "reward": 0.4202919602394104, "reward_std": 0.04144679754972458, "rewards/VisualizationJSONCombinedORM/mean": 0.4202919602394104, "rewards/VisualizationJSONCombinedORM/std": 0.10613545775413513, "step": 147, "train_speed(iter/s)": 0.028071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/mean_length": 236.125, "completions/min_length": 207.0, "epoch": 0.12241521918941274, "grad_norm": 0.23983831703662872, "kl": 0.07666015625, "learning_rate": 9.984812387570697e-06, "loss": 0.0007660724222660065, "memory(GiB)": 38.95, "reward": 0.5905719995498657, "reward_std": 0.09164696931838989, "rewards/VisualizationJSONCombinedORM/mean": 0.5905719995498657, "rewards/VisualizationJSONCombinedORM/std": 0.2660940885543823, "step": 148, "train_speed(iter/s)": 0.028119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 243.6875, "completions/min_length": 204.0, "epoch": 0.12324234904880066, "grad_norm": 0.20147928595542908, "kl": 0.049072265625, "learning_rate": 9.983667169976651e-06, "loss": 0.0004910565912723541, "memory(GiB)": 38.95, "reward": 0.33903032541275024, "reward_std": 0.03964676335453987, "rewards/VisualizationJSONCombinedORM/mean": 0.33903032541275024, "rewards/VisualizationJSONCombinedORM/std": 0.1246919259428978, "step": 149, "train_speed(iter/s)": 0.028178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 272.5625, "completions/min_length": 207.0, "epoch": 0.12406947890818859, "grad_norm": 0.21923202276229858, "kl": 0.06732177734375, "learning_rate": 9.982480400502082e-06, "loss": 0.0006715245544910431, "memory(GiB)": 38.95, "reward": 0.6205911636352539, "reward_std": 0.10947215557098389, "rewards/VisualizationJSONCombinedORM/mean": 0.6205911636352539, "rewards/VisualizationJSONCombinedORM/std": 0.10666517168283463, "step": 150, "train_speed(iter/s)": 0.028236 }, { "epoch": 0.12406947890818859, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 283.4166666666667, "eval_completions/mean_length": 246.359375, "eval_completions/min_length": 216.75, "eval_kl": 0.078582763671875, "eval_loss": 0.0007896348834037781, "eval_reward": 0.40687826462090015, "eval_reward_std": 0.07915904842472325, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.40687826462090015, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07915904807547729, "eval_runtime": 260.7832, "eval_samples_per_second": 0.092, "eval_steps_per_second": 0.012, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 259.8125, "completions/min_length": 211.0, "epoch": 0.12489660876757651, "grad_norm": 0.21156692504882812, "kl": 0.0728759765625, "learning_rate": 9.98125208904181e-06, "loss": 0.0007291287183761597, "memory(GiB)": 38.95, "reward": 0.45459967851638794, "reward_std": 0.054216109216213226, "rewards/VisualizationJSONCombinedORM/mean": 0.45459967851638794, "rewards/VisualizationJSONCombinedORM/std": 0.0538950152695179, "step": 151, "train_speed(iter/s)": 0.026991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 256.5625, "completions/min_length": 214.0, "epoch": 0.12572373862696443, "grad_norm": 0.22103427350521088, "kl": 0.0892333984375, "learning_rate": 9.97998224583702e-06, "loss": 0.0008927788585424423, "memory(GiB)": 38.95, "reward": 0.5364219546318054, "reward_std": 0.07775963097810745, "rewards/VisualizationJSONCombinedORM/mean": 0.5364219546318054, "rewards/VisualizationJSONCombinedORM/std": 0.21038801968097687, "step": 152, "train_speed(iter/s)": 0.027074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 260.9375, "completions/min_length": 211.0, "epoch": 0.12655086848635236, "grad_norm": 0.20134465396404266, "kl": 0.064208984375, "learning_rate": 9.978670881475173e-06, "loss": 0.0006433241069316864, "memory(GiB)": 38.95, "reward": 0.46123749017715454, "reward_std": 0.05140110105276108, "rewards/VisualizationJSONCombinedORM/mean": 0.46123749017715454, "rewards/VisualizationJSONCombinedORM/std": 0.06679568439722061, "step": 153, "train_speed(iter/s)": 0.027122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/mean_length": 237.875, "completions/min_length": 186.0, "epoch": 0.1273779983457403, "grad_norm": 0.2208600491285324, "kl": 0.06134033203125, "learning_rate": 9.977318006889913e-06, "loss": 0.0006138384342193604, "memory(GiB)": 38.95, "reward": 0.5350985527038574, "reward_std": 0.12540599703788757, "rewards/VisualizationJSONCombinedORM/mean": 0.5350985527038574, "rewards/VisualizationJSONCombinedORM/std": 0.270576536655426, "step": 154, "train_speed(iter/s)": 0.027169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 253.1875, "completions/min_length": 216.0, "epoch": 0.1282051282051282, "grad_norm": 0.2093411684036255, "kl": 0.07977294921875, "learning_rate": 9.975923633360985e-06, "loss": 0.0007975250482559204, "memory(GiB)": 38.95, "reward": 0.42132285237312317, "reward_std": 0.1100415289402008, "rewards/VisualizationJSONCombinedORM/mean": 0.42132285237312317, "rewards/VisualizationJSONCombinedORM/std": 0.14762108027935028, "step": 155, "train_speed(iter/s)": 0.027225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 259.5625, "completions/min_length": 207.0, "epoch": 0.12903225806451613, "grad_norm": 0.19961653649806976, "kl": 0.0692138671875, "learning_rate": 9.974487772514131e-06, "loss": 0.0006915926933288574, "memory(GiB)": 38.95, "reward": 0.5008173584938049, "reward_std": 0.09245631098747253, "rewards/VisualizationJSONCombinedORM/mean": 0.5008173584938049, "rewards/VisualizationJSONCombinedORM/std": 0.2163374423980713, "step": 156, "train_speed(iter/s)": 0.027271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 263.5625, "completions/min_length": 230.0, "epoch": 0.12985938792390406, "grad_norm": 0.24163830280303955, "kl": 0.06500244140625, "learning_rate": 9.973010436321005e-06, "loss": 0.0006486549973487854, "memory(GiB)": 38.95, "reward": 0.34626129269599915, "reward_std": 0.08271616697311401, "rewards/VisualizationJSONCombinedORM/mean": 0.34626129269599915, "rewards/VisualizationJSONCombinedORM/std": 0.08021192252635956, "step": 157, "train_speed(iter/s)": 0.027327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 241.3125, "completions/min_length": 205.0, "epoch": 0.130686517783292, "grad_norm": 0.18970023095607758, "kl": 0.07110595703125, "learning_rate": 9.971491637099055e-06, "loss": 0.0007127523422241211, "memory(GiB)": 38.95, "reward": 0.5658500790596008, "reward_std": 0.13862082362174988, "rewards/VisualizationJSONCombinedORM/mean": 0.5658500790596008, "rewards/VisualizationJSONCombinedORM/std": 0.23437201976776123, "step": 158, "train_speed(iter/s)": 0.027385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 254.0, "completions/min_length": 223.0, "epoch": 0.1315136476426799, "grad_norm": 0.179074227809906, "kl": 0.05230712890625, "learning_rate": 9.969931387511447e-06, "loss": 0.0005236975848674774, "memory(GiB)": 38.95, "reward": 0.40549352765083313, "reward_std": 0.0653383731842041, "rewards/VisualizationJSONCombinedORM/mean": 0.40549352765083313, "rewards/VisualizationJSONCombinedORM/std": 0.07778842747211456, "step": 159, "train_speed(iter/s)": 0.027416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 255.1875, "completions/min_length": 215.0, "epoch": 0.13234077750206782, "grad_norm": 0.20142607390880585, "kl": 0.05694580078125, "learning_rate": 9.96832970056693e-06, "loss": 0.000571027398109436, "memory(GiB)": 38.95, "reward": 0.4513774812221527, "reward_std": 0.08796633780002594, "rewards/VisualizationJSONCombinedORM/mean": 0.4513774812221527, "rewards/VisualizationJSONCombinedORM/std": 0.21094927191734314, "step": 160, "train_speed(iter/s)": 0.027475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 266.1875, "completions/min_length": 223.0, "epoch": 0.13316790736145576, "grad_norm": 0.2090325951576233, "kl": 0.07073974609375, "learning_rate": 9.96668658961975e-06, "loss": 0.0007076803594827652, "memory(GiB)": 38.95, "reward": 0.4226635694503784, "reward_std": 0.09986946731805801, "rewards/VisualizationJSONCombinedORM/mean": 0.4226635694503784, "rewards/VisualizationJSONCombinedORM/std": 0.1638367474079132, "step": 161, "train_speed(iter/s)": 0.027556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 250.75, "completions/min_length": 216.0, "epoch": 0.13399503722084366, "grad_norm": 0.1915542334318161, "kl": 0.1014404296875, "learning_rate": 9.965002068369529e-06, "loss": 0.0010152310132980347, "memory(GiB)": 38.95, "reward": 0.5956462025642395, "reward_std": 0.14480029046535492, "rewards/VisualizationJSONCombinedORM/mean": 0.5956462025642395, "rewards/VisualizationJSONCombinedORM/std": 0.1528318226337433, "step": 162, "train_speed(iter/s)": 0.027617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 255.6875, "completions/min_length": 207.0, "epoch": 0.1348221670802316, "grad_norm": 0.2243652194738388, "kl": 0.06024169921875, "learning_rate": 9.963276150861145e-06, "loss": 0.0006020963191986084, "memory(GiB)": 38.95, "reward": 0.458371102809906, "reward_std": 0.0982431173324585, "rewards/VisualizationJSONCombinedORM/mean": 0.458371102809906, "rewards/VisualizationJSONCombinedORM/std": 0.22360634803771973, "step": 163, "train_speed(iter/s)": 0.027648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 269.3125, "completions/min_length": 234.0, "epoch": 0.13564929693961952, "grad_norm": 0.20501665771007538, "kl": 0.09478759765625, "learning_rate": 9.961508851484635e-06, "loss": 0.0009487010538578033, "memory(GiB)": 38.95, "reward": 0.20720313489437103, "reward_std": 0.032616496086120605, "rewards/VisualizationJSONCombinedORM/mean": 0.20720313489437103, "rewards/VisualizationJSONCombinedORM/std": 0.03457450121641159, "step": 164, "train_speed(iter/s)": 0.027676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 256.25, "completions/min_length": 207.0, "epoch": 0.13647642679900746, "grad_norm": 0.20169539749622345, "kl": 0.076416015625, "learning_rate": 9.95970018497505e-06, "loss": 0.0007632598280906677, "memory(GiB)": 38.95, "reward": 0.296558141708374, "reward_std": 0.07393553107976913, "rewards/VisualizationJSONCombinedORM/mean": 0.296558141708374, "rewards/VisualizationJSONCombinedORM/std": 0.1102294847369194, "step": 165, "train_speed(iter/s)": 0.027736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 273.5625, "completions/min_length": 221.0, "epoch": 0.13730355665839536, "grad_norm": 0.20982064306735992, "kl": 0.089599609375, "learning_rate": 9.957850166412348e-06, "loss": 0.0008964240550994873, "memory(GiB)": 38.95, "reward": 0.5116703510284424, "reward_std": 0.09904651343822479, "rewards/VisualizationJSONCombinedORM/mean": 0.5116703510284424, "rewards/VisualizationJSONCombinedORM/std": 0.1916923075914383, "step": 166, "train_speed(iter/s)": 0.027785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 258.125, "completions/min_length": 223.0, "epoch": 0.1381306865177833, "grad_norm": 0.21633115410804749, "kl": 0.1175537109375, "learning_rate": 9.95595881122127e-06, "loss": 0.0011779889464378357, "memory(GiB)": 38.95, "reward": 0.5373896360397339, "reward_std": 0.11404556035995483, "rewards/VisualizationJSONCombinedORM/mean": 0.5373896360397339, "rewards/VisualizationJSONCombinedORM/std": 0.16199548542499542, "step": 167, "train_speed(iter/s)": 0.027858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 246.6875, "completions/min_length": 218.0, "epoch": 0.13895781637717122, "grad_norm": 0.23985426127910614, "kl": 0.09991455078125, "learning_rate": 9.954026135171194e-06, "loss": 0.001001942902803421, "memory(GiB)": 38.95, "reward": 0.5600293278694153, "reward_std": 0.1101062148809433, "rewards/VisualizationJSONCombinedORM/mean": 0.5600293278694153, "rewards/VisualizationJSONCombinedORM/std": 0.11101371794939041, "step": 168, "train_speed(iter/s)": 0.027935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 275.6875, "completions/min_length": 223.0, "epoch": 0.13978494623655913, "grad_norm": 0.2320019155740738, "kl": 0.125, "learning_rate": 9.952052154376027e-06, "loss": 0.0012513697147369385, "memory(GiB)": 38.95, "reward": 0.3748459219932556, "reward_std": 0.10449467599391937, "rewards/VisualizationJSONCombinedORM/mean": 0.3748459219932556, "rewards/VisualizationJSONCombinedORM/std": 0.2146061658859253, "step": 169, "train_speed(iter/s)": 0.027983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 251.0625, "completions/min_length": 217.0, "epoch": 0.14061207609594706, "grad_norm": 0.22412671148777008, "kl": 0.1171875, "learning_rate": 9.950036885294052e-06, "loss": 0.0011685118079185486, "memory(GiB)": 38.95, "reward": 0.5657688975334167, "reward_std": 0.10918079316616058, "rewards/VisualizationJSONCombinedORM/mean": 0.5657688975334167, "rewards/VisualizationJSONCombinedORM/std": 0.11924164742231369, "step": 170, "train_speed(iter/s)": 0.02804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 293.625, "completions/min_length": 249.0, "epoch": 0.141439205955335, "grad_norm": 0.19151027500629425, "kl": 0.072998046875, "learning_rate": 9.947980344727799e-06, "loss": 0.0007285289466381073, "memory(GiB)": 38.95, "reward": 0.5527814626693726, "reward_std": 0.12194909155368805, "rewards/VisualizationJSONCombinedORM/mean": 0.5527814626693726, "rewards/VisualizationJSONCombinedORM/std": 0.1814371943473816, "step": 171, "train_speed(iter/s)": 0.02808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 262.9375, "completions/min_length": 215.0, "epoch": 0.14226633581472292, "grad_norm": 0.19652101397514343, "kl": 0.1038818359375, "learning_rate": 9.945882549823906e-06, "loss": 0.0010393410921096802, "memory(GiB)": 38.95, "reward": 0.6258490085601807, "reward_std": 0.15656426548957825, "rewards/VisualizationJSONCombinedORM/mean": 0.6258490085601807, "rewards/VisualizationJSONCombinedORM/std": 0.1853366196155548, "step": 172, "train_speed(iter/s)": 0.02813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 276.8125, "completions/min_length": 207.0, "epoch": 0.14309346567411083, "grad_norm": 0.20590083301067352, "kl": 0.08154296875, "learning_rate": 9.943743518072971e-06, "loss": 0.0008139126002788544, "memory(GiB)": 38.95, "reward": 0.541608452796936, "reward_std": 0.11864335834980011, "rewards/VisualizationJSONCombinedORM/mean": 0.541608452796936, "rewards/VisualizationJSONCombinedORM/std": 0.1847870945930481, "step": 173, "train_speed(iter/s)": 0.028181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 268.4375, "completions/min_length": 236.0, "epoch": 0.14392059553349876, "grad_norm": 0.20102408528327942, "kl": 0.083251953125, "learning_rate": 9.941563267309409e-06, "loss": 0.0008310750126838684, "memory(GiB)": 38.95, "reward": 0.2838311195373535, "reward_std": 0.06876488775014877, "rewards/VisualizationJSONCombinedORM/mean": 0.2838311195373535, "rewards/VisualizationJSONCombinedORM/std": 0.14053364098072052, "step": 174, "train_speed(iter/s)": 0.028237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 266.0, "completions/min_length": 202.0, "epoch": 0.1447477253928867, "grad_norm": 0.20543619990348816, "kl": 0.126708984375, "learning_rate": 9.939341815711302e-06, "loss": 0.0012703314423561096, "memory(GiB)": 38.95, "reward": 0.323303759098053, "reward_std": 0.09156471490859985, "rewards/VisualizationJSONCombinedORM/mean": 0.323303759098053, "rewards/VisualizationJSONCombinedORM/std": 0.11510197073221207, "step": 175, "train_speed(iter/s)": 0.028277 }, { "epoch": 0.1447477253928867, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 308.3333333333333, "eval_completions/mean_length": 262.7135416666667, "eval_completions/min_length": 227.75, "eval_kl": 0.08758544921875, "eval_loss": 0.0008794975583441556, "eval_reward": 0.4559529845913251, "eval_reward_std": 0.09316190083821614, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4559529845913251, "eval_rewards/VisualizationJSONCombinedORM/std": 0.09316190138148765, "eval_runtime": 275.7756, "eval_samples_per_second": 0.087, "eval_steps_per_second": 0.011, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 272.5, "completions/min_length": 229.0, "epoch": 0.14557485525227462, "grad_norm": 0.1939128190279007, "kl": 0.1163330078125, "learning_rate": 9.937079181800256e-06, "loss": 0.0011663027107715607, "memory(GiB)": 38.95, "reward": 0.5229030847549438, "reward_std": 0.13476547598838806, "rewards/VisualizationJSONCombinedORM/mean": 0.5229030847549438, "rewards/VisualizationJSONCombinedORM/std": 0.17417742311954498, "step": 176, "train_speed(iter/s)": 0.027116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/mean_length": 248.9375, "completions/min_length": 215.0, "epoch": 0.14640198511166252, "grad_norm": 0.20743156969547272, "kl": 0.099365234375, "learning_rate": 9.93477538444123e-06, "loss": 0.0009927116334438324, "memory(GiB)": 38.95, "reward": 0.4242471158504486, "reward_std": 0.09575492143630981, "rewards/VisualizationJSONCombinedORM/mean": 0.4242471158504486, "rewards/VisualizationJSONCombinedORM/std": 0.10021141916513443, "step": 177, "train_speed(iter/s)": 0.027169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/mean_length": 243.25, "completions/min_length": 215.0, "epoch": 0.14722911497105046, "grad_norm": 0.20431950688362122, "kl": 0.0751953125, "learning_rate": 9.93243044284239e-06, "loss": 0.0007507279515266418, "memory(GiB)": 38.95, "reward": 0.46451306343078613, "reward_std": 0.06578619033098221, "rewards/VisualizationJSONCombinedORM/mean": 0.46451306343078613, "rewards/VisualizationJSONCombinedORM/std": 0.25986090302467346, "step": 178, "train_speed(iter/s)": 0.027224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 284.875, "completions/min_length": 222.0, "epoch": 0.1480562448304384, "grad_norm": 0.21861892938613892, "kl": 0.06573486328125, "learning_rate": 9.930044376554948e-06, "loss": 0.0006571821868419647, "memory(GiB)": 38.95, "reward": 0.33389565348625183, "reward_std": 0.06661220639944077, "rewards/VisualizationJSONCombinedORM/mean": 0.33389565348625183, "rewards/VisualizationJSONCombinedORM/std": 0.07318571209907532, "step": 179, "train_speed(iter/s)": 0.027292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 270.8125, "completions/min_length": 236.0, "epoch": 0.1488833746898263, "grad_norm": 0.20671769976615906, "kl": 0.07684326171875, "learning_rate": 9.927617205473001e-06, "loss": 0.0007670298218727112, "memory(GiB)": 38.95, "reward": 0.4452974796295166, "reward_std": 0.07037388533353806, "rewards/VisualizationJSONCombinedORM/mean": 0.4452974796295166, "rewards/VisualizationJSONCombinedORM/std": 0.17173801362514496, "step": 180, "train_speed(iter/s)": 0.027337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 256.5, "completions/min_length": 221.0, "epoch": 0.14971050454921422, "grad_norm": 0.20143824815750122, "kl": 0.0760498046875, "learning_rate": 9.925148949833356e-06, "loss": 0.0007606707513332367, "memory(GiB)": 38.95, "reward": 0.6896728277206421, "reward_std": 0.12716075778007507, "rewards/VisualizationJSONCombinedORM/mean": 0.6896728277206421, "rewards/VisualizationJSONCombinedORM/std": 0.12931062281131744, "step": 181, "train_speed(iter/s)": 0.027368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 265.625, "completions/min_length": 216.0, "epoch": 0.15053763440860216, "grad_norm": 0.20818907022476196, "kl": 0.05645751953125, "learning_rate": 9.92263963021537e-06, "loss": 0.0005630329251289368, "memory(GiB)": 38.95, "reward": 0.41280287504196167, "reward_std": 0.05737457051873207, "rewards/VisualizationJSONCombinedORM/mean": 0.41280287504196167, "rewards/VisualizationJSONCombinedORM/std": 0.22445927560329437, "step": 182, "train_speed(iter/s)": 0.02742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 273.1875, "completions/min_length": 229.0, "epoch": 0.1513647642679901, "grad_norm": 0.320994108915329, "kl": 0.0750732421875, "learning_rate": 9.920089267540774e-06, "loss": 0.0007510259747505188, "memory(GiB)": 38.95, "reward": 0.48029446601867676, "reward_std": 0.07570455968379974, "rewards/VisualizationJSONCombinedORM/mean": 0.48029446601867676, "rewards/VisualizationJSONCombinedORM/std": 0.23783689737319946, "step": 183, "train_speed(iter/s)": 0.027457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 290.5, "completions/min_length": 242.0, "epoch": 0.152191894127378, "grad_norm": 0.2142488956451416, "kl": 0.0947265625, "learning_rate": 9.9174978830735e-06, "loss": 0.0009488696232438087, "memory(GiB)": 38.95, "reward": 0.46027112007141113, "reward_std": 0.08323042094707489, "rewards/VisualizationJSONCombinedORM/mean": 0.46027112007141113, "rewards/VisualizationJSONCombinedORM/std": 0.14959006011486053, "step": 184, "train_speed(iter/s)": 0.027502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/mean_length": 272.75, "completions/min_length": 239.0, "epoch": 0.15301902398676592, "grad_norm": 0.22146828472614288, "kl": 0.11114501953125, "learning_rate": 9.91486549841951e-06, "loss": 0.0011119060218334198, "memory(GiB)": 38.95, "reward": 0.440010666847229, "reward_std": 0.12777937948703766, "rewards/VisualizationJSONCombinedORM/mean": 0.440010666847229, "rewards/VisualizationJSONCombinedORM/std": 0.1640411615371704, "step": 185, "train_speed(iter/s)": 0.027552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/mean_length": 241.0, "completions/min_length": 219.0, "epoch": 0.15384615384615385, "grad_norm": 0.18531924486160278, "kl": 0.08123779296875, "learning_rate": 9.9121921355266e-06, "loss": 0.0008136667311191559, "memory(GiB)": 38.95, "reward": 0.36938759684562683, "reward_std": 0.06594273447990417, "rewards/VisualizationJSONCombinedORM/mean": 0.36938759684562683, "rewards/VisualizationJSONCombinedORM/std": 0.06936048716306686, "step": 186, "train_speed(iter/s)": 0.027623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 277.5625, "completions/min_length": 228.0, "epoch": 0.15467328370554176, "grad_norm": 0.23779405653476715, "kl": 0.10107421875, "learning_rate": 9.909477816684232e-06, "loss": 0.0010091736912727356, "memory(GiB)": 38.95, "reward": 0.4148039221763611, "reward_std": 0.09647826850414276, "rewards/VisualizationJSONCombinedORM/mean": 0.4148039221763611, "rewards/VisualizationJSONCombinedORM/std": 0.09986911714076996, "step": 187, "train_speed(iter/s)": 0.027673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 264.5, "completions/min_length": 209.0, "epoch": 0.1555004135649297, "grad_norm": 0.2621404528617859, "kl": 0.06781005859375, "learning_rate": 9.906722564523342e-06, "loss": 0.000679798424243927, "memory(GiB)": 38.95, "reward": 0.433694064617157, "reward_std": 0.06175607070326805, "rewards/VisualizationJSONCombinedORM/mean": 0.433694064617157, "rewards/VisualizationJSONCombinedORM/std": 0.22447749972343445, "step": 188, "train_speed(iter/s)": 0.027727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 263.5, "completions/min_length": 219.0, "epoch": 0.15632754342431762, "grad_norm": 0.20167392492294312, "kl": 0.08929443359375, "learning_rate": 9.903926402016153e-06, "loss": 0.0008941441774368286, "memory(GiB)": 38.95, "reward": 0.43204110860824585, "reward_std": 0.10287231206893921, "rewards/VisualizationJSONCombinedORM/mean": 0.43204110860824585, "rewards/VisualizationJSONCombinedORM/std": 0.12850433588027954, "step": 189, "train_speed(iter/s)": 0.02776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 260.1875, "completions/min_length": 221.0, "epoch": 0.15715467328370555, "grad_norm": 0.2112402468919754, "kl": 0.0784912109375, "learning_rate": 9.90108935247598e-06, "loss": 0.0007850974798202515, "memory(GiB)": 38.95, "reward": 0.5903710126876831, "reward_std": 0.14418110251426697, "rewards/VisualizationJSONCombinedORM/mean": 0.5903710126876831, "rewards/VisualizationJSONCombinedORM/std": 0.163186714053154, "step": 190, "train_speed(iter/s)": 0.027811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 262.5625, "completions/min_length": 206.0, "epoch": 0.15798180314309346, "grad_norm": 0.22896113991737366, "kl": 0.0875244140625, "learning_rate": 9.898211439557041e-06, "loss": 0.0008765421807765961, "memory(GiB)": 38.95, "reward": 0.5275944471359253, "reward_std": 0.06268803775310516, "rewards/VisualizationJSONCombinedORM/mean": 0.5275944471359253, "rewards/VisualizationJSONCombinedORM/std": 0.20921272039413452, "step": 191, "train_speed(iter/s)": 0.027858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 251.9375, "completions/min_length": 200.0, "epoch": 0.1588089330024814, "grad_norm": 0.19753515720367432, "kl": 0.102294921875, "learning_rate": 9.895292687254256e-06, "loss": 0.001021604984998703, "memory(GiB)": 38.95, "reward": 0.675472617149353, "reward_std": 0.1136307567358017, "rewards/VisualizationJSONCombinedORM/mean": 0.675472617149353, "rewards/VisualizationJSONCombinedORM/std": 0.11244551837444305, "step": 192, "train_speed(iter/s)": 0.027918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 266.875, "completions/min_length": 205.0, "epoch": 0.15963606286186932, "grad_norm": 0.21443302929401398, "kl": 0.082763671875, "learning_rate": 9.892333119903045e-06, "loss": 0.0008257590234279633, "memory(GiB)": 38.95, "reward": 0.616775631904602, "reward_std": 0.11453084647655487, "rewards/VisualizationJSONCombinedORM/mean": 0.616775631904602, "rewards/VisualizationJSONCombinedORM/std": 0.1283320039510727, "step": 193, "train_speed(iter/s)": 0.027968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 251.3125, "completions/min_length": 223.0, "epoch": 0.16046319272125723, "grad_norm": 0.22648122906684875, "kl": 0.07037353515625, "learning_rate": 9.889332762179134e-06, "loss": 0.0007036216557025909, "memory(GiB)": 38.95, "reward": 0.5190675258636475, "reward_std": 0.13673686981201172, "rewards/VisualizationJSONCombinedORM/mean": 0.5190675258636475, "rewards/VisualizationJSONCombinedORM/std": 0.13428063690662384, "step": 194, "train_speed(iter/s)": 0.028016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 265.5, "completions/min_length": 215.0, "epoch": 0.16129032258064516, "grad_norm": 0.21998129785060883, "kl": 0.11767578125, "learning_rate": 9.886291639098338e-06, "loss": 0.001177072525024414, "memory(GiB)": 38.95, "reward": 0.4951602816581726, "reward_std": 0.13374952971935272, "rewards/VisualizationJSONCombinedORM/mean": 0.4951602816581726, "rewards/VisualizationJSONCombinedORM/std": 0.20947176218032837, "step": 195, "train_speed(iter/s)": 0.028068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 251.3125, "completions/min_length": 204.0, "epoch": 0.1621174524400331, "grad_norm": 0.22356818616390228, "kl": 0.04693603515625, "learning_rate": 9.883209776016362e-06, "loss": 0.0004684925079345703, "memory(GiB)": 38.95, "reward": 0.37186700105667114, "reward_std": 0.09663483500480652, "rewards/VisualizationJSONCombinedORM/mean": 0.37186700105667114, "rewards/VisualizationJSONCombinedORM/std": 0.12178278714418411, "step": 196, "train_speed(iter/s)": 0.028106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/mean_length": 245.1875, "completions/min_length": 218.0, "epoch": 0.16294458229942102, "grad_norm": 0.2000282108783722, "kl": 0.08154296875, "learning_rate": 9.880087198628579e-06, "loss": 0.0008139237761497498, "memory(GiB)": 38.95, "reward": 0.342764675617218, "reward_std": 0.05368073284626007, "rewards/VisualizationJSONCombinedORM/mean": 0.342764675617218, "rewards/VisualizationJSONCombinedORM/std": 0.06641518324613571, "step": 197, "train_speed(iter/s)": 0.028161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 254.5625, "completions/min_length": 225.0, "epoch": 0.16377171215880892, "grad_norm": 0.2119799107313156, "kl": 0.069091796875, "learning_rate": 9.876923932969828e-06, "loss": 0.0006899293512105942, "memory(GiB)": 38.95, "reward": 0.5657061338424683, "reward_std": 0.08050379902124405, "rewards/VisualizationJSONCombinedORM/mean": 0.5657061338424683, "rewards/VisualizationJSONCombinedORM/std": 0.1313706338405609, "step": 198, "train_speed(iter/s)": 0.028206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 248.6875, "completions/min_length": 209.0, "epoch": 0.16459884201819686, "grad_norm": 0.232082799077034, "kl": 0.0701904296875, "learning_rate": 9.873720005414192e-06, "loss": 0.000702202320098877, "memory(GiB)": 38.95, "reward": 0.658155620098114, "reward_std": 0.15999102592468262, "rewards/VisualizationJSONCombinedORM/mean": 0.658155620098114, "rewards/VisualizationJSONCombinedORM/std": 0.15500925481319427, "step": 199, "train_speed(iter/s)": 0.028252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 268.375, "completions/min_length": 223.0, "epoch": 0.1654259718775848, "grad_norm": 0.2278880774974823, "kl": 0.07232666015625, "learning_rate": 9.870475442674768e-06, "loss": 0.0007230862975120544, "memory(GiB)": 38.95, "reward": 0.45341068506240845, "reward_std": 0.1293564736843109, "rewards/VisualizationJSONCombinedORM/mean": 0.45341068506240845, "rewards/VisualizationJSONCombinedORM/std": 0.15446294844150543, "step": 200, "train_speed(iter/s)": 0.028299 }, { "epoch": 0.1654259718775848, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 295.4583333333333, "eval_completions/mean_length": 253.04166666666666, "eval_completions/min_length": 221.75, "eval_kl": 0.08258056640625, "eval_loss": 0.0008302293717861176, "eval_reward": 0.47523769612113637, "eval_reward_std": 0.09047069610096514, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47523769612113637, "eval_rewards/VisualizationJSONCombinedORM/std": 0.09047069850688179, "eval_runtime": 267.7675, "eval_samples_per_second": 0.09, "eval_steps_per_second": 0.011, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/mean_length": 227.6875, "completions/min_length": 208.0, "epoch": 0.1662531017369727, "grad_norm": 0.2049998939037323, "kl": 0.0858154296875, "learning_rate": 9.867190271803466e-06, "loss": 0.0008569508790969849, "memory(GiB)": 38.95, "reward": 0.5831426382064819, "reward_std": 0.10406290739774704, "rewards/VisualizationJSONCombinedORM/mean": 0.5831426382064819, "rewards/VisualizationJSONCombinedORM/std": 0.13241223990917206, "step": 201, "train_speed(iter/s)": 0.027326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/mean_length": 231.9375, "completions/min_length": 180.0, "epoch": 0.16708023159636062, "grad_norm": 0.24690844118595123, "kl": 0.1492919921875, "learning_rate": 9.863864520190758e-06, "loss": 0.0014947354793548584, "memory(GiB)": 38.95, "reward": 0.5198575854301453, "reward_std": 0.17276525497436523, "rewards/VisualizationJSONCombinedORM/mean": 0.5198575854301453, "rewards/VisualizationJSONCombinedORM/std": 0.16882778704166412, "step": 202, "train_speed(iter/s)": 0.027373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 245.6875, "completions/min_length": 204.0, "epoch": 0.16790736145574855, "grad_norm": 0.18500851094722748, "kl": 0.1317138671875, "learning_rate": 9.860498215565473e-06, "loss": 0.0013173297047615051, "memory(GiB)": 38.95, "reward": 0.4905198812484741, "reward_std": 0.13312509655952454, "rewards/VisualizationJSONCombinedORM/mean": 0.4905198812484741, "rewards/VisualizationJSONCombinedORM/std": 0.14514310657978058, "step": 203, "train_speed(iter/s)": 0.027421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 247.1875, "completions/min_length": 206.0, "epoch": 0.1687344913151365, "grad_norm": 0.2440861165523529, "kl": 0.079833984375, "learning_rate": 9.85709138599455e-06, "loss": 0.0007990412414073944, "memory(GiB)": 38.95, "reward": 0.5247654914855957, "reward_std": 0.11209090054035187, "rewards/VisualizationJSONCombinedORM/mean": 0.5247654914855957, "rewards/VisualizationJSONCombinedORM/std": 0.1734558343887329, "step": 204, "train_speed(iter/s)": 0.027472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 249.5625, "completions/min_length": 199.0, "epoch": 0.1695616211745244, "grad_norm": 0.2999682128429413, "kl": 0.0860595703125, "learning_rate": 9.853644059882812e-06, "loss": 0.0008617639541625977, "memory(GiB)": 38.95, "reward": 0.39775794744491577, "reward_std": 0.14906403422355652, "rewards/VisualizationJSONCombinedORM/mean": 0.39775794744491577, "rewards/VisualizationJSONCombinedORM/std": 0.14613693952560425, "step": 205, "train_speed(iter/s)": 0.0275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/mean_length": 254.375, "completions/min_length": 211.0, "epoch": 0.17038875103391232, "grad_norm": 0.21556894481182098, "kl": 0.0850830078125, "learning_rate": 9.850156265972722e-06, "loss": 0.0008508935570716858, "memory(GiB)": 38.95, "reward": 0.405191570520401, "reward_std": 0.10810934752225876, "rewards/VisualizationJSONCombinedORM/mean": 0.405191570520401, "rewards/VisualizationJSONCombinedORM/std": 0.12872841954231262, "step": 206, "train_speed(iter/s)": 0.027568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 255.25, "completions/min_length": 190.0, "epoch": 0.17121588089330025, "grad_norm": 0.21427218616008759, "kl": 0.0780029296875, "learning_rate": 9.84662803334415e-06, "loss": 0.0007804557681083679, "memory(GiB)": 38.95, "reward": 0.3102916479110718, "reward_std": 0.1115175113081932, "rewards/VisualizationJSONCombinedORM/mean": 0.3102916479110718, "rewards/VisualizationJSONCombinedORM/std": 0.15342305600643158, "step": 207, "train_speed(iter/s)": 0.027634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/mean_length": 225.75, "completions/min_length": 191.0, "epoch": 0.17204301075268819, "grad_norm": 0.2169770449399948, "kl": 0.0809326171875, "learning_rate": 9.84305939141413e-06, "loss": 0.0008095353841781616, "memory(GiB)": 38.95, "reward": 0.32049617171287537, "reward_std": 0.08596335351467133, "rewards/VisualizationJSONCombinedORM/mean": 0.32049617171287537, "rewards/VisualizationJSONCombinedORM/std": 0.15530194342136383, "step": 208, "train_speed(iter/s)": 0.027688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 233.375, "completions/min_length": 192.0, "epoch": 0.1728701406120761, "grad_norm": 0.21537406742572784, "kl": 0.07830810546875, "learning_rate": 9.839450369936615e-06, "loss": 0.0007815882563591003, "memory(GiB)": 38.95, "reward": 0.4236185848712921, "reward_std": 0.06617939472198486, "rewards/VisualizationJSONCombinedORM/mean": 0.4236185848712921, "rewards/VisualizationJSONCombinedORM/std": 0.12534043192863464, "step": 209, "train_speed(iter/s)": 0.027733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 229.375, "completions/min_length": 203.0, "epoch": 0.17369727047146402, "grad_norm": 0.231692835688591, "kl": 0.054443359375, "learning_rate": 9.835800999002218e-06, "loss": 0.0005446076393127441, "memory(GiB)": 38.95, "reward": 0.5722898244857788, "reward_std": 0.14035683870315552, "rewards/VisualizationJSONCombinedORM/mean": 0.5722898244857788, "rewards/VisualizationJSONCombinedORM/std": 0.1363520473241806, "step": 210, "train_speed(iter/s)": 0.027775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 241.4375, "completions/min_length": 202.0, "epoch": 0.17452440033085195, "grad_norm": 0.2005119025707245, "kl": 0.0810546875, "learning_rate": 9.832111309037979e-06, "loss": 0.0008099712431430817, "memory(GiB)": 38.95, "reward": 0.31197261810302734, "reward_std": 0.04129549115896225, "rewards/VisualizationJSONCombinedORM/mean": 0.31197261810302734, "rewards/VisualizationJSONCombinedORM/std": 0.11208729445934296, "step": 211, "train_speed(iter/s)": 0.027814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/mean_length": 258.875, "completions/min_length": 203.0, "epoch": 0.17535153019023986, "grad_norm": 0.24424606561660767, "kl": 0.0850830078125, "learning_rate": 9.8283813308071e-06, "loss": 0.0008515343070030212, "memory(GiB)": 38.95, "reward": 0.39775392413139343, "reward_std": 0.14136649668216705, "rewards/VisualizationJSONCombinedORM/mean": 0.39775392413139343, "rewards/VisualizationJSONCombinedORM/std": 0.1609075665473938, "step": 212, "train_speed(iter/s)": 0.027868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 239.5, "completions/min_length": 202.0, "epoch": 0.1761786600496278, "grad_norm": 0.2128223478794098, "kl": 0.054931640625, "learning_rate": 9.824611095408691e-06, "loss": 0.0005493834614753723, "memory(GiB)": 38.95, "reward": 0.3284393548965454, "reward_std": 0.12910285592079163, "rewards/VisualizationJSONCombinedORM/mean": 0.3284393548965454, "rewards/VisualizationJSONCombinedORM/std": 0.1380840539932251, "step": 213, "train_speed(iter/s)": 0.027899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 252.0, "completions/min_length": 211.0, "epoch": 0.17700578990901572, "grad_norm": 0.21681946516036987, "kl": 0.111083984375, "learning_rate": 9.820800634277505e-06, "loss": 0.001111382618546486, "memory(GiB)": 38.95, "reward": 0.6001484394073486, "reward_std": 0.14565421640872955, "rewards/VisualizationJSONCombinedORM/mean": 0.6001484394073486, "rewards/VisualizationJSONCombinedORM/std": 0.16927708685398102, "step": 214, "train_speed(iter/s)": 0.027962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 252.9375, "completions/min_length": 221.0, "epoch": 0.17783291976840365, "grad_norm": 0.212870791554451, "kl": 0.059326171875, "learning_rate": 9.816949979183692e-06, "loss": 0.0005909949541091919, "memory(GiB)": 38.95, "reward": 0.2384912371635437, "reward_std": 0.04397374019026756, "rewards/VisualizationJSONCombinedORM/mean": 0.2384912371635437, "rewards/VisualizationJSONCombinedORM/std": 0.1037481501698494, "step": 215, "train_speed(iter/s)": 0.027999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 247.1875, "completions/min_length": 214.0, "epoch": 0.17866004962779156, "grad_norm": 0.196989968419075, "kl": 0.0394287109375, "learning_rate": 9.813059162232517e-06, "loss": 0.00039442628622055054, "memory(GiB)": 38.95, "reward": 0.4950277507305145, "reward_std": 0.10349979996681213, "rewards/VisualizationJSONCombinedORM/mean": 0.4950277507305145, "rewards/VisualizationJSONCombinedORM/std": 0.1275572031736374, "step": 216, "train_speed(iter/s)": 0.028021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 246.6875, "completions/min_length": 196.0, "epoch": 0.1794871794871795, "grad_norm": 0.1987491399049759, "kl": 0.057373046875, "learning_rate": 9.809128215864096e-06, "loss": 0.0005757734179496765, "memory(GiB)": 38.95, "reward": 0.6180538535118103, "reward_std": 0.13504168391227722, "rewards/VisualizationJSONCombinedORM/mean": 0.6180538535118103, "rewards/VisualizationJSONCombinedORM/std": 0.13914920389652252, "step": 217, "train_speed(iter/s)": 0.028068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/mean_length": 217.875, "completions/min_length": 188.0, "epoch": 0.18031430934656742, "grad_norm": 0.19572235643863678, "kl": 0.05523681640625, "learning_rate": 9.805157172853137e-06, "loss": 0.000552300363779068, "memory(GiB)": 38.95, "reward": 0.6946470141410828, "reward_std": 0.13023175299167633, "rewards/VisualizationJSONCombinedORM/mean": 0.6946470141410828, "rewards/VisualizationJSONCombinedORM/std": 0.1597507745027542, "step": 218, "train_speed(iter/s)": 0.028121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 240.75, "completions/min_length": 217.0, "epoch": 0.18114143920595532, "grad_norm": 0.24163095653057098, "kl": 0.0843505859375, "learning_rate": 9.80114606630865e-06, "loss": 0.000843685120344162, "memory(GiB)": 38.95, "reward": 0.4760308861732483, "reward_std": 0.1398252695798874, "rewards/VisualizationJSONCombinedORM/mean": 0.4760308861732483, "rewards/VisualizationJSONCombinedORM/std": 0.1442095786333084, "step": 219, "train_speed(iter/s)": 0.028169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/mean_length": 243.625, "completions/min_length": 208.0, "epoch": 0.18196856906534326, "grad_norm": 0.22586296498775482, "kl": 0.0640869140625, "learning_rate": 9.797094929673688e-06, "loss": 0.0006407275795936584, "memory(GiB)": 38.95, "reward": 0.6437135934829712, "reward_std": 0.1344442069530487, "rewards/VisualizationJSONCombinedORM/mean": 0.6437135934829712, "rewards/VisualizationJSONCombinedORM/std": 0.13402745127677917, "step": 220, "train_speed(iter/s)": 0.028208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 246.0625, "completions/min_length": 211.0, "epoch": 0.1827956989247312, "grad_norm": 0.22367994487285614, "kl": 0.08380126953125, "learning_rate": 9.793003796725049e-06, "loss": 0.0008374303579330444, "memory(GiB)": 38.95, "reward": 0.46753832697868347, "reward_std": 0.060435377061367035, "rewards/VisualizationJSONCombinedORM/mean": 0.46753832697868347, "rewards/VisualizationJSONCombinedORM/std": 0.16569778323173523, "step": 221, "train_speed(iter/s)": 0.028229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/mean_length": 227.1875, "completions/min_length": 215.0, "epoch": 0.18362282878411912, "grad_norm": 0.1782565712928772, "kl": 0.060302734375, "learning_rate": 9.788872701573013e-06, "loss": 0.0006019845604896545, "memory(GiB)": 38.95, "reward": 0.5097800493240356, "reward_std": 0.1010248064994812, "rewards/VisualizationJSONCombinedORM/mean": 0.5097800493240356, "rewards/VisualizationJSONCombinedORM/std": 0.11858845502138138, "step": 222, "train_speed(iter/s)": 0.028278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/mean_length": 253.3125, "completions/min_length": 207.0, "epoch": 0.18444995864350702, "grad_norm": 0.20405089855194092, "kl": 0.0643310546875, "learning_rate": 9.784701678661045e-06, "loss": 0.000642530620098114, "memory(GiB)": 38.95, "reward": 0.47745516896247864, "reward_std": 0.06550852954387665, "rewards/VisualizationJSONCombinedORM/mean": 0.47745516896247864, "rewards/VisualizationJSONCombinedORM/std": 0.2821834087371826, "step": 223, "train_speed(iter/s)": 0.028322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/mean_length": 235.9375, "completions/min_length": 192.0, "epoch": 0.18527708850289495, "grad_norm": 0.15247294306755066, "kl": 0.07147216796875, "learning_rate": 9.780490762765514e-06, "loss": 0.0007148757576942444, "memory(GiB)": 38.95, "reward": 0.4892740845680237, "reward_std": 0.07608969509601593, "rewards/VisualizationJSONCombinedORM/mean": 0.4892740845680237, "rewards/VisualizationJSONCombinedORM/std": 0.1962691694498062, "step": 224, "train_speed(iter/s)": 0.02836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 247.625, "completions/min_length": 224.0, "epoch": 0.18610421836228289, "grad_norm": 0.17675364017486572, "kl": 0.034393310546875, "learning_rate": 9.776239988995401e-06, "loss": 0.00034432485699653625, "memory(GiB)": 38.95, "reward": 0.5150632262229919, "reward_std": 0.12947450578212738, "rewards/VisualizationJSONCombinedORM/mean": 0.5150632262229919, "rewards/VisualizationJSONCombinedORM/std": 0.16395652294158936, "step": 225, "train_speed(iter/s)": 0.028414 }, { "epoch": 0.18610421836228289, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 275.25, "eval_completions/mean_length": 243.65104166666666, "eval_completions/min_length": 217.45833333333334, "eval_kl": 0.0576171875, "eval_loss": 0.0005771319265477359, "eval_reward": 0.47216571929554146, "eval_reward_std": 0.08977066298636298, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47216571929554146, "eval_rewards/VisualizationJSONCombinedORM/std": 0.08977066236548126, "eval_runtime": 255.7531, "eval_samples_per_second": 0.094, "eval_steps_per_second": 0.012, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 256.9375, "completions/min_length": 207.0, "epoch": 0.1869313482216708, "grad_norm": 0.22640112042427063, "kl": 0.095703125, "learning_rate": 9.771949392792005e-06, "loss": 0.0009567290544509888, "memory(GiB)": 38.95, "reward": 0.5216346383094788, "reward_std": 0.09412378817796707, "rewards/VisualizationJSONCombinedORM/mean": 0.5216346383094788, "rewards/VisualizationJSONCombinedORM/std": 0.1600925624370575, "step": 226, "train_speed(iter/s)": 0.027559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/mean_length": 247.0625, "completions/min_length": 204.0, "epoch": 0.18775847808105872, "grad_norm": 0.19718092679977417, "kl": 0.0887451171875, "learning_rate": 9.76761900992865e-06, "loss": 0.000890880823135376, "memory(GiB)": 38.95, "reward": 0.4503234624862671, "reward_std": 0.08620904386043549, "rewards/VisualizationJSONCombinedORM/mean": 0.4503234624862671, "rewards/VisualizationJSONCombinedORM/std": 0.2530075013637543, "step": 227, "train_speed(iter/s)": 0.027599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 245.0625, "completions/min_length": 200.0, "epoch": 0.18858560794044665, "grad_norm": 0.21433350443840027, "kl": 0.0677490234375, "learning_rate": 9.763248876510388e-06, "loss": 0.0006769970059394836, "memory(GiB)": 38.95, "reward": 0.5509445667266846, "reward_std": 0.1239115297794342, "rewards/VisualizationJSONCombinedORM/mean": 0.5509445667266846, "rewards/VisualizationJSONCombinedORM/std": 0.12830384075641632, "step": 228, "train_speed(iter/s)": 0.027628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 252.875, "completions/min_length": 207.0, "epoch": 0.18941273779983459, "grad_norm": 0.2097633183002472, "kl": 0.07470703125, "learning_rate": 9.758839028973693e-06, "loss": 0.0007465202361345291, "memory(GiB)": 38.95, "reward": 0.5392754673957825, "reward_std": 0.15034878253936768, "rewards/VisualizationJSONCombinedORM/mean": 0.5392754673957825, "rewards/VisualizationJSONCombinedORM/std": 0.1755189299583435, "step": 229, "train_speed(iter/s)": 0.027667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/mean_length": 241.125, "completions/min_length": 215.0, "epoch": 0.1902398676592225, "grad_norm": 0.207869753241539, "kl": 0.0623779296875, "learning_rate": 9.754389504086157e-06, "loss": 0.0006246566772460938, "memory(GiB)": 38.95, "reward": 0.3670133352279663, "reward_std": 0.08525373041629791, "rewards/VisualizationJSONCombinedORM/mean": 0.3670133352279663, "rewards/VisualizationJSONCombinedORM/std": 0.10268371552228928, "step": 230, "train_speed(iter/s)": 0.027716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 272.6875, "completions/min_length": 218.0, "epoch": 0.19106699751861042, "grad_norm": 0.19566360116004944, "kl": 0.0423583984375, "learning_rate": 9.749900338946193e-06, "loss": 0.00042384862899780273, "memory(GiB)": 38.95, "reward": 0.3765525817871094, "reward_std": 0.06270764023065567, "rewards/VisualizationJSONCombinedORM/mean": 0.3765525817871094, "rewards/VisualizationJSONCombinedORM/std": 0.08971662074327469, "step": 231, "train_speed(iter/s)": 0.02774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 259.25, "completions/min_length": 214.0, "epoch": 0.19189412737799835, "grad_norm": 0.2067040055990219, "kl": 0.04595947265625, "learning_rate": 9.745371570982715e-06, "loss": 0.0004602000117301941, "memory(GiB)": 39.01, "reward": 0.44061481952667236, "reward_std": 0.09726008772850037, "rewards/VisualizationJSONCombinedORM/mean": 0.44061481952667236, "rewards/VisualizationJSONCombinedORM/std": 0.11451906710863113, "step": 232, "train_speed(iter/s)": 0.027758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 251.3125, "completions/min_length": 211.0, "epoch": 0.19272125723738626, "grad_norm": 0.215209499001503, "kl": 0.05792236328125, "learning_rate": 9.74080323795483e-06, "loss": 0.0005800053477287292, "memory(GiB)": 39.01, "reward": 0.4883589446544647, "reward_std": 0.11652319133281708, "rewards/VisualizationJSONCombinedORM/mean": 0.4883589446544647, "rewards/VisualizationJSONCombinedORM/std": 0.12239092588424683, "step": 233, "train_speed(iter/s)": 0.027801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/mean_length": 238.125, "completions/min_length": 194.0, "epoch": 0.1935483870967742, "grad_norm": 0.20950676500797272, "kl": 0.0528564453125, "learning_rate": 9.736195377951525e-06, "loss": 0.0005280300974845886, "memory(GiB)": 39.01, "reward": 0.501238226890564, "reward_std": 0.1034289002418518, "rewards/VisualizationJSONCombinedORM/mean": 0.501238226890564, "rewards/VisualizationJSONCombinedORM/std": 0.13988588750362396, "step": 234, "train_speed(iter/s)": 0.02784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 274.375, "completions/min_length": 242.0, "epoch": 0.19437551695616212, "grad_norm": 0.21809644997119904, "kl": 0.0543212890625, "learning_rate": 9.731548029391345e-06, "loss": 0.0005424767732620239, "memory(GiB)": 39.01, "reward": 0.3945314288139343, "reward_std": 0.07348795235157013, "rewards/VisualizationJSONCombinedORM/mean": 0.3945314288139343, "rewards/VisualizationJSONCombinedORM/std": 0.10264281183481216, "step": 235, "train_speed(iter/s)": 0.027891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 254.375, "completions/min_length": 210.0, "epoch": 0.19520264681555005, "grad_norm": 0.17917849123477936, "kl": 0.04168701171875, "learning_rate": 9.726861231022078e-06, "loss": 0.00041620805859565735, "memory(GiB)": 39.01, "reward": 0.4872328042984009, "reward_std": 0.09134070575237274, "rewards/VisualizationJSONCombinedORM/mean": 0.4872328042984009, "rewards/VisualizationJSONCombinedORM/std": 0.18859656155109406, "step": 236, "train_speed(iter/s)": 0.027929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 243.1875, "completions/min_length": 214.0, "epoch": 0.19602977667493796, "grad_norm": 0.1860739141702652, "kl": 0.0521240234375, "learning_rate": 9.722135021920427e-06, "loss": 0.0005202367901802063, "memory(GiB)": 39.01, "reward": 0.26218509674072266, "reward_std": 0.031598594039678574, "rewards/VisualizationJSONCombinedORM/mean": 0.26218509674072266, "rewards/VisualizationJSONCombinedORM/std": 0.04890074580907822, "step": 237, "train_speed(iter/s)": 0.027969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 263.25, "completions/min_length": 212.0, "epoch": 0.1968569065343259, "grad_norm": 0.1821741908788681, "kl": 0.04541015625, "learning_rate": 9.717369441491686e-06, "loss": 0.000454166904091835, "memory(GiB)": 39.01, "reward": 0.6375573873519897, "reward_std": 0.15003857016563416, "rewards/VisualizationJSONCombinedORM/mean": 0.6375573873519897, "rewards/VisualizationJSONCombinedORM/std": 0.15131431818008423, "step": 238, "train_speed(iter/s)": 0.028009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 270.9375, "completions/min_length": 229.0, "epoch": 0.19768403639371382, "grad_norm": 0.19695161283016205, "kl": 0.0467529296875, "learning_rate": 9.712564529469417e-06, "loss": 0.0004678070545196533, "memory(GiB)": 39.01, "reward": 0.45501747727394104, "reward_std": 0.10583680123090744, "rewards/VisualizationJSONCombinedORM/mean": 0.45501747727394104, "rewards/VisualizationJSONCombinedORM/std": 0.10394206643104553, "step": 239, "train_speed(iter/s)": 0.02804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 295.125, "completions/min_length": 244.0, "epoch": 0.19851116625310175, "grad_norm": 0.1957579255104065, "kl": 0.0482177734375, "learning_rate": 9.707720325915105e-06, "loss": 0.0004824623465538025, "memory(GiB)": 39.01, "reward": 0.41247862577438354, "reward_std": 0.10493730753660202, "rewards/VisualizationJSONCombinedORM/mean": 0.41247862577438354, "rewards/VisualizationJSONCombinedORM/std": 0.2142544984817505, "step": 240, "train_speed(iter/s)": 0.028075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 273.9375, "completions/min_length": 213.0, "epoch": 0.19933829611248965, "grad_norm": 0.1732953041791916, "kl": 0.05084228515625, "learning_rate": 9.702836871217838e-06, "loss": 0.0005094259977340698, "memory(GiB)": 39.01, "reward": 0.24943691492080688, "reward_std": 0.06385398656129837, "rewards/VisualizationJSONCombinedORM/mean": 0.24943691492080688, "rewards/VisualizationJSONCombinedORM/std": 0.0772002562880516, "step": 241, "train_speed(iter/s)": 0.028103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/mean_length": 238.0625, "completions/min_length": 215.0, "epoch": 0.2001654259718776, "grad_norm": 0.2479345202445984, "kl": 0.0421142578125, "learning_rate": 9.697914206093967e-06, "loss": 0.00042127445340156555, "memory(GiB)": 39.01, "reward": 0.584599494934082, "reward_std": 0.11924503743648529, "rewards/VisualizationJSONCombinedORM/mean": 0.584599494934082, "rewards/VisualizationJSONCombinedORM/std": 0.1174686998128891, "step": 242, "train_speed(iter/s)": 0.028157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 252.1875, "completions/min_length": 224.0, "epoch": 0.20099255583126552, "grad_norm": 0.18966487050056458, "kl": 0.04449462890625, "learning_rate": 9.69295237158676e-06, "loss": 0.00044514238834381104, "memory(GiB)": 39.01, "reward": 0.571473240852356, "reward_std": 0.15233373641967773, "rewards/VisualizationJSONCombinedORM/mean": 0.571473240852356, "rewards/VisualizationJSONCombinedORM/std": 0.1490766853094101, "step": 243, "train_speed(iter/s)": 0.02819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 281.75, "completions/min_length": 242.0, "epoch": 0.20181968569065342, "grad_norm": 0.20750166475772858, "kl": 0.0489501953125, "learning_rate": 9.687951409066061e-06, "loss": 0.0004889722913503647, "memory(GiB)": 39.01, "reward": 0.4284293055534363, "reward_std": 0.0744432806968689, "rewards/VisualizationJSONCombinedORM/mean": 0.4284293055534363, "rewards/VisualizationJSONCombinedORM/std": 0.306714802980423, "step": 244, "train_speed(iter/s)": 0.028217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 271.4375, "completions/min_length": 239.0, "epoch": 0.20264681555004135, "grad_norm": 0.17114312946796417, "kl": 0.037139892578125, "learning_rate": 9.682911360227958e-06, "loss": 0.0003712363541126251, "memory(GiB)": 39.01, "reward": 0.5230479836463928, "reward_std": 0.11521878838539124, "rewards/VisualizationJSONCombinedORM/mean": 0.5230479836463928, "rewards/VisualizationJSONCombinedORM/std": 0.11236890405416489, "step": 245, "train_speed(iter/s)": 0.028236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 243.8125, "completions/min_length": 215.0, "epoch": 0.20347394540942929, "grad_norm": 0.17181722819805145, "kl": 0.0386962890625, "learning_rate": 9.677832267094416e-06, "loss": 0.000387042760848999, "memory(GiB)": 39.01, "reward": 0.6298810839653015, "reward_std": 0.11615307629108429, "rewards/VisualizationJSONCombinedORM/mean": 0.6298810839653015, "rewards/VisualizationJSONCombinedORM/std": 0.12745021283626556, "step": 246, "train_speed(iter/s)": 0.028288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 270.1875, "completions/min_length": 215.0, "epoch": 0.20430107526881722, "grad_norm": 0.1818174123764038, "kl": 0.0623779296875, "learning_rate": 9.672714172012942e-06, "loss": 0.0006225407123565674, "memory(GiB)": 39.01, "reward": 0.4218238890171051, "reward_std": 0.08424603939056396, "rewards/VisualizationJSONCombinedORM/mean": 0.4218238890171051, "rewards/VisualizationJSONCombinedORM/std": 0.21885691583156586, "step": 247, "train_speed(iter/s)": 0.028326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 255.9375, "completions/min_length": 228.0, "epoch": 0.20512820512820512, "grad_norm": 0.1997831016778946, "kl": 0.05078125, "learning_rate": 9.667557117656225e-06, "loss": 0.0005077719688415527, "memory(GiB)": 39.01, "reward": 0.675026535987854, "reward_std": 0.12850192189216614, "rewards/VisualizationJSONCombinedORM/mean": 0.675026535987854, "rewards/VisualizationJSONCombinedORM/std": 0.12441008538007736, "step": 248, "train_speed(iter/s)": 0.028377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 257.9375, "completions/min_length": 220.0, "epoch": 0.20595533498759305, "grad_norm": 0.21039944887161255, "kl": 0.04901123046875, "learning_rate": 9.66236114702178e-06, "loss": 0.0004900842905044556, "memory(GiB)": 39.01, "reward": 0.5059512853622437, "reward_std": 0.12080627679824829, "rewards/VisualizationJSONCombinedORM/mean": 0.5059512853622437, "rewards/VisualizationJSONCombinedORM/std": 0.19209271669387817, "step": 249, "train_speed(iter/s)": 0.028426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/mean_length": 251.375, "completions/min_length": 207.0, "epoch": 0.20678246484698098, "grad_norm": 0.17650045454502106, "kl": 0.04278564453125, "learning_rate": 9.65712630343159e-06, "loss": 0.0004282444715499878, "memory(GiB)": 39.01, "reward": 0.6443052291870117, "reward_std": 0.10100919008255005, "rewards/VisualizationJSONCombinedORM/mean": 0.6443052291870117, "rewards/VisualizationJSONCombinedORM/std": 0.15038566291332245, "step": 250, "train_speed(iter/s)": 0.028469 }, { "epoch": 0.20678246484698098, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 293.1666666666667, "eval_completions/mean_length": 256.6145833333333, "eval_completions/min_length": 225.04166666666666, "eval_kl": 0.04986572265625, "eval_loss": 0.0004983656108379364, "eval_reward": 0.4566580690443516, "eval_reward_std": 0.09950272234467168, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4566580690443516, "eval_rewards/VisualizationJSONCombinedORM/std": 0.09950272537147005, "eval_runtime": 266.8878, "eval_samples_per_second": 0.09, "eval_steps_per_second": 0.011, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 259.3125, "completions/min_length": 235.0, "epoch": 0.2076095947063689, "grad_norm": 0.14125047624111176, "kl": 0.05645751953125, "learning_rate": 9.651852630531748e-06, "loss": 0.0005640871822834015, "memory(GiB)": 39.01, "reward": 0.48353812098503113, "reward_std": 0.07693643867969513, "rewards/VisualizationJSONCombinedORM/mean": 0.48353812098503113, "rewards/VisualizationJSONCombinedORM/std": 0.1973668932914734, "step": 251, "train_speed(iter/s)": 0.027659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 244.0, "completions/min_length": 214.0, "epoch": 0.20843672456575682, "grad_norm": 0.18823425471782684, "kl": 0.04376220703125, "learning_rate": 9.64654017229209e-06, "loss": 0.0004370361566543579, "memory(GiB)": 39.01, "reward": 0.37981414794921875, "reward_std": 0.04074893519282341, "rewards/VisualizationJSONCombinedORM/mean": 0.37981414794921875, "rewards/VisualizationJSONCombinedORM/std": 0.04454696550965309, "step": 252, "train_speed(iter/s)": 0.027683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 257.3125, "completions/min_length": 211.0, "epoch": 0.20926385442514475, "grad_norm": 0.19335675239562988, "kl": 0.04327392578125, "learning_rate": 9.641188973005826e-06, "loss": 0.00043332576751708984, "memory(GiB)": 39.01, "reward": 0.6194522380828857, "reward_std": 0.1178097352385521, "rewards/VisualizationJSONCombinedORM/mean": 0.6194522380828857, "rewards/VisualizationJSONCombinedORM/std": 0.16098803281784058, "step": 253, "train_speed(iter/s)": 0.027721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 271.75, "completions/min_length": 222.0, "epoch": 0.21009098428453268, "grad_norm": 0.1940339356660843, "kl": 0.05322265625, "learning_rate": 9.63579907728918e-06, "loss": 0.0005326941609382629, "memory(GiB)": 39.01, "reward": 0.4177371561527252, "reward_std": 0.0840124860405922, "rewards/VisualizationJSONCombinedORM/mean": 0.4177371561527252, "rewards/VisualizationJSONCombinedORM/std": 0.17124706506729126, "step": 254, "train_speed(iter/s)": 0.027756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 265.75, "completions/min_length": 218.0, "epoch": 0.2109181141439206, "grad_norm": 0.2038147896528244, "kl": 0.04644775390625, "learning_rate": 9.630370530081007e-06, "loss": 0.00046468526124954224, "memory(GiB)": 39.01, "reward": 0.5838002562522888, "reward_std": 0.10885505378246307, "rewards/VisualizationJSONCombinedORM/mean": 0.5838002562522888, "rewards/VisualizationJSONCombinedORM/std": 0.17310117185115814, "step": 255, "train_speed(iter/s)": 0.027782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 264.0, "completions/min_length": 228.0, "epoch": 0.21174524400330852, "grad_norm": 0.1999177485704422, "kl": 0.0830078125, "learning_rate": 9.624903376642426e-06, "loss": 0.0008301511406898499, "memory(GiB)": 39.01, "reward": 0.45934054255485535, "reward_std": 0.1260988414287567, "rewards/VisualizationJSONCombinedORM/mean": 0.45934054255485535, "rewards/VisualizationJSONCombinedORM/std": 0.12205896526575089, "step": 256, "train_speed(iter/s)": 0.0278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 252.5, "completions/min_length": 212.0, "epoch": 0.21257237386269645, "grad_norm": 0.21153922379016876, "kl": 0.0645751953125, "learning_rate": 9.619397662556434e-06, "loss": 0.0006465762853622437, "memory(GiB)": 39.01, "reward": 0.6746277809143066, "reward_std": 0.09005606174468994, "rewards/VisualizationJSONCombinedORM/mean": 0.6746277809143066, "rewards/VisualizationJSONCombinedORM/std": 0.09952393919229507, "step": 257, "train_speed(iter/s)": 0.027838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 270.1875, "completions/min_length": 232.0, "epoch": 0.21339950372208435, "grad_norm": 0.20370206236839294, "kl": 0.076416015625, "learning_rate": 9.61385343372754e-06, "loss": 0.00076240673661232, "memory(GiB)": 39.01, "reward": 0.5214205980300903, "reward_std": 0.08629085123538971, "rewards/VisualizationJSONCombinedORM/mean": 0.5214205980300903, "rewards/VisualizationJSONCombinedORM/std": 0.1066875085234642, "step": 258, "train_speed(iter/s)": 0.027864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 258.3125, "completions/min_length": 212.0, "epoch": 0.2142266335814723, "grad_norm": 0.22494752705097198, "kl": 0.05340576171875, "learning_rate": 9.608270736381368e-06, "loss": 0.0005344972014427185, "memory(GiB)": 39.01, "reward": 0.436858594417572, "reward_std": 0.10591332614421844, "rewards/VisualizationJSONCombinedORM/mean": 0.436858594417572, "rewards/VisualizationJSONCombinedORM/std": 0.16992634534835815, "step": 259, "train_speed(iter/s)": 0.027901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 259.375, "completions/min_length": 213.0, "epoch": 0.21505376344086022, "grad_norm": 0.18311962485313416, "kl": 0.06048583984375, "learning_rate": 9.602649617064279e-06, "loss": 0.0006054937839508057, "memory(GiB)": 39.01, "reward": 0.44641613960266113, "reward_std": 0.09300079196691513, "rewards/VisualizationJSONCombinedORM/mean": 0.44641613960266113, "rewards/VisualizationJSONCombinedORM/std": 0.2162383496761322, "step": 260, "train_speed(iter/s)": 0.027932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 241.75, "completions/min_length": 219.0, "epoch": 0.21588089330024815, "grad_norm": 0.1804453730583191, "kl": 0.05523681640625, "learning_rate": 9.596990122642984e-06, "loss": 0.000551469624042511, "memory(GiB)": 39.01, "reward": 0.20702117681503296, "reward_std": 0.014497442170977592, "rewards/VisualizationJSONCombinedORM/mean": 0.20702117681503296, "rewards/VisualizationJSONCombinedORM/std": 0.014481959864497185, "step": 261, "train_speed(iter/s)": 0.027963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 263.9375, "completions/min_length": 225.0, "epoch": 0.21670802315963605, "grad_norm": 0.21917474269866943, "kl": 0.0687255859375, "learning_rate": 9.591292300304145e-06, "loss": 0.0006881467998027802, "memory(GiB)": 39.01, "reward": 0.3275268077850342, "reward_std": 0.06431729346513748, "rewards/VisualizationJSONCombinedORM/mean": 0.3275268077850342, "rewards/VisualizationJSONCombinedORM/std": 0.09772391617298126, "step": 262, "train_speed(iter/s)": 0.027992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/mean_length": 247.625, "completions/min_length": 218.0, "epoch": 0.21753515301902399, "grad_norm": 0.17116519808769226, "kl": 0.08148193359375, "learning_rate": 9.585556197553994e-06, "loss": 0.0008146390318870544, "memory(GiB)": 39.01, "reward": 0.4144788384437561, "reward_std": 0.06926068663597107, "rewards/VisualizationJSONCombinedORM/mean": 0.4144788384437561, "rewards/VisualizationJSONCombinedORM/std": 0.09113219380378723, "step": 263, "train_speed(iter/s)": 0.028027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 255.125, "completions/min_length": 218.0, "epoch": 0.21836228287841192, "grad_norm": 0.1949836015701294, "kl": 0.0531005859375, "learning_rate": 9.579781862217926e-06, "loss": 0.0005300603806972504, "memory(GiB)": 39.01, "reward": 0.6712462306022644, "reward_std": 0.10630404204130173, "rewards/VisualizationJSONCombinedORM/mean": 0.6712462306022644, "rewards/VisualizationJSONCombinedORM/std": 0.1119503602385521, "step": 264, "train_speed(iter/s)": 0.028051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 273.875, "completions/min_length": 222.0, "epoch": 0.21918941273779982, "grad_norm": 0.19952596724033356, "kl": 0.0667724609375, "learning_rate": 9.573969342440107e-06, "loss": 0.0006674006581306458, "memory(GiB)": 39.01, "reward": 0.4834970235824585, "reward_std": 0.09422646462917328, "rewards/VisualizationJSONCombinedORM/mean": 0.4834970235824585, "rewards/VisualizationJSONCombinedORM/std": 0.1930904984474182, "step": 265, "train_speed(iter/s)": 0.028065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 253.9375, "completions/min_length": 225.0, "epoch": 0.22001654259718775, "grad_norm": 0.21238061785697937, "kl": 0.0848388671875, "learning_rate": 9.568118686683063e-06, "loss": 0.000846467912197113, "memory(GiB)": 39.01, "reward": 0.47771909832954407, "reward_std": 0.1266951709985733, "rewards/VisualizationJSONCombinedORM/mean": 0.47771909832954407, "rewards/VisualizationJSONCombinedORM/std": 0.12379838526248932, "step": 266, "train_speed(iter/s)": 0.028095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 263.375, "completions/min_length": 235.0, "epoch": 0.22084367245657568, "grad_norm": 0.17017877101898193, "kl": 0.05035400390625, "learning_rate": 9.562229943727295e-06, "loss": 0.0005031153559684753, "memory(GiB)": 39.01, "reward": 0.1927976757287979, "reward_std": 0.03665110468864441, "rewards/VisualizationJSONCombinedORM/mean": 0.1927976757287979, "rewards/VisualizationJSONCombinedORM/std": 0.0474601611495018, "step": 267, "train_speed(iter/s)": 0.028131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 260.5, "completions/min_length": 212.0, "epoch": 0.22167080231596362, "grad_norm": 0.18453817069530487, "kl": 0.05609130859375, "learning_rate": 9.556303162670853e-06, "loss": 0.0005607157945632935, "memory(GiB)": 39.01, "reward": 0.6026835441589355, "reward_std": 0.1551249474287033, "rewards/VisualizationJSONCombinedORM/mean": 0.6026835441589355, "rewards/VisualizationJSONCombinedORM/std": 0.15609301626682281, "step": 268, "train_speed(iter/s)": 0.02817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 259.25, "completions/min_length": 211.0, "epoch": 0.22249793217535152, "grad_norm": 0.17216075956821442, "kl": 0.055908203125, "learning_rate": 9.550338392928931e-06, "loss": 0.0005590617656707764, "memory(GiB)": 39.01, "reward": 0.32331013679504395, "reward_std": 0.06630705296993256, "rewards/VisualizationJSONCombinedORM/mean": 0.32331013679504395, "rewards/VisualizationJSONCombinedORM/std": 0.07226663827896118, "step": 269, "train_speed(iter/s)": 0.028204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 257.9375, "completions/min_length": 216.0, "epoch": 0.22332506203473945, "grad_norm": 0.22227539122104645, "kl": 0.0692138671875, "learning_rate": 9.544335684233464e-06, "loss": 0.0006921850144863129, "memory(GiB)": 39.01, "reward": 0.44140398502349854, "reward_std": 0.11086130142211914, "rewards/VisualizationJSONCombinedORM/mean": 0.44140398502349854, "rewards/VisualizationJSONCombinedORM/std": 0.16981033980846405, "step": 270, "train_speed(iter/s)": 0.028241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/mean_length": 248.875, "completions/min_length": 225.0, "epoch": 0.22415219189412738, "grad_norm": 0.1774255633354187, "kl": 0.06201171875, "learning_rate": 9.538295086632703e-06, "loss": 0.0006197243928909302, "memory(GiB)": 39.01, "reward": 0.5245518684387207, "reward_std": 0.12925413250923157, "rewards/VisualizationJSONCombinedORM/mean": 0.5245518684387207, "rewards/VisualizationJSONCombinedORM/std": 0.1537046879529953, "step": 271, "train_speed(iter/s)": 0.028282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 246.6875, "completions/min_length": 212.0, "epoch": 0.22497932175351532, "grad_norm": 0.2062574326992035, "kl": 0.06854248046875, "learning_rate": 9.532216650490806e-06, "loss": 0.0006847195327281952, "memory(GiB)": 39.01, "reward": 0.41120225191116333, "reward_std": 0.09303532540798187, "rewards/VisualizationJSONCombinedORM/mean": 0.41120225191116333, "rewards/VisualizationJSONCombinedORM/std": 0.18703117966651917, "step": 272, "train_speed(iter/s)": 0.028321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 242.5625, "completions/min_length": 188.0, "epoch": 0.22580645161290322, "grad_norm": 0.21167954802513123, "kl": 0.06463623046875, "learning_rate": 9.52610042648741e-06, "loss": 0.000646006315946579, "memory(GiB)": 39.01, "reward": 0.6814398765563965, "reward_std": 0.11804996430873871, "rewards/VisualizationJSONCombinedORM/mean": 0.6814398765563965, "rewards/VisualizationJSONCombinedORM/std": 0.1145298182964325, "step": 273, "train_speed(iter/s)": 0.028358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 263.375, "completions/min_length": 234.0, "epoch": 0.22663358147229115, "grad_norm": 0.21102246642112732, "kl": 0.0609130859375, "learning_rate": 9.519946465617217e-06, "loss": 0.0006086267530918121, "memory(GiB)": 39.01, "reward": 0.5990126132965088, "reward_std": 0.1348898708820343, "rewards/VisualizationJSONCombinedORM/mean": 0.5990126132965088, "rewards/VisualizationJSONCombinedORM/std": 0.13771115243434906, "step": 274, "train_speed(iter/s)": 0.028394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 263.6875, "completions/min_length": 208.0, "epoch": 0.22746071133167908, "grad_norm": 0.20804066956043243, "kl": 0.07666015625, "learning_rate": 9.513754819189561e-06, "loss": 0.000766851007938385, "memory(GiB)": 39.01, "reward": 0.5600904822349548, "reward_std": 0.09297396242618561, "rewards/VisualizationJSONCombinedORM/mean": 0.5600904822349548, "rewards/VisualizationJSONCombinedORM/std": 0.12074784934520721, "step": 275, "train_speed(iter/s)": 0.028414 }, { "epoch": 0.22746071133167908, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 292.6666666666667, "eval_completions/mean_length": 254.08333333333334, "eval_completions/min_length": 223.0, "eval_kl": 0.07184855143229167, "eval_loss": 0.0007203829591162503, "eval_reward": 0.47565149577955407, "eval_reward_std": 0.1023714766682436, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47565149577955407, "eval_rewards/VisualizationJSONCombinedORM/std": 0.10237147922938068, "eval_runtime": 266.1798, "eval_samples_per_second": 0.09, "eval_steps_per_second": 0.011, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 250.6875, "completions/min_length": 211.0, "epoch": 0.228287841191067, "grad_norm": 0.197395458817482, "kl": 0.0811767578125, "learning_rate": 9.507525538827982e-06, "loss": 0.0008109509944915771, "memory(GiB)": 39.01, "reward": 0.44740793108940125, "reward_std": 0.08189627528190613, "rewards/VisualizationJSONCombinedORM/mean": 0.44740793108940125, "rewards/VisualizationJSONCombinedORM/std": 0.11469361186027527, "step": 276, "train_speed(iter/s)": 0.027702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 245.25, "completions/min_length": 199.0, "epoch": 0.22911497105045492, "grad_norm": 0.1975592076778412, "kl": 0.06158447265625, "learning_rate": 9.5012586764698e-06, "loss": 0.0006153061985969543, "memory(GiB)": 39.01, "reward": 0.3994566798210144, "reward_std": 0.141847625374794, "rewards/VisualizationJSONCombinedORM/mean": 0.3994566798210144, "rewards/VisualizationJSONCombinedORM/std": 0.22323840856552124, "step": 277, "train_speed(iter/s)": 0.027738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 249.3125, "completions/min_length": 223.0, "epoch": 0.22994210090984285, "grad_norm": 0.17533791065216064, "kl": 0.072509765625, "learning_rate": 9.494954284365678e-06, "loss": 0.0007254444062709808, "memory(GiB)": 39.01, "reward": 0.5128206014633179, "reward_std": 0.16398558020591736, "rewards/VisualizationJSONCombinedORM/mean": 0.5128206014633179, "rewards/VisualizationJSONCombinedORM/std": 0.18432417511940002, "step": 278, "train_speed(iter/s)": 0.027759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/mean_length": 259.625, "completions/min_length": 231.0, "epoch": 0.23076923076923078, "grad_norm": 0.19907909631729126, "kl": 0.075439453125, "learning_rate": 9.48861241507919e-06, "loss": 0.0007535107433795929, "memory(GiB)": 39.01, "reward": 0.49469995498657227, "reward_std": 0.11308543384075165, "rewards/VisualizationJSONCombinedORM/mean": 0.49469995498657227, "rewards/VisualizationJSONCombinedORM/std": 0.1407565325498581, "step": 279, "train_speed(iter/s)": 0.027792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 279.125, "completions/min_length": 215.0, "epoch": 0.23159636062861869, "grad_norm": 0.19339624047279358, "kl": 0.053955078125, "learning_rate": 9.482233121486379e-06, "loss": 0.0005396716296672821, "memory(GiB)": 39.01, "reward": 0.5956220626831055, "reward_std": 0.1109207272529602, "rewards/VisualizationJSONCombinedORM/mean": 0.5956220626831055, "rewards/VisualizationJSONCombinedORM/std": 0.1321009248495102, "step": 280, "train_speed(iter/s)": 0.027822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/mean_length": 244.0625, "completions/min_length": 213.0, "epoch": 0.23242349048800662, "grad_norm": 0.21510450541973114, "kl": 0.0677490234375, "learning_rate": 9.475816456775313e-06, "loss": 0.0006793588399887085, "memory(GiB)": 39.01, "reward": 0.3694351315498352, "reward_std": 0.08190250396728516, "rewards/VisualizationJSONCombinedORM/mean": 0.3694351315498352, "rewards/VisualizationJSONCombinedORM/std": 0.14704686403274536, "step": 281, "train_speed(iter/s)": 0.027853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 258.9375, "completions/min_length": 223.0, "epoch": 0.23325062034739455, "grad_norm": 0.21244119107723236, "kl": 0.06597900390625, "learning_rate": 9.46936247444565e-06, "loss": 0.000659668818116188, "memory(GiB)": 39.01, "reward": 0.4119427800178528, "reward_std": 0.07938252389431, "rewards/VisualizationJSONCombinedORM/mean": 0.4119427800178528, "rewards/VisualizationJSONCombinedORM/std": 0.2215101271867752, "step": 282, "train_speed(iter/s)": 0.02789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 270.5, "completions/min_length": 204.0, "epoch": 0.23407775020678245, "grad_norm": 0.21663986146450043, "kl": 0.0634765625, "learning_rate": 9.462871228308188e-06, "loss": 0.0006344486027956009, "memory(GiB)": 39.01, "reward": 0.454415887594223, "reward_std": 0.11369909346103668, "rewards/VisualizationJSONCombinedORM/mean": 0.454415887594223, "rewards/VisualizationJSONCombinedORM/std": 0.24390070140361786, "step": 283, "train_speed(iter/s)": 0.027916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 267.0, "completions/min_length": 220.0, "epoch": 0.23490488006617039, "grad_norm": 0.19976364076137543, "kl": 0.07965087890625, "learning_rate": 9.456342772484415e-06, "loss": 0.0007954016327857971, "memory(GiB)": 39.01, "reward": 0.24576899409294128, "reward_std": 0.04615161195397377, "rewards/VisualizationJSONCombinedORM/mean": 0.24576899409294128, "rewards/VisualizationJSONCombinedORM/std": 0.09822645783424377, "step": 284, "train_speed(iter/s)": 0.027961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 262.75, "completions/min_length": 216.0, "epoch": 0.23573200992555832, "grad_norm": 0.19211073219776154, "kl": 0.070068359375, "learning_rate": 9.44977716140606e-06, "loss": 0.0006996616721153259, "memory(GiB)": 39.01, "reward": 0.4870198667049408, "reward_std": 0.07169009745121002, "rewards/VisualizationJSONCombinedORM/mean": 0.4870198667049408, "rewards/VisualizationJSONCombinedORM/std": 0.25518515706062317, "step": 285, "train_speed(iter/s)": 0.027991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 245.0625, "completions/min_length": 206.0, "epoch": 0.23655913978494625, "grad_norm": 0.19814912974834442, "kl": 0.0989990234375, "learning_rate": 9.443174449814634e-06, "loss": 0.0009898990392684937, "memory(GiB)": 39.01, "reward": 0.5012764930725098, "reward_std": 0.0540192574262619, "rewards/VisualizationJSONCombinedORM/mean": 0.5012764930725098, "rewards/VisualizationJSONCombinedORM/std": 0.17546792328357697, "step": 286, "train_speed(iter/s)": 0.028025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 248.875, "completions/min_length": 210.0, "epoch": 0.23738626964433415, "grad_norm": 0.19400812685489655, "kl": 0.0904541015625, "learning_rate": 9.436534692760985e-06, "loss": 0.0009047314524650574, "memory(GiB)": 39.01, "reward": 0.44066405296325684, "reward_std": 0.14504970610141754, "rewards/VisualizationJSONCombinedORM/mean": 0.44066405296325684, "rewards/VisualizationJSONCombinedORM/std": 0.1723245084285736, "step": 287, "train_speed(iter/s)": 0.028057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 253.8125, "completions/min_length": 207.0, "epoch": 0.23821339950372208, "grad_norm": 0.20844444632530212, "kl": 0.0853271484375, "learning_rate": 9.429857945604824e-06, "loss": 0.000854097306728363, "memory(GiB)": 39.01, "reward": 0.6647976636886597, "reward_std": 0.13774408400058746, "rewards/VisualizationJSONCombinedORM/mean": 0.6647976636886597, "rewards/VisualizationJSONCombinedORM/std": 0.13691800832748413, "step": 288, "train_speed(iter/s)": 0.028096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 258.125, "completions/min_length": 202.0, "epoch": 0.23904052936311002, "grad_norm": 0.214450404047966, "kl": 0.05462646484375, "learning_rate": 9.423144264014278e-06, "loss": 0.0005460865795612335, "memory(GiB)": 39.01, "reward": 0.48840904235839844, "reward_std": 0.07607436180114746, "rewards/VisualizationJSONCombinedORM/mean": 0.48840904235839844, "rewards/VisualizationJSONCombinedORM/std": 0.21545037627220154, "step": 289, "train_speed(iter/s)": 0.028125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 272.875, "completions/min_length": 228.0, "epoch": 0.23986765922249792, "grad_norm": 0.18522946536540985, "kl": 0.06634521484375, "learning_rate": 9.416393703965412e-06, "loss": 0.0006636679172515869, "memory(GiB)": 39.01, "reward": 0.5758665204048157, "reward_std": 0.10886353999376297, "rewards/VisualizationJSONCombinedORM/mean": 0.5758665204048157, "rewards/VisualizationJSONCombinedORM/std": 0.19000278413295746, "step": 290, "train_speed(iter/s)": 0.028138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 250.3125, "completions/min_length": 200.0, "epoch": 0.24069478908188585, "grad_norm": 0.20066671073436737, "kl": 0.0819091796875, "learning_rate": 9.409606321741776e-06, "loss": 0.0008191019296646118, "memory(GiB)": 39.01, "reward": 0.37006330490112305, "reward_std": 0.10644356161355972, "rewards/VisualizationJSONCombinedORM/mean": 0.37006330490112305, "rewards/VisualizationJSONCombinedORM/std": 0.21264569461345673, "step": 291, "train_speed(iter/s)": 0.028175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 259.375, "completions/min_length": 212.0, "epoch": 0.24152191894127378, "grad_norm": 0.1909230351448059, "kl": 0.07305908203125, "learning_rate": 9.402782173933925e-06, "loss": 0.0007316111586987972, "memory(GiB)": 39.01, "reward": 0.30044442415237427, "reward_std": 0.08625465631484985, "rewards/VisualizationJSONCombinedORM/mean": 0.30044442415237427, "rewards/VisualizationJSONCombinedORM/std": 0.1835813671350479, "step": 292, "train_speed(iter/s)": 0.028211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 254.125, "completions/min_length": 206.0, "epoch": 0.24234904880066171, "grad_norm": 0.19886484742164612, "kl": 0.05963134765625, "learning_rate": 9.39592131743895e-06, "loss": 0.0005965530872344971, "memory(GiB)": 39.01, "reward": 0.5871598124504089, "reward_std": 0.09867453575134277, "rewards/VisualizationJSONCombinedORM/mean": 0.5871598124504089, "rewards/VisualizationJSONCombinedORM/std": 0.11883474886417389, "step": 293, "train_speed(iter/s)": 0.028245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 257.625, "completions/min_length": 185.0, "epoch": 0.24317617866004962, "grad_norm": 0.20077304542064667, "kl": 0.0745849609375, "learning_rate": 9.389023809460008e-06, "loss": 0.0007450953125953674, "memory(GiB)": 39.01, "reward": 0.6816247701644897, "reward_std": 0.10078410059213638, "rewards/VisualizationJSONCombinedORM/mean": 0.6816247701644897, "rewards/VisualizationJSONCombinedORM/std": 0.11023916304111481, "step": 294, "train_speed(iter/s)": 0.028283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 269.375, "completions/min_length": 227.0, "epoch": 0.24400330851943755, "grad_norm": 0.20292922854423523, "kl": 0.09326171875, "learning_rate": 9.38208970750584e-06, "loss": 0.0009337589144706726, "memory(GiB)": 39.01, "reward": 0.5382654666900635, "reward_std": 0.1369444578886032, "rewards/VisualizationJSONCombinedORM/mean": 0.5382654666900635, "rewards/VisualizationJSONCombinedORM/std": 0.16479115188121796, "step": 295, "train_speed(iter/s)": 0.028313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 255.875, "completions/min_length": 192.0, "epoch": 0.24483043837882548, "grad_norm": 0.19816266000270844, "kl": 0.0753173828125, "learning_rate": 9.375119069390297e-06, "loss": 0.000753302127122879, "memory(GiB)": 39.01, "reward": 0.6076270937919617, "reward_std": 0.17891940474510193, "rewards/VisualizationJSONCombinedORM/mean": 0.6076270937919617, "rewards/VisualizationJSONCombinedORM/std": 0.18526485562324524, "step": 296, "train_speed(iter/s)": 0.028342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 275.1875, "completions/min_length": 226.0, "epoch": 0.2456575682382134, "grad_norm": 0.17929479479789734, "kl": 0.0711669921875, "learning_rate": 9.368111953231849e-06, "loss": 0.0007119439542293549, "memory(GiB)": 39.01, "reward": 0.44849932193756104, "reward_std": 0.07780615240335464, "rewards/VisualizationJSONCombinedORM/mean": 0.44849932193756104, "rewards/VisualizationJSONCombinedORM/std": 0.2026691734790802, "step": 297, "train_speed(iter/s)": 0.028369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/mean_length": 239.5, "completions/min_length": 204.0, "epoch": 0.24648469809760132, "grad_norm": 0.21244099736213684, "kl": 0.0775146484375, "learning_rate": 9.361068417453107e-06, "loss": 0.0007746554911136627, "memory(GiB)": 39.01, "reward": 0.536784291267395, "reward_std": 0.1401863694190979, "rewards/VisualizationJSONCombinedORM/mean": 0.536784291267395, "rewards/VisualizationJSONCombinedORM/std": 0.17901694774627686, "step": 298, "train_speed(iter/s)": 0.028405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 262.375, "completions/min_length": 224.0, "epoch": 0.24731182795698925, "grad_norm": 0.20793671905994415, "kl": 0.077392578125, "learning_rate": 9.353988520780336e-06, "loss": 0.000773061066865921, "memory(GiB)": 39.01, "reward": 0.3436957597732544, "reward_std": 0.04295782372355461, "rewards/VisualizationJSONCombinedORM/mean": 0.3436957597732544, "rewards/VisualizationJSONCombinedORM/std": 0.06709646433591843, "step": 299, "train_speed(iter/s)": 0.028429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 257.1875, "completions/min_length": 193.0, "epoch": 0.24813895781637718, "grad_norm": 0.20944778621196747, "kl": 0.1036376953125, "learning_rate": 9.346872322242965e-06, "loss": 0.0010363571345806122, "memory(GiB)": 39.01, "reward": 0.44902563095092773, "reward_std": 0.13473168015480042, "rewards/VisualizationJSONCombinedORM/mean": 0.44902563095092773, "rewards/VisualizationJSONCombinedORM/std": 0.15714344382286072, "step": 300, "train_speed(iter/s)": 0.028467 }, { "epoch": 0.24813895781637718, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 298.5, "eval_completions/mean_length": 255.16145833333334, "eval_completions/min_length": 220.83333333333334, "eval_kl": 0.07789103190104167, "eval_loss": 0.0007812089170329273, "eval_reward": 0.4509114194661379, "eval_reward_std": 0.10425717833762367, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4509114194661379, "eval_rewards/VisualizationJSONCombinedORM/std": 0.10425718229574461, "eval_runtime": 269.6142, "eval_samples_per_second": 0.089, "eval_steps_per_second": 0.011, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/mean_length": 249.0625, "completions/min_length": 220.0, "epoch": 0.24896608767576509, "grad_norm": 0.18159785866737366, "kl": 0.071533203125, "learning_rate": 9.339719881173093e-06, "loss": 0.0007147639989852905, "memory(GiB)": 39.01, "reward": 0.4519127607345581, "reward_std": 0.10412117093801498, "rewards/VisualizationJSONCombinedORM/mean": 0.4519127607345581, "rewards/VisualizationJSONCombinedORM/std": 0.1494109332561493, "step": 301, "train_speed(iter/s)": 0.027785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 260.125, "completions/min_length": 211.0, "epoch": 0.24979321753515302, "grad_norm": 0.19881699979305267, "kl": 0.0677490234375, "learning_rate": 9.332531257204992e-06, "loss": 0.0006759427487850189, "memory(GiB)": 39.01, "reward": 0.5475409626960754, "reward_std": 0.09388669580221176, "rewards/VisualizationJSONCombinedORM/mean": 0.5475409626960754, "rewards/VisualizationJSONCombinedORM/std": 0.11745606362819672, "step": 302, "train_speed(iter/s)": 0.027818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/mean_length": 254.75, "completions/min_length": 201.0, "epoch": 0.2506203473945409, "grad_norm": 0.21688811480998993, "kl": 0.0670166015625, "learning_rate": 9.325306510274616e-06, "loss": 0.0006699133664369583, "memory(GiB)": 39.01, "reward": 0.5528820753097534, "reward_std": 0.11437063664197922, "rewards/VisualizationJSONCombinedORM/mean": 0.5528820753097534, "rewards/VisualizationJSONCombinedORM/std": 0.17852887511253357, "step": 303, "train_speed(iter/s)": 0.027853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 270.5625, "completions/min_length": 226.0, "epoch": 0.25144747725392885, "grad_norm": 0.21443979442119598, "kl": 0.0723876953125, "learning_rate": 9.3180457006191e-06, "loss": 0.0007226094603538513, "memory(GiB)": 39.01, "reward": 0.4971635043621063, "reward_std": 0.10623489320278168, "rewards/VisualizationJSONCombinedORM/mean": 0.4971635043621063, "rewards/VisualizationJSONCombinedORM/std": 0.2544105350971222, "step": 304, "train_speed(iter/s)": 0.027892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 271.3125, "completions/min_length": 207.0, "epoch": 0.2522746071133168, "grad_norm": 0.1938425451517105, "kl": 0.072998046875, "learning_rate": 9.310748888776254e-06, "loss": 0.0007292609661817551, "memory(GiB)": 39.01, "reward": 0.47867655754089355, "reward_std": 0.07421805709600449, "rewards/VisualizationJSONCombinedORM/mean": 0.47867655754089355, "rewards/VisualizationJSONCombinedORM/std": 0.17622962594032288, "step": 305, "train_speed(iter/s)": 0.027916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 267.3125, "completions/min_length": 226.0, "epoch": 0.2531017369727047, "grad_norm": 0.18057003617286682, "kl": 0.05462646484375, "learning_rate": 9.303416135584058e-06, "loss": 0.0005470886826515198, "memory(GiB)": 39.01, "reward": 0.6079132556915283, "reward_std": 0.12142674624919891, "rewards/VisualizationJSONCombinedORM/mean": 0.6079132556915283, "rewards/VisualizationJSONCombinedORM/std": 0.21078579127788544, "step": 306, "train_speed(iter/s)": 0.027942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 255.0, "completions/min_length": 217.0, "epoch": 0.25392886683209265, "grad_norm": 0.26665428280830383, "kl": 0.0478515625, "learning_rate": 9.296047502180158e-06, "loss": 0.00047800689935684204, "memory(GiB)": 39.01, "reward": 0.690406322479248, "reward_std": 0.1331930160522461, "rewards/VisualizationJSONCombinedORM/mean": 0.690406322479248, "rewards/VisualizationJSONCombinedORM/std": 0.13753746449947357, "step": 307, "train_speed(iter/s)": 0.027973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 255.875, "completions/min_length": 210.0, "epoch": 0.2547559966914806, "grad_norm": 0.2281714528799057, "kl": 0.07049560546875, "learning_rate": 9.288643050001362e-06, "loss": 0.000705283135175705, "memory(GiB)": 39.01, "reward": 0.6070390939712524, "reward_std": 0.1292143315076828, "rewards/VisualizationJSONCombinedORM/mean": 0.6070390939712524, "rewards/VisualizationJSONCombinedORM/std": 0.12796078622341156, "step": 308, "train_speed(iter/s)": 0.027996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 262.5625, "completions/min_length": 202.0, "epoch": 0.2555831265508685, "grad_norm": 0.19372142851352692, "kl": 0.0654296875, "learning_rate": 9.281202840783109e-06, "loss": 0.0006543435156345367, "memory(GiB)": 39.01, "reward": 0.41665008664131165, "reward_std": 0.09084977209568024, "rewards/VisualizationJSONCombinedORM/mean": 0.41665008664131165, "rewards/VisualizationJSONCombinedORM/std": 0.09415699541568756, "step": 309, "train_speed(iter/s)": 0.028022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 248.75, "completions/min_length": 214.0, "epoch": 0.2564102564102564, "grad_norm": 0.20246867835521698, "kl": 0.05621337890625, "learning_rate": 9.273726936558975e-06, "loss": 0.0005630031228065491, "memory(GiB)": 39.01, "reward": 0.3794425427913666, "reward_std": 0.1521798074245453, "rewards/VisualizationJSONCombinedORM/mean": 0.3794425427913666, "rewards/VisualizationJSONCombinedORM/std": 0.22018063068389893, "step": 310, "train_speed(iter/s)": 0.028036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 270.8125, "completions/min_length": 247.0, "epoch": 0.2572373862696443, "grad_norm": 0.20105503499507904, "kl": 0.07244873046875, "learning_rate": 9.266215399660145e-06, "loss": 0.0007229484617710114, "memory(GiB)": 39.01, "reward": 0.5605393052101135, "reward_std": 0.11414918303489685, "rewards/VisualizationJSONCombinedORM/mean": 0.5605393052101135, "rewards/VisualizationJSONCombinedORM/std": 0.12687091529369354, "step": 311, "train_speed(iter/s)": 0.028078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 246.375, "completions/min_length": 204.0, "epoch": 0.25806451612903225, "grad_norm": 0.12384264916181564, "kl": 0.032318115234375, "learning_rate": 9.258668292714896e-06, "loss": 0.0003232806921005249, "memory(GiB)": 39.01, "reward": 0.6427310705184937, "reward_std": 0.12436392158269882, "rewards/VisualizationJSONCombinedORM/mean": 0.6427310705184937, "rewards/VisualizationJSONCombinedORM/std": 0.14802394807338715, "step": 312, "train_speed(iter/s)": 0.028121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 259.625, "completions/min_length": 211.0, "epoch": 0.2588916459884202, "grad_norm": 0.17591285705566406, "kl": 0.0401611328125, "learning_rate": 9.251085678648072e-06, "loss": 0.00040189921855926514, "memory(GiB)": 39.01, "reward": 0.5979295969009399, "reward_std": 0.09609762579202652, "rewards/VisualizationJSONCombinedORM/mean": 0.5979295969009399, "rewards/VisualizationJSONCombinedORM/std": 0.2499982863664627, "step": 313, "train_speed(iter/s)": 0.028144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 278.5625, "completions/min_length": 247.0, "epoch": 0.2597187758478081, "grad_norm": 0.191242054104805, "kl": 0.0472412109375, "learning_rate": 9.243467620680563e-06, "loss": 0.0004720166325569153, "memory(GiB)": 39.01, "reward": 0.39715805649757385, "reward_std": 0.07923746854066849, "rewards/VisualizationJSONCombinedORM/mean": 0.39715805649757385, "rewards/VisualizationJSONCombinedORM/std": 0.11302240192890167, "step": 314, "train_speed(iter/s)": 0.028183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 267.125, "completions/min_length": 223.0, "epoch": 0.26054590570719605, "grad_norm": 0.2029944509267807, "kl": 0.03753662109375, "learning_rate": 9.235814182328777e-06, "loss": 0.00037519633769989014, "memory(GiB)": 39.01, "reward": 0.29595717787742615, "reward_std": 0.04326695576310158, "rewards/VisualizationJSONCombinedORM/mean": 0.29595717787742615, "rewards/VisualizationJSONCombinedORM/std": 0.08748629689216614, "step": 315, "train_speed(iter/s)": 0.028213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 261.0625, "completions/min_length": 208.0, "epoch": 0.261373035566584, "grad_norm": 0.2047521471977234, "kl": 0.05841064453125, "learning_rate": 9.22812542740411e-06, "loss": 0.0005836915224790573, "memory(GiB)": 39.01, "reward": 0.34828853607177734, "reward_std": 0.05181001126766205, "rewards/VisualizationJSONCombinedORM/mean": 0.34828853607177734, "rewards/VisualizationJSONCombinedORM/std": 0.06518935412168503, "step": 316, "train_speed(iter/s)": 0.028236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/mean_length": 258.375, "completions/min_length": 226.0, "epoch": 0.26220016542597185, "grad_norm": 0.18973858654499054, "kl": 0.0491943359375, "learning_rate": 9.220401420012412e-06, "loss": 0.000492338091135025, "memory(GiB)": 39.01, "reward": 0.6598153114318848, "reward_std": 0.09825620800256729, "rewards/VisualizationJSONCombinedORM/mean": 0.6598153114318848, "rewards/VisualizationJSONCombinedORM/std": 0.11800764501094818, "step": 317, "train_speed(iter/s)": 0.028259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 269.625, "completions/min_length": 210.0, "epoch": 0.2630272952853598, "grad_norm": 0.2280919849872589, "kl": 0.04931640625, "learning_rate": 9.212642224553456e-06, "loss": 0.000493176281452179, "memory(GiB)": 39.01, "reward": 0.5686478018760681, "reward_std": 0.10609684139490128, "rewards/VisualizationJSONCombinedORM/mean": 0.5686478018760681, "rewards/VisualizationJSONCombinedORM/std": 0.13661885261535645, "step": 318, "train_speed(iter/s)": 0.02829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 257.125, "completions/min_length": 221.0, "epoch": 0.2638544251447477, "grad_norm": 0.19957850873470306, "kl": 0.04266357421875, "learning_rate": 9.204847905720398e-06, "loss": 0.0004266519099473953, "memory(GiB)": 39.01, "reward": 0.27114397287368774, "reward_std": 0.06479557603597641, "rewards/VisualizationJSONCombinedORM/mean": 0.27114397287368774, "rewards/VisualizationJSONCombinedORM/std": 0.10217147320508957, "step": 319, "train_speed(iter/s)": 0.028314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 252.0, "completions/min_length": 202.0, "epoch": 0.26468155500413565, "grad_norm": 0.19280105829238892, "kl": 0.05487060546875, "learning_rate": 9.197018528499243e-06, "loss": 0.0005486682057380676, "memory(GiB)": 39.01, "reward": 0.4711807370185852, "reward_std": 0.12223120778799057, "rewards/VisualizationJSONCombinedORM/mean": 0.4711807370185852, "rewards/VisualizationJSONCombinedORM/std": 0.20811082422733307, "step": 320, "train_speed(iter/s)": 0.028354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 272.4375, "completions/min_length": 215.0, "epoch": 0.2655086848635236, "grad_norm": 0.19395217299461365, "kl": 0.037109375, "learning_rate": 9.189154158168293e-06, "loss": 0.00037073343992233276, "memory(GiB)": 39.01, "reward": 0.4356372356414795, "reward_std": 0.08887164294719696, "rewards/VisualizationJSONCombinedORM/mean": 0.4356372356414795, "rewards/VisualizationJSONCombinedORM/std": 0.12252631038427353, "step": 321, "train_speed(iter/s)": 0.028381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 249.625, "completions/min_length": 205.0, "epoch": 0.2663358147229115, "grad_norm": 0.2121812403202057, "kl": 0.0423583984375, "learning_rate": 9.181254860297612e-06, "loss": 0.00042297691106796265, "memory(GiB)": 39.01, "reward": 0.4545990824699402, "reward_std": 0.14115384221076965, "rewards/VisualizationJSONCombinedORM/mean": 0.4545990824699402, "rewards/VisualizationJSONCombinedORM/std": 0.15334486961364746, "step": 322, "train_speed(iter/s)": 0.028418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/mean_length": 242.125, "completions/min_length": 199.0, "epoch": 0.26716294458229944, "grad_norm": 0.14957404136657715, "kl": 0.03271484375, "learning_rate": 9.17332070074848e-06, "loss": 0.0003271549940109253, "memory(GiB)": 39.01, "reward": 0.6628742218017578, "reward_std": 0.10018213093280792, "rewards/VisualizationJSONCombinedORM/mean": 0.6628742218017578, "rewards/VisualizationJSONCombinedORM/std": 0.10873223096132278, "step": 323, "train_speed(iter/s)": 0.028453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 261.0, "completions/min_length": 210.0, "epoch": 0.2679900744416873, "grad_norm": 0.16937746107578278, "kl": 0.05902099609375, "learning_rate": 9.165351745672834e-06, "loss": 0.0005887597799301147, "memory(GiB)": 39.01, "reward": 0.6882548332214355, "reward_std": 0.10701294243335724, "rewards/VisualizationJSONCombinedORM/mean": 0.6882548332214355, "rewards/VisualizationJSONCombinedORM/std": 0.11581023782491684, "step": 324, "train_speed(iter/s)": 0.028481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 248.8125, "completions/min_length": 206.0, "epoch": 0.26881720430107525, "grad_norm": 0.20181238651275635, "kl": 0.05517578125, "learning_rate": 9.157348061512728e-06, "loss": 0.000552961602807045, "memory(GiB)": 39.01, "reward": 0.685973048210144, "reward_std": 0.07039383053779602, "rewards/VisualizationJSONCombinedORM/mean": 0.685973048210144, "rewards/VisualizationJSONCombinedORM/std": 0.078691765666008, "step": 325, "train_speed(iter/s)": 0.028504 }, { "epoch": 0.26881720430107525, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 299.75, "eval_completions/mean_length": 258.1614583333333, "eval_completions/min_length": 222.75, "eval_kl": 0.044499715169270836, "eval_loss": 0.00044517716742120683, "eval_reward": 0.4317349524547656, "eval_reward_std": 0.0871856853676339, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4317349524547656, "eval_rewards/VisualizationJSONCombinedORM/std": 0.08718568730788927, "eval_runtime": 270.9693, "eval_samples_per_second": 0.089, "eval_steps_per_second": 0.011, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 276.4375, "completions/min_length": 224.0, "epoch": 0.2696443341604632, "grad_norm": 0.18027272820472717, "kl": 0.05755615234375, "learning_rate": 9.149309714999766e-06, "loss": 0.0005748271942138672, "memory(GiB)": 39.01, "reward": 0.4706990122795105, "reward_std": 0.1217479556798935, "rewards/VisualizationJSONCombinedORM/mean": 0.4706990122795105, "rewards/VisualizationJSONCombinedORM/std": 0.1754952371120453, "step": 326, "train_speed(iter/s)": 0.027871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 249.8125, "completions/min_length": 214.0, "epoch": 0.2704714640198511, "grad_norm": 0.16292332112789154, "kl": 0.037322998046875, "learning_rate": 9.14123677315456e-06, "loss": 0.0003726482391357422, "memory(GiB)": 39.01, "reward": 0.6183153390884399, "reward_std": 0.08698610961437225, "rewards/VisualizationJSONCombinedORM/mean": 0.6183153390884399, "rewards/VisualizationJSONCombinedORM/std": 0.13004103302955627, "step": 327, "train_speed(iter/s)": 0.027892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 266.375, "completions/min_length": 231.0, "epoch": 0.27129859387923905, "grad_norm": 0.1860460638999939, "kl": 0.0604248046875, "learning_rate": 9.133129303286166e-06, "loss": 0.0006023794412612915, "memory(GiB)": 39.01, "reward": 0.5129692554473877, "reward_std": 0.13350757956504822, "rewards/VisualizationJSONCombinedORM/mean": 0.5129692554473877, "rewards/VisualizationJSONCombinedORM/std": 0.14678314328193665, "step": 328, "train_speed(iter/s)": 0.027924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 271.0, "completions/min_length": 212.0, "epoch": 0.272125723738627, "grad_norm": 0.18900741636753082, "kl": 0.0443115234375, "learning_rate": 9.124987372991512e-06, "loss": 0.00044378824532032013, "memory(GiB)": 39.01, "reward": 0.5710375308990479, "reward_std": 0.11187896877527237, "rewards/VisualizationJSONCombinedORM/mean": 0.5710375308990479, "rewards/VisualizationJSONCombinedORM/std": 0.12329269200563431, "step": 329, "train_speed(iter/s)": 0.027947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 267.25, "completions/min_length": 202.0, "epoch": 0.2729528535980149, "grad_norm": 0.19650621712207794, "kl": 0.0633544921875, "learning_rate": 9.116811050154853e-06, "loss": 0.0006331279873847961, "memory(GiB)": 39.01, "reward": 0.5944417715072632, "reward_std": 0.15099617838859558, "rewards/VisualizationJSONCombinedORM/mean": 0.5944417715072632, "rewards/VisualizationJSONCombinedORM/std": 0.14935262501239777, "step": 330, "train_speed(iter/s)": 0.027966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 263.1875, "completions/min_length": 230.0, "epoch": 0.2737799834574028, "grad_norm": 0.19027942419052124, "kl": 0.04833984375, "learning_rate": 9.108600402947191e-06, "loss": 0.00048400089144706726, "memory(GiB)": 39.01, "reward": 0.42199668288230896, "reward_std": 0.1226121336221695, "rewards/VisualizationJSONCombinedORM/mean": 0.42199668288230896, "rewards/VisualizationJSONCombinedORM/std": 0.17827735841274261, "step": 331, "train_speed(iter/s)": 0.027997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 261.625, "completions/min_length": 214.0, "epoch": 0.2746071133167907, "grad_norm": 0.18748429417610168, "kl": 0.04766845703125, "learning_rate": 9.100355499825715e-06, "loss": 0.00047660060226917267, "memory(GiB)": 39.01, "reward": 0.6929272413253784, "reward_std": 0.0877411738038063, "rewards/VisualizationJSONCombinedORM/mean": 0.6929272413253784, "rewards/VisualizationJSONCombinedORM/std": 0.09155311435461044, "step": 332, "train_speed(iter/s)": 0.028019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 261.3125, "completions/min_length": 214.0, "epoch": 0.27543424317617865, "grad_norm": 0.2131383866071701, "kl": 0.041259765625, "learning_rate": 9.092076409533218e-06, "loss": 0.0004126615822315216, "memory(GiB)": 39.01, "reward": 0.49017655849456787, "reward_std": 0.10826542973518372, "rewards/VisualizationJSONCombinedORM/mean": 0.49017655849456787, "rewards/VisualizationJSONCombinedORM/std": 0.10962408781051636, "step": 333, "train_speed(iter/s)": 0.028047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/mean_length": 253.375, "completions/min_length": 232.0, "epoch": 0.2762613730355666, "grad_norm": 0.1798260658979416, "kl": 0.04229736328125, "learning_rate": 9.083763201097543e-06, "loss": 0.000423673540353775, "memory(GiB)": 39.01, "reward": 0.5423468351364136, "reward_std": 0.10503479838371277, "rewards/VisualizationJSONCombinedORM/mean": 0.5423468351364136, "rewards/VisualizationJSONCombinedORM/std": 0.1059160903096199, "step": 334, "train_speed(iter/s)": 0.028076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 250.6875, "completions/min_length": 205.0, "epoch": 0.2770885028949545, "grad_norm": 0.19961459934711456, "kl": 0.0501708984375, "learning_rate": 9.07541594383099e-06, "loss": 0.000501096248626709, "memory(GiB)": 39.01, "reward": 0.631430983543396, "reward_std": 0.107563816010952, "rewards/VisualizationJSONCombinedORM/mean": 0.631430983543396, "rewards/VisualizationJSONCombinedORM/std": 0.11987459659576416, "step": 335, "train_speed(iter/s)": 0.028105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 284.375, "completions/min_length": 231.0, "epoch": 0.27791563275434245, "grad_norm": 0.20216397941112518, "kl": 0.0545654296875, "learning_rate": 9.067034707329748e-06, "loss": 0.0005455613136291504, "memory(GiB)": 39.01, "reward": 0.34743040800094604, "reward_std": 0.1154242679476738, "rewards/VisualizationJSONCombinedORM/mean": 0.34743040800094604, "rewards/VisualizationJSONCombinedORM/std": 0.2247164249420166, "step": 336, "train_speed(iter/s)": 0.028116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 241.0625, "completions/min_length": 220.0, "epoch": 0.2787427626137304, "grad_norm": 0.20991243422031403, "kl": 0.0428466796875, "learning_rate": 9.058619561473308e-06, "loss": 0.0004278421401977539, "memory(GiB)": 39.01, "reward": 0.3991404175758362, "reward_std": 0.06082037463784218, "rewards/VisualizationJSONCombinedORM/mean": 0.3991404175758362, "rewards/VisualizationJSONCombinedORM/std": 0.26758792996406555, "step": 337, "train_speed(iter/s)": 0.02815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 243.3125, "completions/min_length": 207.0, "epoch": 0.27956989247311825, "grad_norm": 0.18980611860752106, "kl": 0.0517578125, "learning_rate": 9.050170576423886e-06, "loss": 0.000517725944519043, "memory(GiB)": 39.01, "reward": 0.42176032066345215, "reward_std": 0.10453791916370392, "rewards/VisualizationJSONCombinedORM/mean": 0.42176032066345215, "rewards/VisualizationJSONCombinedORM/std": 0.10178670287132263, "step": 338, "train_speed(iter/s)": 0.028171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 255.6875, "completions/min_length": 225.0, "epoch": 0.2803970223325062, "grad_norm": 0.22217053174972534, "kl": 0.06060791015625, "learning_rate": 9.041687822625843e-06, "loss": 0.000606440007686615, "memory(GiB)": 39.01, "reward": 0.4181192219257355, "reward_std": 0.10415541380643845, "rewards/VisualizationJSONCombinedORM/mean": 0.4181192219257355, "rewards/VisualizationJSONCombinedORM/std": 0.18372324109077454, "step": 339, "train_speed(iter/s)": 0.028203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 237.1875, "completions/min_length": 196.0, "epoch": 0.2812241521918941, "grad_norm": 0.19733622670173645, "kl": 0.04205322265625, "learning_rate": 9.033171370805079e-06, "loss": 0.0004209354519844055, "memory(GiB)": 39.01, "reward": 0.6920740604400635, "reward_std": 0.12795956432819366, "rewards/VisualizationJSONCombinedORM/mean": 0.6920740604400635, "rewards/VisualizationJSONCombinedORM/std": 0.1266203075647354, "step": 340, "train_speed(iter/s)": 0.028223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 279.25, "completions/min_length": 214.0, "epoch": 0.28205128205128205, "grad_norm": 0.1630927324295044, "kl": 0.072998046875, "learning_rate": 9.024621291968461e-06, "loss": 0.000729929655790329, "memory(GiB)": 39.01, "reward": 0.5832582712173462, "reward_std": 0.07273661345243454, "rewards/VisualizationJSONCombinedORM/mean": 0.5832582712173462, "rewards/VisualizationJSONCombinedORM/std": 0.1107027679681778, "step": 341, "train_speed(iter/s)": 0.02824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 276.625, "completions/min_length": 225.0, "epoch": 0.28287841191067, "grad_norm": 0.1859336793422699, "kl": 0.0511474609375, "learning_rate": 9.016037657403225e-06, "loss": 0.0005116313695907593, "memory(GiB)": 39.01, "reward": 0.3036547303199768, "reward_std": 0.050550881773233414, "rewards/VisualizationJSONCombinedORM/mean": 0.3036547303199768, "rewards/VisualizationJSONCombinedORM/std": 0.09903331845998764, "step": 342, "train_speed(iter/s)": 0.028265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 247.5, "completions/min_length": 192.0, "epoch": 0.2837055417700579, "grad_norm": 0.16487006843090057, "kl": 0.044677734375, "learning_rate": 9.007420538676382e-06, "loss": 0.00044737011194229126, "memory(GiB)": 39.01, "reward": 0.3981260657310486, "reward_std": 0.09207160770893097, "rewards/VisualizationJSONCombinedORM/mean": 0.3981260657310486, "rewards/VisualizationJSONCombinedORM/std": 0.08901439607143402, "step": 343, "train_speed(iter/s)": 0.028282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 256.625, "completions/min_length": 201.0, "epoch": 0.28453267162944584, "grad_norm": 0.20788633823394775, "kl": 0.05682373046875, "learning_rate": 8.998770007634117e-06, "loss": 0.0005682036280632019, "memory(GiB)": 39.01, "reward": 0.48007482290267944, "reward_std": 0.10654395073652267, "rewards/VisualizationJSONCombinedORM/mean": 0.48007482290267944, "rewards/VisualizationJSONCombinedORM/std": 0.11178815364837646, "step": 344, "train_speed(iter/s)": 0.028305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 263.25, "completions/min_length": 223.0, "epoch": 0.2853598014888337, "grad_norm": 0.21327948570251465, "kl": 0.0687255859375, "learning_rate": 8.990086136401199e-06, "loss": 0.0006854534149169922, "memory(GiB)": 39.01, "reward": 0.36486002802848816, "reward_std": 0.07921648025512695, "rewards/VisualizationJSONCombinedORM/mean": 0.36486002802848816, "rewards/VisualizationJSONCombinedORM/std": 0.11954693496227264, "step": 345, "train_speed(iter/s)": 0.028333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 272.5625, "completions/min_length": 223.0, "epoch": 0.28618693134822165, "grad_norm": 0.20148655772209167, "kl": 0.0740966796875, "learning_rate": 8.981368997380371e-06, "loss": 0.0007425323128700256, "memory(GiB)": 39.01, "reward": 0.4619104266166687, "reward_std": 0.138237863779068, "rewards/VisualizationJSONCombinedORM/mean": 0.4619104266166687, "rewards/VisualizationJSONCombinedORM/std": 0.18372993171215057, "step": 346, "train_speed(iter/s)": 0.028348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 264.4375, "completions/min_length": 196.0, "epoch": 0.2870140612076096, "grad_norm": 0.18519923090934753, "kl": 0.0675048828125, "learning_rate": 8.972618663251753e-06, "loss": 0.0006750635802745819, "memory(GiB)": 39.01, "reward": 0.6666520833969116, "reward_std": 0.11576224863529205, "rewards/VisualizationJSONCombinedORM/mean": 0.6666520833969116, "rewards/VisualizationJSONCombinedORM/std": 0.11427301168441772, "step": 347, "train_speed(iter/s)": 0.028376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 282.1875, "completions/min_length": 203.0, "epoch": 0.2878411910669975, "grad_norm": 0.1959715634584427, "kl": 0.055419921875, "learning_rate": 8.963835206972229e-06, "loss": 0.0005536731332540512, "memory(GiB)": 39.01, "reward": 0.49515974521636963, "reward_std": 0.13397592306137085, "rewards/VisualizationJSONCombinedORM/mean": 0.49515974521636963, "rewards/VisualizationJSONCombinedORM/std": 0.14114777743816376, "step": 348, "train_speed(iter/s)": 0.028387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 261.0, "completions/min_length": 233.0, "epoch": 0.28866832092638545, "grad_norm": 0.20404967665672302, "kl": 0.07275390625, "learning_rate": 8.955018701774848e-06, "loss": 0.0007281303405761719, "memory(GiB)": 39.01, "reward": 0.49742937088012695, "reward_std": 0.12047688663005829, "rewards/VisualizationJSONCombinedORM/mean": 0.49742937088012695, "rewards/VisualizationJSONCombinedORM/std": 0.1802700012922287, "step": 349, "train_speed(iter/s)": 0.028413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 276.1875, "completions/min_length": 231.0, "epoch": 0.2894954507857734, "grad_norm": 0.1855413168668747, "kl": 0.06256103515625, "learning_rate": 8.9461692211682e-06, "loss": 0.0006257332861423492, "memory(GiB)": 39.01, "reward": 0.4978456199169159, "reward_std": 0.11640673875808716, "rewards/VisualizationJSONCombinedORM/mean": 0.4978456199169159, "rewards/VisualizationJSONCombinedORM/std": 0.17934995889663696, "step": 350, "train_speed(iter/s)": 0.028439 }, { "epoch": 0.2894954507857734, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 313.9166666666667, "eval_completions/mean_length": 268.2447916666667, "eval_completions/min_length": 232.25, "eval_kl": 0.06396484375, "eval_loss": 0.0006427547778002918, "eval_reward": 0.49056344913939637, "eval_reward_std": 0.09963491093367338, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.49056344913939637, "eval_rewards/VisualizationJSONCombinedORM/std": 0.09963491496940453, "eval_runtime": 279.7026, "eval_samples_per_second": 0.086, "eval_steps_per_second": 0.011, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 285.0625, "completions/min_length": 229.0, "epoch": 0.2903225806451613, "grad_norm": 0.20171329379081726, "kl": 0.107666015625, "learning_rate": 8.937286838935819e-06, "loss": 0.0010778047144412994, "memory(GiB)": 39.01, "reward": 0.5266391634941101, "reward_std": 0.09654214233160019, "rewards/VisualizationJSONCombinedORM/mean": 0.5266391634941101, "rewards/VisualizationJSONCombinedORM/std": 0.17613041400909424, "step": 351, "train_speed(iter/s)": 0.027821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 263.125, "completions/min_length": 210.0, "epoch": 0.29114971050454924, "grad_norm": 0.21658505499362946, "kl": 0.0635986328125, "learning_rate": 8.928371629135558e-06, "loss": 0.0006373468786478043, "memory(GiB)": 39.01, "reward": 0.5559599995613098, "reward_std": 0.11184760928153992, "rewards/VisualizationJSONCombinedORM/mean": 0.5559599995613098, "rewards/VisualizationJSONCombinedORM/std": 0.16957111656665802, "step": 352, "train_speed(iter/s)": 0.027851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 280.3125, "completions/min_length": 225.0, "epoch": 0.2919768403639371, "grad_norm": 0.1595996916294098, "kl": 0.07183837890625, "learning_rate": 8.91942366609897e-06, "loss": 0.0007173791527748108, "memory(GiB)": 39.01, "reward": 0.550249457359314, "reward_std": 0.07608957588672638, "rewards/VisualizationJSONCombinedORM/mean": 0.550249457359314, "rewards/VisualizationJSONCombinedORM/std": 0.1744118183851242, "step": 353, "train_speed(iter/s)": 0.027881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 247.5, "completions/min_length": 198.0, "epoch": 0.29280397022332505, "grad_norm": 0.1977897435426712, "kl": 0.08489990234375, "learning_rate": 8.910443024430695e-06, "loss": 0.0008482784032821655, "memory(GiB)": 39.01, "reward": 0.5707519054412842, "reward_std": 0.13792437314987183, "rewards/VisualizationJSONCombinedORM/mean": 0.5707519054412842, "rewards/VisualizationJSONCombinedORM/std": 0.1542736440896988, "step": 354, "train_speed(iter/s)": 0.027913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 281.9375, "completions/min_length": 211.0, "epoch": 0.293631100082713, "grad_norm": 0.19502143561840057, "kl": 0.06756591796875, "learning_rate": 8.901429779007833e-06, "loss": 0.0006754323840141296, "memory(GiB)": 39.01, "reward": 0.586725115776062, "reward_std": 0.11736904084682465, "rewards/VisualizationJSONCombinedORM/mean": 0.586725115776062, "rewards/VisualizationJSONCombinedORM/std": 0.11747459322214127, "step": 355, "train_speed(iter/s)": 0.027942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 278.6875, "completions/min_length": 225.0, "epoch": 0.2944582299421009, "grad_norm": 0.19767910242080688, "kl": 0.11474609375, "learning_rate": 8.892384004979325e-06, "loss": 0.0011460483074188232, "memory(GiB)": 39.01, "reward": 0.3754688799381256, "reward_std": 0.0968971997499466, "rewards/VisualizationJSONCombinedORM/mean": 0.3754688799381256, "rewards/VisualizationJSONCombinedORM/std": 0.17683981359004974, "step": 356, "train_speed(iter/s)": 0.027969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 276.25, "completions/min_length": 218.0, "epoch": 0.29528535980148884, "grad_norm": 0.20282316207885742, "kl": 0.09521484375, "learning_rate": 8.883305777765319e-06, "loss": 0.0009502097964286804, "memory(GiB)": 39.01, "reward": 0.7065755128860474, "reward_std": 0.16100192070007324, "rewards/VisualizationJSONCombinedORM/mean": 0.7065755128860474, "rewards/VisualizationJSONCombinedORM/std": 0.1567647010087967, "step": 357, "train_speed(iter/s)": 0.027983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 266.8125, "completions/min_length": 211.0, "epoch": 0.2961124896608768, "grad_norm": 0.19831553101539612, "kl": 0.05615234375, "learning_rate": 8.874195173056543e-06, "loss": 0.0005607567727565765, "memory(GiB)": 39.01, "reward": 0.23689743876457214, "reward_std": 0.04130905121564865, "rewards/VisualizationJSONCombinedORM/mean": 0.23689743876457214, "rewards/VisualizationJSONCombinedORM/std": 0.064460389316082, "step": 358, "train_speed(iter/s)": 0.028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 271.0625, "completions/min_length": 225.0, "epoch": 0.2969396195202647, "grad_norm": 0.21924397349357605, "kl": 0.0706787109375, "learning_rate": 8.865052266813686e-06, "loss": 0.0007088333368301392, "memory(GiB)": 39.01, "reward": 0.3357134759426117, "reward_std": 0.06377646327018738, "rewards/VisualizationJSONCombinedORM/mean": 0.3357134759426117, "rewards/VisualizationJSONCombinedORM/std": 0.07045077532529831, "step": 359, "train_speed(iter/s)": 0.028024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 290.5, "completions/min_length": 255.0, "epoch": 0.2977667493796526, "grad_norm": 0.19061563909053802, "kl": 0.1207275390625, "learning_rate": 8.855877135266742e-06, "loss": 0.0012075882405042648, "memory(GiB)": 39.01, "reward": 0.44616663455963135, "reward_std": 0.1194382831454277, "rewards/VisualizationJSONCombinedORM/mean": 0.44616663455963135, "rewards/VisualizationJSONCombinedORM/std": 0.20942460000514984, "step": 360, "train_speed(iter/s)": 0.028045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 252.625, "completions/min_length": 215.0, "epoch": 0.2985938792390405, "grad_norm": 0.21992358565330505, "kl": 0.0931396484375, "learning_rate": 8.846669854914395e-06, "loss": 0.000931683462113142, "memory(GiB)": 39.01, "reward": 0.657455563545227, "reward_std": 0.11134131252765656, "rewards/VisualizationJSONCombinedORM/mean": 0.657455563545227, "rewards/VisualizationJSONCombinedORM/std": 0.15100222826004028, "step": 361, "train_speed(iter/s)": 0.028061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 279.25, "completions/min_length": 198.0, "epoch": 0.29942100909842845, "grad_norm": 0.23154401779174805, "kl": 0.087158203125, "learning_rate": 8.837430502523372e-06, "loss": 0.0008711293339729309, "memory(GiB)": 39.01, "reward": 0.2831100821495056, "reward_std": 0.20834745466709137, "rewards/VisualizationJSONCombinedORM/mean": 0.2831100821495056, "rewards/VisualizationJSONCombinedORM/std": 0.2518567740917206, "step": 362, "train_speed(iter/s)": 0.028083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 289.0, "completions/min_length": 225.0, "epoch": 0.3002481389578164, "grad_norm": 0.18917900323867798, "kl": 0.06707763671875, "learning_rate": 8.828159155127802e-06, "loss": 0.0006706267595291138, "memory(GiB)": 39.01, "reward": 0.39107656478881836, "reward_std": 0.08769959211349487, "rewards/VisualizationJSONCombinedORM/mean": 0.39107656478881836, "rewards/VisualizationJSONCombinedORM/std": 0.1073845848441124, "step": 363, "train_speed(iter/s)": 0.028102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 266.75, "completions/min_length": 207.0, "epoch": 0.3010752688172043, "grad_norm": 0.19884416460990906, "kl": 0.0635986328125, "learning_rate": 8.818855890028578e-06, "loss": 0.0006370954215526581, "memory(GiB)": 39.01, "reward": 0.6006299257278442, "reward_std": 0.13622620701789856, "rewards/VisualizationJSONCombinedORM/mean": 0.6006299257278442, "rewards/VisualizationJSONCombinedORM/std": 0.18079374730587006, "step": 364, "train_speed(iter/s)": 0.028127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 279.6875, "completions/min_length": 208.0, "epoch": 0.30190239867659224, "grad_norm": 0.20079076290130615, "kl": 0.07568359375, "learning_rate": 8.80952078479271e-06, "loss": 0.0007572323083877563, "memory(GiB)": 39.01, "reward": 0.507414698600769, "reward_std": 0.09462037682533264, "rewards/VisualizationJSONCombinedORM/mean": 0.507414698600769, "rewards/VisualizationJSONCombinedORM/std": 0.1695495992898941, "step": 365, "train_speed(iter/s)": 0.028147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 272.125, "completions/min_length": 217.0, "epoch": 0.3027295285359802, "grad_norm": 0.19602961838245392, "kl": 0.0570068359375, "learning_rate": 8.800153917252679e-06, "loss": 0.0005712732672691345, "memory(GiB)": 39.01, "reward": 0.5660855770111084, "reward_std": 0.08167208731174469, "rewards/VisualizationJSONCombinedORM/mean": 0.5660855770111084, "rewards/VisualizationJSONCombinedORM/std": 0.15377455949783325, "step": 366, "train_speed(iter/s)": 0.028161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 302.0625, "completions/min_length": 237.0, "epoch": 0.30355665839536805, "grad_norm": 0.21063017845153809, "kl": 0.091796875, "learning_rate": 8.790755365505785e-06, "loss": 0.0009187757968902588, "memory(GiB)": 39.01, "reward": 0.2958011031150818, "reward_std": 0.06682433187961578, "rewards/VisualizationJSONCombinedORM/mean": 0.2958011031150818, "rewards/VisualizationJSONCombinedORM/std": 0.08648975193500519, "step": 367, "train_speed(iter/s)": 0.028178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 258.4375, "completions/min_length": 212.0, "epoch": 0.304383788254756, "grad_norm": 0.2039901167154312, "kl": 0.08221435546875, "learning_rate": 8.781325207913502e-06, "loss": 0.0008243662305176258, "memory(GiB)": 39.01, "reward": 0.5916420221328735, "reward_std": 0.12829133868217468, "rewards/VisualizationJSONCombinedORM/mean": 0.5916420221328735, "rewards/VisualizationJSONCombinedORM/std": 0.1344420611858368, "step": 368, "train_speed(iter/s)": 0.028213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 273.9375, "completions/min_length": 233.0, "epoch": 0.3052109181141439, "grad_norm": 0.23863548040390015, "kl": 0.05792236328125, "learning_rate": 8.771863523100821e-06, "loss": 0.0005800351500511169, "memory(GiB)": 39.01, "reward": 0.5620265007019043, "reward_std": 0.10659145563840866, "rewards/VisualizationJSONCombinedORM/mean": 0.5620265007019043, "rewards/VisualizationJSONCombinedORM/std": 0.11591935157775879, "step": 369, "train_speed(iter/s)": 0.028237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 275.8125, "completions/min_length": 220.0, "epoch": 0.30603804797353185, "grad_norm": 0.19759465754032135, "kl": 0.07958984375, "learning_rate": 8.762370389955592e-06, "loss": 0.0007962211966514587, "memory(GiB)": 39.01, "reward": 0.530056357383728, "reward_std": 0.17565089464187622, "rewards/VisualizationJSONCombinedORM/mean": 0.530056357383728, "rewards/VisualizationJSONCombinedORM/std": 0.1979132741689682, "step": 370, "train_speed(iter/s)": 0.028266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 275.4375, "completions/min_length": 206.0, "epoch": 0.3068651778329198, "grad_norm": 0.23758450150489807, "kl": 0.0751953125, "learning_rate": 8.752845887627872e-06, "loss": 0.0007534995675086975, "memory(GiB)": 39.01, "reward": 0.5193362832069397, "reward_std": 0.10174676775932312, "rewards/VisualizationJSONCombinedORM/mean": 0.5193362832069397, "rewards/VisualizationJSONCombinedORM/std": 0.2009720802307129, "step": 371, "train_speed(iter/s)": 0.028289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 257.0625, "completions/min_length": 217.0, "epoch": 0.3076923076923077, "grad_norm": 0.19559575617313385, "kl": 0.06494140625, "learning_rate": 8.74329009552926e-06, "loss": 0.0006488710641860962, "memory(GiB)": 39.01, "reward": 0.5513579249382019, "reward_std": 0.10868044197559357, "rewards/VisualizationJSONCombinedORM/mean": 0.5513579249382019, "rewards/VisualizationJSONCombinedORM/std": 0.14060662686824799, "step": 372, "train_speed(iter/s)": 0.028315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 254.125, "completions/min_length": 217.0, "epoch": 0.30851943755169564, "grad_norm": 0.23271790146827698, "kl": 0.04766845703125, "learning_rate": 8.733703093332237e-06, "loss": 0.00047742947936058044, "memory(GiB)": 39.01, "reward": 0.3419567942619324, "reward_std": 0.09137776494026184, "rewards/VisualizationJSONCombinedORM/mean": 0.3419567942619324, "rewards/VisualizationJSONCombinedORM/std": 0.10911109298467636, "step": 373, "train_speed(iter/s)": 0.028346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/mean_length": 244.75, "completions/min_length": 209.0, "epoch": 0.3093465674110835, "grad_norm": 0.1836162954568863, "kl": 0.08367919921875, "learning_rate": 8.724084960969507e-06, "loss": 0.0008358117192983627, "memory(GiB)": 39.01, "reward": 0.45319288969039917, "reward_std": 0.08837665617465973, "rewards/VisualizationJSONCombinedORM/mean": 0.45319288969039917, "rewards/VisualizationJSONCombinedORM/std": 0.23441316187381744, "step": 374, "train_speed(iter/s)": 0.028369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 287.375, "completions/min_length": 214.0, "epoch": 0.31017369727047145, "grad_norm": 0.19004325568675995, "kl": 0.07159423828125, "learning_rate": 8.714435778633314e-06, "loss": 0.0007165446877479553, "memory(GiB)": 39.01, "reward": 0.5591437816619873, "reward_std": 0.12092256546020508, "rewards/VisualizationJSONCombinedORM/mean": 0.5591437816619873, "rewards/VisualizationJSONCombinedORM/std": 0.22227843105793, "step": 375, "train_speed(iter/s)": 0.028393 }, { "epoch": 0.31017369727047145, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 321.2083333333333, "eval_completions/mean_length": 273.4427083333333, "eval_completions/min_length": 236.75, "eval_kl": 0.059621175130208336, "eval_loss": 0.0005955795641057193, "eval_reward": 0.4796646200120449, "eval_reward_std": 0.10473314307940502, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4796646200120449, "eval_rewards/VisualizationJSONCombinedORM/std": 0.10473314750318725, "eval_runtime": 283.8308, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.011, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/mean_length": 260.75, "completions/min_length": 199.0, "epoch": 0.3110008271298594, "grad_norm": 0.1966690868139267, "kl": 0.0386962890625, "learning_rate": 8.704755626774796e-06, "loss": 0.00038664788007736206, "memory(GiB)": 39.01, "reward": 0.5371428728103638, "reward_std": 0.08353325724601746, "rewards/VisualizationJSONCombinedORM/mean": 0.5371428728103638, "rewards/VisualizationJSONCombinedORM/std": 0.14168687164783478, "step": 376, "train_speed(iter/s)": 0.027817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 273.9375, "completions/min_length": 204.0, "epoch": 0.3118279569892473, "grad_norm": 0.1874321550130844, "kl": 0.03778076171875, "learning_rate": 8.695044586103297e-06, "loss": 0.00037819333374500275, "memory(GiB)": 39.01, "reward": 0.5152400732040405, "reward_std": 0.17263156175613403, "rewards/VisualizationJSONCombinedORM/mean": 0.5152400732040405, "rewards/VisualizationJSONCombinedORM/std": 0.16924883425235748, "step": 377, "train_speed(iter/s)": 0.027843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 286.4375, "completions/min_length": 239.0, "epoch": 0.31265508684863524, "grad_norm": 0.1961754560470581, "kl": 0.0623779296875, "learning_rate": 8.6853027375857e-06, "loss": 0.0006236769258975983, "memory(GiB)": 39.01, "reward": 0.311405211687088, "reward_std": 0.07394722104072571, "rewards/VisualizationJSONCombinedORM/mean": 0.311405211687088, "rewards/VisualizationJSONCombinedORM/std": 0.07708793133497238, "step": 378, "train_speed(iter/s)": 0.027866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 279.25, "completions/min_length": 240.0, "epoch": 0.3134822167080232, "grad_norm": 0.19099970161914825, "kl": 0.05596923828125, "learning_rate": 8.675530162445753e-06, "loss": 0.000559389591217041, "memory(GiB)": 39.01, "reward": 0.5730457901954651, "reward_std": 0.1264209896326065, "rewards/VisualizationJSONCombinedORM/mean": 0.5730457901954651, "rewards/VisualizationJSONCombinedORM/std": 0.1222587302327156, "step": 379, "train_speed(iter/s)": 0.02789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 260.6875, "completions/min_length": 208.0, "epoch": 0.3143093465674111, "grad_norm": 0.18329226970672607, "kl": 0.0653076171875, "learning_rate": 8.66572694216339e-06, "loss": 0.0006536096334457397, "memory(GiB)": 39.01, "reward": 0.6091811656951904, "reward_std": 0.13899920880794525, "rewards/VisualizationJSONCombinedORM/mean": 0.6091811656951904, "rewards/VisualizationJSONCombinedORM/std": 0.1567065417766571, "step": 380, "train_speed(iter/s)": 0.027901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 284.1875, "completions/min_length": 232.0, "epoch": 0.315136476426799, "grad_norm": 0.2058449536561966, "kl": 0.05841064453125, "learning_rate": 8.655893158474056e-06, "loss": 0.0005855066701769829, "memory(GiB)": 39.01, "reward": 0.6294856071472168, "reward_std": 0.1399548202753067, "rewards/VisualizationJSONCombinedORM/mean": 0.6294856071472168, "rewards/VisualizationJSONCombinedORM/std": 0.14888736605644226, "step": 381, "train_speed(iter/s)": 0.027916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 321.25, "completions/min_length": 227.0, "epoch": 0.3159636062861869, "grad_norm": 0.1862473338842392, "kl": 0.05908203125, "learning_rate": 8.646028893368014e-06, "loss": 0.000590004026889801, "memory(GiB)": 39.01, "reward": 0.6010853052139282, "reward_std": 0.16036836802959442, "rewards/VisualizationJSONCombinedORM/mean": 0.6010853052139282, "rewards/VisualizationJSONCombinedORM/std": 0.15819287300109863, "step": 382, "train_speed(iter/s)": 0.027934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 287.25, "completions/min_length": 252.0, "epoch": 0.31679073614557485, "grad_norm": 0.17452703416347504, "kl": 0.031890869140625, "learning_rate": 8.636134229089676e-06, "loss": 0.00031913816928863525, "memory(GiB)": 39.01, "reward": 0.4690382182598114, "reward_std": 0.05826621875166893, "rewards/VisualizationJSONCombinedORM/mean": 0.4690382182598114, "rewards/VisualizationJSONCombinedORM/std": 0.10890337079763412, "step": 383, "train_speed(iter/s)": 0.027952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 291.0625, "completions/min_length": 234.0, "epoch": 0.3176178660049628, "grad_norm": 0.17442353069782257, "kl": 0.06475830078125, "learning_rate": 8.626209248136911e-06, "loss": 0.0006478577852249146, "memory(GiB)": 39.01, "reward": 0.5592007637023926, "reward_std": 0.11840279400348663, "rewards/VisualizationJSONCombinedORM/mean": 0.5592007637023926, "rewards/VisualizationJSONCombinedORM/std": 0.1771722435951233, "step": 384, "train_speed(iter/s)": 0.027966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 278.75, "completions/min_length": 232.0, "epoch": 0.3184449958643507, "grad_norm": 0.20480923354625702, "kl": 0.05999755859375, "learning_rate": 8.616254033260351e-06, "loss": 0.000599738210439682, "memory(GiB)": 39.01, "reward": 0.6183185577392578, "reward_std": 0.11821451783180237, "rewards/VisualizationJSONCombinedORM/mean": 0.6183185577392578, "rewards/VisualizationJSONCombinedORM/std": 0.1364973783493042, "step": 385, "train_speed(iter/s)": 0.027992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 279.1875, "completions/min_length": 224.0, "epoch": 0.31927212572373864, "grad_norm": 0.20782791078090668, "kl": 0.07470703125, "learning_rate": 8.606268667462708e-06, "loss": 0.0007487349212169647, "memory(GiB)": 39.01, "reward": 0.4825613498687744, "reward_std": 0.12447316199541092, "rewards/VisualizationJSONCombinedORM/mean": 0.4825613498687744, "rewards/VisualizationJSONCombinedORM/std": 0.12127425521612167, "step": 386, "train_speed(iter/s)": 0.028018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 288.6875, "completions/min_length": 233.0, "epoch": 0.3200992555831266, "grad_norm": 0.17487548291683197, "kl": 0.06207275390625, "learning_rate": 8.596253233998087e-06, "loss": 0.0006196200847625732, "memory(GiB)": 39.01, "reward": 0.3891586661338806, "reward_std": 0.08995617181062698, "rewards/VisualizationJSONCombinedORM/mean": 0.3891586661338806, "rewards/VisualizationJSONCombinedORM/std": 0.0940169021487236, "step": 387, "train_speed(iter/s)": 0.028036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 290.5625, "completions/min_length": 234.0, "epoch": 0.32092638544251445, "grad_norm": 0.18543991446495056, "kl": 0.03802490234375, "learning_rate": 8.586207816371276e-06, "loss": 0.00037993118166923523, "memory(GiB)": 39.01, "reward": 0.573743462562561, "reward_std": 0.1123126670718193, "rewards/VisualizationJSONCombinedORM/mean": 0.573743462562561, "rewards/VisualizationJSONCombinedORM/std": 0.1968742311000824, "step": 388, "train_speed(iter/s)": 0.028052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 283.875, "completions/min_length": 220.0, "epoch": 0.3217535153019024, "grad_norm": 0.20733793079853058, "kl": 0.0546875, "learning_rate": 8.576132498337069e-06, "loss": 0.0005478113889694214, "memory(GiB)": 39.01, "reward": 0.5848487019538879, "reward_std": 0.13042756915092468, "rewards/VisualizationJSONCombinedORM/mean": 0.5848487019538879, "rewards/VisualizationJSONCombinedORM/std": 0.13384020328521729, "step": 389, "train_speed(iter/s)": 0.028077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 271.75, "completions/min_length": 229.0, "epoch": 0.3225806451612903, "grad_norm": 0.18798641860485077, "kl": 0.0457763671875, "learning_rate": 8.566027363899548e-06, "loss": 0.00045744143426418304, "memory(GiB)": 39.01, "reward": 0.4342663586139679, "reward_std": 0.08064348250627518, "rewards/VisualizationJSONCombinedORM/mean": 0.4342663586139679, "rewards/VisualizationJSONCombinedORM/std": 0.21159625053405762, "step": 390, "train_speed(iter/s)": 0.028099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 309.75, "completions/min_length": 235.0, "epoch": 0.32340777502067825, "grad_norm": 0.20926973223686218, "kl": 0.05731201171875, "learning_rate": 8.555892497311402e-06, "loss": 0.0005733929574489594, "memory(GiB)": 39.01, "reward": 0.522990345954895, "reward_std": 0.12336818128824234, "rewards/VisualizationJSONCombinedORM/mean": 0.522990345954895, "rewards/VisualizationJSONCombinedORM/std": 0.12708650529384613, "step": 391, "train_speed(iter/s)": 0.02811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 302.9375, "completions/min_length": 247.0, "epoch": 0.3242349048800662, "grad_norm": 0.17674772441387177, "kl": 0.06390380859375, "learning_rate": 8.545727983073209e-06, "loss": 0.0006399694830179214, "memory(GiB)": 39.01, "reward": 0.5865004658699036, "reward_std": 0.07531968504190445, "rewards/VisualizationJSONCombinedORM/mean": 0.5865004658699036, "rewards/VisualizationJSONCombinedORM/std": 0.15983358025550842, "step": 392, "train_speed(iter/s)": 0.028132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 304.625, "completions/min_length": 222.0, "epoch": 0.3250620347394541, "grad_norm": 0.18682710826396942, "kl": 0.0838623046875, "learning_rate": 8.535533905932739e-06, "loss": 0.0008385069668292999, "memory(GiB)": 39.01, "reward": 0.5240486264228821, "reward_std": 0.10909845679998398, "rewards/VisualizationJSONCombinedORM/mean": 0.5240486264228821, "rewards/VisualizationJSONCombinedORM/std": 0.12953178584575653, "step": 393, "train_speed(iter/s)": 0.028158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 263.25, "completions/min_length": 203.0, "epoch": 0.32588916459884204, "grad_norm": 0.18365876376628876, "kl": 0.06378173828125, "learning_rate": 8.525310350884246e-06, "loss": 0.0006383620202541351, "memory(GiB)": 39.01, "reward": 0.3819335401058197, "reward_std": 0.11879275739192963, "rewards/VisualizationJSONCombinedORM/mean": 0.3819335401058197, "rewards/VisualizationJSONCombinedORM/std": 0.19825471937656403, "step": 394, "train_speed(iter/s)": 0.028178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 290.3125, "completions/min_length": 226.0, "epoch": 0.3267162944582299, "grad_norm": 0.20103289186954498, "kl": 0.05902099609375, "learning_rate": 8.515057403167764e-06, "loss": 0.0005892738699913025, "memory(GiB)": 39.01, "reward": 0.635608971118927, "reward_std": 0.13110558688640594, "rewards/VisualizationJSONCombinedORM/mean": 0.635608971118927, "rewards/VisualizationJSONCombinedORM/std": 0.13620901107788086, "step": 395, "train_speed(iter/s)": 0.028207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 283.125, "completions/min_length": 229.0, "epoch": 0.32754342431761785, "grad_norm": 0.18182148039340973, "kl": 0.06793212890625, "learning_rate": 8.504775148268381e-06, "loss": 0.0006791874766349792, "memory(GiB)": 39.01, "reward": 0.4930909276008606, "reward_std": 0.07881627976894379, "rewards/VisualizationJSONCombinedORM/mean": 0.4930909276008606, "rewards/VisualizationJSONCombinedORM/std": 0.08533249795436859, "step": 396, "train_speed(iter/s)": 0.028239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 315.4375, "completions/min_length": 241.0, "epoch": 0.3283705541770058, "grad_norm": 0.1949678659439087, "kl": 0.04547119140625, "learning_rate": 8.494463671915547e-06, "loss": 0.00045455992221832275, "memory(GiB)": 39.01, "reward": 0.6605364084243774, "reward_std": 0.13277743756771088, "rewards/VisualizationJSONCombinedORM/mean": 0.6605364084243774, "rewards/VisualizationJSONCombinedORM/std": 0.12951131165027618, "step": 397, "train_speed(iter/s)": 0.02825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 291.125, "completions/min_length": 237.0, "epoch": 0.3291976840363937, "grad_norm": 0.1781718134880066, "kl": 0.04022216796875, "learning_rate": 8.484123060082346e-06, "loss": 0.0004024431109428406, "memory(GiB)": 39.01, "reward": 0.6255620718002319, "reward_std": 0.1388951539993286, "rewards/VisualizationJSONCombinedORM/mean": 0.6255620718002319, "rewards/VisualizationJSONCombinedORM/std": 0.15771931409835815, "step": 398, "train_speed(iter/s)": 0.028274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 295.875, "completions/min_length": 238.0, "epoch": 0.33002481389578164, "grad_norm": 0.1920441836118698, "kl": 0.094970703125, "learning_rate": 8.473753398984781e-06, "loss": 0.0009510442614555359, "memory(GiB)": 39.01, "reward": 0.24888554215431213, "reward_std": 0.059977345168590546, "rewards/VisualizationJSONCombinedORM/mean": 0.24888554215431213, "rewards/VisualizationJSONCombinedORM/std": 0.06151294335722923, "step": 399, "train_speed(iter/s)": 0.028285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 277.0, "completions/min_length": 236.0, "epoch": 0.3308519437551696, "grad_norm": 0.1961251199245453, "kl": 0.058349609375, "learning_rate": 8.463354775081056e-06, "loss": 0.0005848295986652374, "memory(GiB)": 39.01, "reward": 0.49150538444519043, "reward_std": 0.09324154257774353, "rewards/VisualizationJSONCombinedORM/mean": 0.49150538444519043, "rewards/VisualizationJSONCombinedORM/std": 0.22485089302062988, "step": 400, "train_speed(iter/s)": 0.028303 }, { "epoch": 0.3308519437551696, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 344.875, "eval_completions/mean_length": 289.9895833333333, "eval_completions/min_length": 246.58333333333334, "eval_kl": 0.06720987955729167, "eval_loss": 0.0006716176867485046, "eval_reward": 0.4931657885511716, "eval_reward_std": 0.0866262367926538, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4931657885511716, "eval_rewards/VisualizationJSONCombinedORM/std": 0.08662623850007851, "eval_runtime": 298.0072, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 304.0625, "completions/min_length": 253.0, "epoch": 0.3316790736145575, "grad_norm": 0.1960788518190384, "kl": 0.06072998046875, "learning_rate": 8.452927275070858e-06, "loss": 0.0006076246500015259, "memory(GiB)": 39.01, "reward": 0.630734920501709, "reward_std": 0.10560670495033264, "rewards/VisualizationJSONCombinedORM/mean": 0.630734920501709, "rewards/VisualizationJSONCombinedORM/std": 0.10645616054534912, "step": 401, "train_speed(iter/s)": 0.027732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 310.8125, "completions/min_length": 276.0, "epoch": 0.3325062034739454, "grad_norm": 0.19933229684829712, "kl": 0.0955810546875, "learning_rate": 8.442470985894631e-06, "loss": 0.0009580180048942566, "memory(GiB)": 39.01, "reward": 0.5159990787506104, "reward_std": 0.11331094801425934, "rewards/VisualizationJSONCombinedORM/mean": 0.5159990787506104, "rewards/VisualizationJSONCombinedORM/std": 0.13016174733638763, "step": 402, "train_speed(iter/s)": 0.027753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 283.6875, "completions/min_length": 201.0, "epoch": 0.3333333333333333, "grad_norm": 0.20589126646518707, "kl": 0.07110595703125, "learning_rate": 8.43198599473285e-06, "loss": 0.0007112473249435425, "memory(GiB)": 39.01, "reward": 0.5827153325080872, "reward_std": 0.0999850481748581, "rewards/VisualizationJSONCombinedORM/mean": 0.5827153325080872, "rewards/VisualizationJSONCombinedORM/std": 0.10875339806079865, "step": 403, "train_speed(iter/s)": 0.027764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 302.6875, "completions/min_length": 249.0, "epoch": 0.33416046319272125, "grad_norm": 0.21068750321865082, "kl": 0.0760498046875, "learning_rate": 8.4214723890053e-06, "loss": 0.000761277973651886, "memory(GiB)": 39.01, "reward": 0.44482550024986267, "reward_std": 0.08564447611570358, "rewards/VisualizationJSONCombinedORM/mean": 0.44482550024986267, "rewards/VisualizationJSONCombinedORM/std": 0.13774144649505615, "step": 404, "train_speed(iter/s)": 0.027783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 301.75, "completions/min_length": 237.0, "epoch": 0.3349875930521092, "grad_norm": 0.30570074915885925, "kl": 0.08001708984375, "learning_rate": 8.410930256370337e-06, "loss": 0.000800449401140213, "memory(GiB)": 39.01, "reward": 0.5223488211631775, "reward_std": 0.15837132930755615, "rewards/VisualizationJSONCombinedORM/mean": 0.5223488211631775, "rewards/VisualizationJSONCombinedORM/std": 0.15792740881443024, "step": 405, "train_speed(iter/s)": 0.027805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 290.8125, "completions/min_length": 242.0, "epoch": 0.3358147229114971, "grad_norm": 0.18773400783538818, "kl": 0.0650634765625, "learning_rate": 8.400359684724168e-06, "loss": 0.0006500892341136932, "memory(GiB)": 39.01, "reward": 0.4648556113243103, "reward_std": 0.0981888398528099, "rewards/VisualizationJSONCombinedORM/mean": 0.4648556113243103, "rewards/VisualizationJSONCombinedORM/std": 0.22517572343349457, "step": 406, "train_speed(iter/s)": 0.027818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 313.75, "completions/min_length": 237.0, "epoch": 0.33664185277088504, "grad_norm": 0.39052078127861023, "kl": 0.05560302734375, "learning_rate": 8.389760762200117e-06, "loss": 0.000555574893951416, "memory(GiB)": 39.01, "reward": 0.4965183734893799, "reward_std": 0.11338803917169571, "rewards/VisualizationJSONCombinedORM/mean": 0.4965183734893799, "rewards/VisualizationJSONCombinedORM/std": 0.16367442905902863, "step": 407, "train_speed(iter/s)": 0.027835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 291.6875, "completions/min_length": 240.0, "epoch": 0.337468982630273, "grad_norm": 0.20400603115558624, "kl": 0.0616455078125, "learning_rate": 8.379133577167875e-06, "loss": 0.000614803284406662, "memory(GiB)": 39.01, "reward": 0.49840620160102844, "reward_std": 0.06342092901468277, "rewards/VisualizationJSONCombinedORM/mean": 0.49840620160102844, "rewards/VisualizationJSONCombinedORM/std": 0.22523275017738342, "step": 408, "train_speed(iter/s)": 0.027843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 284.4375, "completions/min_length": 215.0, "epoch": 0.33829611248966085, "grad_norm": 0.1988525241613388, "kl": 0.056396484375, "learning_rate": 8.368478218232787e-06, "loss": 0.0005640611052513123, "memory(GiB)": 39.01, "reward": 0.4166383445262909, "reward_std": 0.10022212564945221, "rewards/VisualizationJSONCombinedORM/mean": 0.4166383445262909, "rewards/VisualizationJSONCombinedORM/std": 0.0994628369808197, "step": 409, "train_speed(iter/s)": 0.027859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 303.125, "completions/min_length": 231.0, "epoch": 0.3391232423490488, "grad_norm": 0.18147249519824982, "kl": 0.04534912109375, "learning_rate": 8.357794774235094e-06, "loss": 0.00045324116945266724, "memory(GiB)": 39.01, "reward": 0.48967477679252625, "reward_std": 0.05176864191889763, "rewards/VisualizationJSONCombinedORM/mean": 0.48967477679252625, "rewards/VisualizationJSONCombinedORM/std": 0.2338569313287735, "step": 410, "train_speed(iter/s)": 0.02788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 295.6875, "completions/min_length": 254.0, "epoch": 0.3399503722084367, "grad_norm": 0.19900040328502655, "kl": 0.06524658203125, "learning_rate": 8.347083334249198e-06, "loss": 0.0006532073020935059, "memory(GiB)": 39.01, "reward": 0.5478857159614563, "reward_std": 0.11539959162473679, "rewards/VisualizationJSONCombinedORM/mean": 0.5478857159614563, "rewards/VisualizationJSONCombinedORM/std": 0.19697242975234985, "step": 411, "train_speed(iter/s)": 0.027893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 288.8125, "completions/min_length": 227.0, "epoch": 0.34077750206782464, "grad_norm": 0.18206720054149628, "kl": 0.04888916015625, "learning_rate": 8.33634398758293e-06, "loss": 0.0004886509850621223, "memory(GiB)": 39.01, "reward": 0.5442171692848206, "reward_std": 0.05571061745285988, "rewards/VisualizationJSONCombinedORM/mean": 0.5442171692848206, "rewards/VisualizationJSONCombinedORM/std": 0.22523875534534454, "step": 412, "train_speed(iter/s)": 0.027912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 316.0625, "completions/min_length": 273.0, "epoch": 0.3416046319272126, "grad_norm": 0.193484365940094, "kl": 0.048583984375, "learning_rate": 8.325576823776785e-06, "loss": 0.00048576295375823975, "memory(GiB)": 39.01, "reward": 0.5339990258216858, "reward_std": 0.10525356233119965, "rewards/VisualizationJSONCombinedORM/mean": 0.5339990258216858, "rewards/VisualizationJSONCombinedORM/std": 0.20679984986782074, "step": 413, "train_speed(iter/s)": 0.027931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 262.625, "completions/min_length": 233.0, "epoch": 0.3424317617866005, "grad_norm": 0.1723775416612625, "kl": 0.04632568359375, "learning_rate": 8.314781932603194e-06, "loss": 0.00046384334564208984, "memory(GiB)": 39.01, "reward": 0.6075239181518555, "reward_std": 0.09552600979804993, "rewards/VisualizationJSONCombinedORM/mean": 0.6075239181518555, "rewards/VisualizationJSONCombinedORM/std": 0.1400538980960846, "step": 414, "train_speed(iter/s)": 0.027955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 295.5625, "completions/min_length": 224.0, "epoch": 0.34325889164598844, "grad_norm": 0.2184753268957138, "kl": 0.0550537109375, "learning_rate": 8.303959404065763e-06, "loss": 0.0005502179265022278, "memory(GiB)": 39.01, "reward": 0.5086814165115356, "reward_std": 0.10965219140052795, "rewards/VisualizationJSONCombinedORM/mean": 0.5086814165115356, "rewards/VisualizationJSONCombinedORM/std": 0.15289992094039917, "step": 415, "train_speed(iter/s)": 0.027963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 301.125, "completions/min_length": 234.0, "epoch": 0.34408602150537637, "grad_norm": 0.19173721969127655, "kl": 0.07403564453125, "learning_rate": 8.29310932839853e-06, "loss": 0.0007404610514640808, "memory(GiB)": 39.01, "reward": 0.5586879253387451, "reward_std": 0.126773402094841, "rewards/VisualizationJSONCombinedORM/mean": 0.5586879253387451, "rewards/VisualizationJSONCombinedORM/std": 0.140001118183136, "step": 416, "train_speed(iter/s)": 0.027982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 291.5, "completions/min_length": 231.0, "epoch": 0.34491315136476425, "grad_norm": 0.17898887395858765, "kl": 0.05908203125, "learning_rate": 8.282231796065215e-06, "loss": 0.0005907304584980011, "memory(GiB)": 39.01, "reward": 0.49339085817337036, "reward_std": 0.10238748788833618, "rewards/VisualizationJSONCombinedORM/mean": 0.49339085817337036, "rewards/VisualizationJSONCombinedORM/std": 0.18212689459323883, "step": 417, "train_speed(iter/s)": 0.027997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 293.75, "completions/min_length": 230.0, "epoch": 0.3457402812241522, "grad_norm": 0.19892767071723938, "kl": 0.04107666015625, "learning_rate": 8.27132689775845e-06, "loss": 0.00041149184107780457, "memory(GiB)": 39.01, "reward": 0.41983723640441895, "reward_std": 0.05880865827202797, "rewards/VisualizationJSONCombinedORM/mean": 0.41983723640441895, "rewards/VisualizationJSONCombinedORM/std": 0.10586091130971909, "step": 418, "train_speed(iter/s)": 0.028015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 296.9375, "completions/min_length": 222.0, "epoch": 0.3465674110835401, "grad_norm": 0.1954062283039093, "kl": 0.0374755859375, "learning_rate": 8.260394724399043e-06, "loss": 0.00037547945976257324, "memory(GiB)": 39.01, "reward": 0.2807910442352295, "reward_std": 0.04162972420454025, "rewards/VisualizationJSONCombinedORM/mean": 0.2807910442352295, "rewards/VisualizationJSONCombinedORM/std": 0.045947086066007614, "step": 419, "train_speed(iter/s)": 0.028043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 308.8125, "completions/min_length": 262.0, "epoch": 0.34739454094292804, "grad_norm": 0.21680127084255219, "kl": 0.04608154296875, "learning_rate": 8.24943536713521e-06, "loss": 0.0004611015319824219, "memory(GiB)": 39.01, "reward": 0.6572363376617432, "reward_std": 0.07253777980804443, "rewards/VisualizationJSONCombinedORM/mean": 0.6572363376617432, "rewards/VisualizationJSONCombinedORM/std": 0.1306181401014328, "step": 420, "train_speed(iter/s)": 0.028061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 311.6875, "completions/min_length": 255.0, "epoch": 0.348221670802316, "grad_norm": 0.16487586498260498, "kl": 0.0557861328125, "learning_rate": 8.23844891734181e-06, "loss": 0.0005574002861976624, "memory(GiB)": 39.01, "reward": 0.5531156063079834, "reward_std": 0.12409183382987976, "rewards/VisualizationJSONCombinedORM/mean": 0.5531156063079834, "rewards/VisualizationJSONCombinedORM/std": 0.13091357052326202, "step": 421, "train_speed(iter/s)": 0.028085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 305.3125, "completions/min_length": 248.0, "epoch": 0.3490488006617039, "grad_norm": 0.19655796885490417, "kl": 0.066650390625, "learning_rate": 8.227435466619596e-06, "loss": 0.0006668791174888611, "memory(GiB)": 39.01, "reward": 0.5504978895187378, "reward_std": 0.11604280769824982, "rewards/VisualizationJSONCombinedORM/mean": 0.5504978895187378, "rewards/VisualizationJSONCombinedORM/std": 0.1742561310529709, "step": 422, "train_speed(iter/s)": 0.028105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 294.3125, "completions/min_length": 252.0, "epoch": 0.34987593052109184, "grad_norm": 0.19459128379821777, "kl": 0.0531005859375, "learning_rate": 8.216395106794437e-06, "loss": 0.0005319155752658844, "memory(GiB)": 39.01, "reward": 0.5988582968711853, "reward_std": 0.13922427594661713, "rewards/VisualizationJSONCombinedORM/mean": 0.5988582968711853, "rewards/VisualizationJSONCombinedORM/std": 0.14195364713668823, "step": 423, "train_speed(iter/s)": 0.028134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 285.9375, "completions/min_length": 228.0, "epoch": 0.3507030603804797, "grad_norm": 0.18878862261772156, "kl": 0.04229736328125, "learning_rate": 8.20532792991657e-06, "loss": 0.00042324140667915344, "memory(GiB)": 39.01, "reward": 0.5451532006263733, "reward_std": 0.13000090420246124, "rewards/VisualizationJSONCombinedORM/mean": 0.5451532006263733, "rewards/VisualizationJSONCombinedORM/std": 0.12823060154914856, "step": 424, "train_speed(iter/s)": 0.02816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 291.75, "completions/min_length": 248.0, "epoch": 0.35153019023986765, "grad_norm": 0.1819080263376236, "kl": 0.0521240234375, "learning_rate": 8.194234028259806e-06, "loss": 0.0005211643874645233, "memory(GiB)": 39.01, "reward": 0.5868995189666748, "reward_std": 0.1199808120727539, "rewards/VisualizationJSONCombinedORM/mean": 0.5868995189666748, "rewards/VisualizationJSONCombinedORM/std": 0.15055005252361298, "step": 425, "train_speed(iter/s)": 0.02818 }, { "epoch": 0.35153019023986765, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 340.8333333333333, "eval_completions/mean_length": 294.140625, "eval_completions/min_length": 246.41666666666666, "eval_kl": 0.049997965494791664, "eval_loss": 0.0005022299592383206, "eval_reward": 0.4680954683572054, "eval_reward_std": 0.092122925290217, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4680954683572054, "eval_rewards/VisualizationJSONCombinedORM/std": 0.09212292986921966, "eval_runtime": 295.8014, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 265.4375, "completions/min_length": 212.0, "epoch": 0.3523573200992556, "grad_norm": 0.20417137444019318, "kl": 0.05364990234375, "learning_rate": 8.183113494320796e-06, "loss": 0.0005366615951061249, "memory(GiB)": 39.01, "reward": 0.6020158529281616, "reward_std": 0.14204677939414978, "rewards/VisualizationJSONCombinedORM/mean": 0.6020158529281616, "rewards/VisualizationJSONCombinedORM/std": 0.16333253681659698, "step": 426, "train_speed(iter/s)": 0.027663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 286.4375, "completions/min_length": 238.0, "epoch": 0.3531844499586435, "grad_norm": 0.21363112330436707, "kl": 0.05218505859375, "learning_rate": 8.171966420818227e-06, "loss": 0.0005212202668190002, "memory(GiB)": 39.01, "reward": 0.4704153537750244, "reward_std": 0.13773471117019653, "rewards/VisualizationJSONCombinedORM/mean": 0.4704153537750244, "rewards/VisualizationJSONCombinedORM/std": 0.16651427745819092, "step": 427, "train_speed(iter/s)": 0.027676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 290.0, "completions/min_length": 221.0, "epoch": 0.35401157981803144, "grad_norm": 0.19389821588993073, "kl": 0.0572509765625, "learning_rate": 8.160792900692069e-06, "loss": 0.0005734190344810486, "memory(GiB)": 39.01, "reward": 0.44315746426582336, "reward_std": 0.09312878549098969, "rewards/VisualizationJSONCombinedORM/mean": 0.44315746426582336, "rewards/VisualizationJSONCombinedORM/std": 0.2709684669971466, "step": 428, "train_speed(iter/s)": 0.027693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 305.5, "completions/min_length": 249.0, "epoch": 0.3548387096774194, "grad_norm": 0.21802011132240295, "kl": 0.04315185546875, "learning_rate": 8.14959302710279e-06, "loss": 0.0004313662648200989, "memory(GiB)": 39.01, "reward": 0.409846693277359, "reward_std": 0.06159058213233948, "rewards/VisualizationJSONCombinedORM/mean": 0.409846693277359, "rewards/VisualizationJSONCombinedORM/std": 0.20277376472949982, "step": 429, "train_speed(iter/s)": 0.027709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 299.0625, "completions/min_length": 249.0, "epoch": 0.3556658395368073, "grad_norm": 0.21342702209949493, "kl": 0.06414794921875, "learning_rate": 8.138366893430583e-06, "loss": 0.0006414204835891724, "memory(GiB)": 39.01, "reward": 0.6201022863388062, "reward_std": 0.1170203685760498, "rewards/VisualizationJSONCombinedORM/mean": 0.6201022863388062, "rewards/VisualizationJSONCombinedORM/std": 0.12142311036586761, "step": 430, "train_speed(iter/s)": 0.027726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 280.6875, "completions/min_length": 207.0, "epoch": 0.3564929693961952, "grad_norm": 0.21938765048980713, "kl": 0.04974365234375, "learning_rate": 8.12711459327459e-06, "loss": 0.000497967004776001, "memory(GiB)": 39.01, "reward": 0.5164976119995117, "reward_std": 0.08004589378833771, "rewards/VisualizationJSONCombinedORM/mean": 0.5164976119995117, "rewards/VisualizationJSONCombinedORM/std": 0.2478557527065277, "step": 431, "train_speed(iter/s)": 0.027748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 298.5, "completions/min_length": 240.0, "epoch": 0.3573200992555831, "grad_norm": 0.18752911686897278, "kl": 0.04498291015625, "learning_rate": 8.115836220452118e-06, "loss": 0.00045064836740493774, "memory(GiB)": 39.01, "reward": 0.6030187010765076, "reward_std": 0.10761527717113495, "rewards/VisualizationJSONCombinedORM/mean": 0.6030187010765076, "rewards/VisualizationJSONCombinedORM/std": 0.16611407697200775, "step": 432, "train_speed(iter/s)": 0.027762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 278.625, "completions/min_length": 244.0, "epoch": 0.35814722911497104, "grad_norm": 0.20687907934188843, "kl": 0.039794921875, "learning_rate": 8.104531868997858e-06, "loss": 0.0003981739282608032, "memory(GiB)": 39.01, "reward": 0.49507325887680054, "reward_std": 0.07114537805318832, "rewards/VisualizationJSONCombinedORM/mean": 0.49507325887680054, "rewards/VisualizationJSONCombinedORM/std": 0.08850686252117157, "step": 433, "train_speed(iter/s)": 0.027784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 292.25, "completions/min_length": 215.0, "epoch": 0.358974358974359, "grad_norm": 0.19320321083068848, "kl": 0.0582275390625, "learning_rate": 8.0932016331631e-06, "loss": 0.0005817152559757233, "memory(GiB)": 39.01, "reward": 0.45830559730529785, "reward_std": 0.10720944404602051, "rewards/VisualizationJSONCombinedORM/mean": 0.45830559730529785, "rewards/VisualizationJSONCombinedORM/std": 0.1400323510169983, "step": 434, "train_speed(iter/s)": 0.027804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 258.3125, "completions/min_length": 213.0, "epoch": 0.3598014888337469, "grad_norm": 0.16509594023227692, "kl": 0.03466796875, "learning_rate": 8.081845607414947e-06, "loss": 0.00034724175930023193, "memory(GiB)": 39.01, "reward": 0.7169881463050842, "reward_std": 0.10678081214427948, "rewards/VisualizationJSONCombinedORM/mean": 0.7169881463050842, "rewards/VisualizationJSONCombinedORM/std": 0.1410674899816513, "step": 435, "train_speed(iter/s)": 0.027827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 284.125, "completions/min_length": 237.0, "epoch": 0.36062861869313484, "grad_norm": 0.21449516713619232, "kl": 0.056396484375, "learning_rate": 8.07046388643553e-06, "loss": 0.0005641058087348938, "memory(GiB)": 39.01, "reward": 0.4995792806148529, "reward_std": 0.16038653254508972, "rewards/VisualizationJSONCombinedORM/mean": 0.4995792806148529, "rewards/VisualizationJSONCombinedORM/std": 0.21168963611125946, "step": 436, "train_speed(iter/s)": 0.02784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 307.375, "completions/min_length": 232.0, "epoch": 0.36145574855252277, "grad_norm": 0.18688441812992096, "kl": 0.04522705078125, "learning_rate": 8.059056565121218e-06, "loss": 0.00045134127140045166, "memory(GiB)": 39.01, "reward": 0.5699172019958496, "reward_std": 0.12732625007629395, "rewards/VisualizationJSONCombinedORM/mean": 0.5699172019958496, "rewards/VisualizationJSONCombinedORM/std": 0.18329139053821564, "step": 437, "train_speed(iter/s)": 0.027862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 311.5, "completions/min_length": 246.0, "epoch": 0.36228287841191065, "grad_norm": 0.17471763491630554, "kl": 0.04412841796875, "learning_rate": 8.047623738581822e-06, "loss": 0.00044049695134162903, "memory(GiB)": 39.01, "reward": 0.6560647487640381, "reward_std": 0.06782905012369156, "rewards/VisualizationJSONCombinedORM/mean": 0.6560647487640381, "rewards/VisualizationJSONCombinedORM/std": 0.1246759444475174, "step": 438, "train_speed(iter/s)": 0.027883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 276.375, "completions/min_length": 227.0, "epoch": 0.3631100082712986, "grad_norm": 0.18076357245445251, "kl": 0.04608154296875, "learning_rate": 8.036165502139809e-06, "loss": 0.0004610475152730942, "memory(GiB)": 39.01, "reward": 0.6717841029167175, "reward_std": 0.10188625752925873, "rewards/VisualizationJSONCombinedORM/mean": 0.6717841029167175, "rewards/VisualizationJSONCombinedORM/std": 0.10276743769645691, "step": 439, "train_speed(iter/s)": 0.027905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 318.4375, "completions/min_length": 232.0, "epoch": 0.3639371381306865, "grad_norm": 0.18913304805755615, "kl": 0.0487060546875, "learning_rate": 8.0246819513295e-06, "loss": 0.00048595480620861053, "memory(GiB)": 39.01, "reward": 0.4736155569553375, "reward_std": 0.08460758626461029, "rewards/VisualizationJSONCombinedORM/mean": 0.4736155569553375, "rewards/VisualizationJSONCombinedORM/std": 0.1193925067782402, "step": 440, "train_speed(iter/s)": 0.027911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 267.8125, "completions/min_length": 230.0, "epoch": 0.36476426799007444, "grad_norm": 0.1777970790863037, "kl": 0.033935546875, "learning_rate": 8.013173181896283e-06, "loss": 0.00033943355083465576, "memory(GiB)": 39.01, "reward": 0.6456952095031738, "reward_std": 0.10094152390956879, "rewards/VisualizationJSONCombinedORM/mean": 0.6456952095031738, "rewards/VisualizationJSONCombinedORM/std": 0.11926859617233276, "step": 441, "train_speed(iter/s)": 0.02794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 266.75, "completions/min_length": 223.0, "epoch": 0.3655913978494624, "grad_norm": 0.19274084270000458, "kl": 0.04071044921875, "learning_rate": 8.001639289795804e-06, "loss": 0.00040734559297561646, "memory(GiB)": 39.01, "reward": 0.4650154113769531, "reward_std": 0.07658510655164719, "rewards/VisualizationJSONCombinedORM/mean": 0.4650154113769531, "rewards/VisualizationJSONCombinedORM/std": 0.30067968368530273, "step": 442, "train_speed(iter/s)": 0.02796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 296.8125, "completions/min_length": 237.0, "epoch": 0.3664185277088503, "grad_norm": 0.22097347676753998, "kl": 0.05755615234375, "learning_rate": 7.990080371193175e-06, "loss": 0.0005768612027168274, "memory(GiB)": 39.01, "reward": 0.482846736907959, "reward_std": 0.09219586849212646, "rewards/VisualizationJSONCombinedORM/mean": 0.482846736907959, "rewards/VisualizationJSONCombinedORM/std": 0.22256386280059814, "step": 443, "train_speed(iter/s)": 0.027979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 297.625, "completions/min_length": 240.0, "epoch": 0.36724565756823824, "grad_norm": 0.20647190511226654, "kl": 0.05145263671875, "learning_rate": 7.978496522462167e-06, "loss": 0.0005130395293235779, "memory(GiB)": 39.01, "reward": 0.4685795307159424, "reward_std": 0.08279760181903839, "rewards/VisualizationJSONCombinedORM/mean": 0.4685795307159424, "rewards/VisualizationJSONCombinedORM/std": 0.26654794812202454, "step": 444, "train_speed(iter/s)": 0.027999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 291.1875, "completions/min_length": 232.0, "epoch": 0.3680727874276261, "grad_norm": 0.19873455166816711, "kl": 0.060302734375, "learning_rate": 7.966887840184412e-06, "loss": 0.0006036087870597839, "memory(GiB)": 39.01, "reward": 0.43672266602516174, "reward_std": 0.10929945111274719, "rewards/VisualizationJSONCombinedORM/mean": 0.43672266602516174, "rewards/VisualizationJSONCombinedORM/std": 0.14250901341438293, "step": 445, "train_speed(iter/s)": 0.028009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 310.6875, "completions/min_length": 248.0, "epoch": 0.36889991728701405, "grad_norm": 0.1695721447467804, "kl": 0.0509033203125, "learning_rate": 7.95525442114859e-06, "loss": 0.0005096346139907837, "memory(GiB)": 39.01, "reward": 0.37710925936698914, "reward_std": 0.0345412977039814, "rewards/VisualizationJSONCombinedORM/mean": 0.37710925936698914, "rewards/VisualizationJSONCombinedORM/std": 0.05505146458745003, "step": 446, "train_speed(iter/s)": 0.028027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 291.0625, "completions/min_length": 225.0, "epoch": 0.369727047146402, "grad_norm": 0.1789015680551529, "kl": 0.0550537109375, "learning_rate": 7.943596362349631e-06, "loss": 0.0005491860210895538, "memory(GiB)": 39.01, "reward": 0.5854508280754089, "reward_std": 0.06458783149719238, "rewards/VisualizationJSONCombinedORM/mean": 0.5854508280754089, "rewards/VisualizationJSONCombinedORM/std": 0.22327472269535065, "step": 447, "train_speed(iter/s)": 0.028033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 324.6875, "completions/min_length": 247.0, "epoch": 0.3705541770057899, "grad_norm": 0.19607840478420258, "kl": 0.06219482421875, "learning_rate": 7.9319137609879e-06, "loss": 0.0006216764450073242, "memory(GiB)": 39.01, "reward": 0.5953760743141174, "reward_std": 0.16597619652748108, "rewards/VisualizationJSONCombinedORM/mean": 0.5953760743141174, "rewards/VisualizationJSONCombinedORM/std": 0.18057799339294434, "step": 448, "train_speed(iter/s)": 0.028048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 291.0625, "completions/min_length": 238.0, "epoch": 0.37138130686517784, "grad_norm": 0.1858367919921875, "kl": 0.057861328125, "learning_rate": 7.920206714468383e-06, "loss": 0.0005792966112494469, "memory(GiB)": 39.01, "reward": 0.5959460735321045, "reward_std": 0.11022096872329712, "rewards/VisualizationJSONCombinedORM/mean": 0.5959460735321045, "rewards/VisualizationJSONCombinedORM/std": 0.12027814239263535, "step": 449, "train_speed(iter/s)": 0.028063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 304.75, "completions/min_length": 265.0, "epoch": 0.37220843672456577, "grad_norm": 0.18776267766952515, "kl": 0.05877685546875, "learning_rate": 7.908475320399893e-06, "loss": 0.0005885884165763855, "memory(GiB)": 39.01, "reward": 0.4114828407764435, "reward_std": 0.1026100292801857, "rewards/VisualizationJSONCombinedORM/mean": 0.4114828407764435, "rewards/VisualizationJSONCombinedORM/std": 0.2304178774356842, "step": 450, "train_speed(iter/s)": 0.028077 }, { "epoch": 0.37220843672456577, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 356.5, "eval_completions/mean_length": 298.1510416666667, "eval_completions/min_length": 252.29166666666666, "eval_kl": 0.054962158203125, "eval_loss": 0.0005521625280380249, "eval_reward": 0.4633769715825717, "eval_reward_std": 0.08866516066094239, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4633769715825717, "eval_rewards/VisualizationJSONCombinedORM/std": 0.08866516198031604, "eval_runtime": 305.7727, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 316.3125, "completions/min_length": 274.0, "epoch": 0.3730355665839537, "grad_norm": 0.19960452616214752, "kl": 0.066162109375, "learning_rate": 7.89671967659423e-06, "loss": 0.0006631764117628336, "memory(GiB)": 39.01, "reward": 0.3441133499145508, "reward_std": 0.08466705679893494, "rewards/VisualizationJSONCombinedORM/mean": 0.3441133499145508, "rewards/VisualizationJSONCombinedORM/std": 0.14835339784622192, "step": 451, "train_speed(iter/s)": 0.027577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 336.25, "completions/min_length": 278.0, "epoch": 0.3738626964433416, "grad_norm": 0.2101266086101532, "kl": 0.07000732421875, "learning_rate": 7.884939881065387e-06, "loss": 0.0007010176777839661, "memory(GiB)": 39.01, "reward": 0.3176151514053345, "reward_std": 0.06963469088077545, "rewards/VisualizationJSONCombinedORM/mean": 0.3176151514053345, "rewards/VisualizationJSONCombinedORM/std": 0.08664368093013763, "step": 452, "train_speed(iter/s)": 0.0276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 284.25, "completions/min_length": 226.0, "epoch": 0.3746898263027295, "grad_norm": 0.2009240686893463, "kl": 0.0418701171875, "learning_rate": 7.873136032028719e-06, "loss": 0.00041847676038742065, "memory(GiB)": 39.01, "reward": 0.4256020784378052, "reward_std": 0.0723477303981781, "rewards/VisualizationJSONCombinedORM/mean": 0.4256020784378052, "rewards/VisualizationJSONCombinedORM/std": 0.09183034300804138, "step": 453, "train_speed(iter/s)": 0.027612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 311.125, "completions/min_length": 261.0, "epoch": 0.37551695616211744, "grad_norm": 0.1881980299949646, "kl": 0.066162109375, "learning_rate": 7.86130822790014e-06, "loss": 0.0006603486835956573, "memory(GiB)": 39.01, "reward": 0.32874664664268494, "reward_std": 0.07137586921453476, "rewards/VisualizationJSONCombinedORM/mean": 0.32874664664268494, "rewards/VisualizationJSONCombinedORM/std": 0.07163124531507492, "step": 454, "train_speed(iter/s)": 0.027638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 308.875, "completions/min_length": 233.0, "epoch": 0.3763440860215054, "grad_norm": 0.1773962676525116, "kl": 0.0469970703125, "learning_rate": 7.849456567295276e-06, "loss": 0.0004704520106315613, "memory(GiB)": 39.01, "reward": 0.5266328454017639, "reward_std": 0.09633839875459671, "rewards/VisualizationJSONCombinedORM/mean": 0.5266328454017639, "rewards/VisualizationJSONCombinedORM/std": 0.11059942841529846, "step": 455, "train_speed(iter/s)": 0.027656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 312.25, "completions/min_length": 242.0, "epoch": 0.3771712158808933, "grad_norm": 0.16264654695987701, "kl": 0.06561279296875, "learning_rate": 7.837581149028677e-06, "loss": 0.0006566531956195831, "memory(GiB)": 39.01, "reward": 0.7253429889678955, "reward_std": 0.09546156227588654, "rewards/VisualizationJSONCombinedORM/mean": 0.7253429889678955, "rewards/VisualizationJSONCombinedORM/std": 0.09257785975933075, "step": 456, "train_speed(iter/s)": 0.027672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 311.875, "completions/min_length": 239.0, "epoch": 0.37799834574028124, "grad_norm": 0.19676196575164795, "kl": 0.0595703125, "learning_rate": 7.82568207211296e-06, "loss": 0.0005962718278169632, "memory(GiB)": 39.01, "reward": 0.5552322268486023, "reward_std": 0.10998561978340149, "rewards/VisualizationJSONCombinedORM/mean": 0.5552322268486023, "rewards/VisualizationJSONCombinedORM/std": 0.11616112291812897, "step": 457, "train_speed(iter/s)": 0.027691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 328.75, "completions/min_length": 251.0, "epoch": 0.37882547559966917, "grad_norm": 0.19852443039417267, "kl": 0.05694580078125, "learning_rate": 7.81375943575801e-06, "loss": 0.0005696713924407959, "memory(GiB)": 39.01, "reward": 0.6958416700363159, "reward_std": 0.09195326268672943, "rewards/VisualizationJSONCombinedORM/mean": 0.6958416700363159, "rewards/VisualizationJSONCombinedORM/std": 0.10722694545984268, "step": 458, "train_speed(iter/s)": 0.02771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 317.625, "completions/min_length": 262.0, "epoch": 0.37965260545905705, "grad_norm": 0.2071291208267212, "kl": 0.0435791015625, "learning_rate": 7.801813339370133e-06, "loss": 0.00043670833110809326, "memory(GiB)": 39.01, "reward": 0.6032734513282776, "reward_std": 0.07038868963718414, "rewards/VisualizationJSONCombinedORM/mean": 0.6032734513282776, "rewards/VisualizationJSONCombinedORM/std": 0.08792231231927872, "step": 459, "train_speed(iter/s)": 0.02773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 299.3125, "completions/min_length": 243.0, "epoch": 0.380479735318445, "grad_norm": 0.17889460921287537, "kl": 0.07366943359375, "learning_rate": 7.78984388255124e-06, "loss": 0.0007359236478805542, "memory(GiB)": 39.01, "reward": 0.496249794960022, "reward_std": 0.07679080963134766, "rewards/VisualizationJSONCombinedORM/mean": 0.496249794960022, "rewards/VisualizationJSONCombinedORM/std": 0.20429036021232605, "step": 460, "train_speed(iter/s)": 0.027743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 352.125, "completions/min_length": 311.0, "epoch": 0.3813068651778329, "grad_norm": 0.18797406554222107, "kl": 0.061767578125, "learning_rate": 7.777851165098012e-06, "loss": 0.0006179846823215485, "memory(GiB)": 39.01, "reward": 0.4167826175689697, "reward_std": 0.1199934259057045, "rewards/VisualizationJSONCombinedORM/mean": 0.4167826175689697, "rewards/VisualizationJSONCombinedORM/std": 0.1409900039434433, "step": 461, "train_speed(iter/s)": 0.027758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 321.25, "completions/min_length": 250.0, "epoch": 0.38213399503722084, "grad_norm": 0.17972131073474884, "kl": 0.08294677734375, "learning_rate": 7.765835287001068e-06, "loss": 0.0008281879127025604, "memory(GiB)": 39.01, "reward": 0.39016032218933105, "reward_std": 0.05356743931770325, "rewards/VisualizationJSONCombinedORM/mean": 0.39016032218933105, "rewards/VisualizationJSONCombinedORM/std": 0.05271650105714798, "step": 462, "train_speed(iter/s)": 0.027774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 320.5625, "completions/min_length": 213.0, "epoch": 0.3829611248966088, "grad_norm": 0.2086309790611267, "kl": 0.1004638671875, "learning_rate": 7.753796348444129e-06, "loss": 0.0010074451565742493, "memory(GiB)": 39.01, "reward": 0.52719646692276, "reward_std": 0.09222692251205444, "rewards/VisualizationJSONCombinedORM/mean": 0.52719646692276, "rewards/VisualizationJSONCombinedORM/std": 0.1125728115439415, "step": 463, "train_speed(iter/s)": 0.027792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 327.8125, "completions/min_length": 274.0, "epoch": 0.3837882547559967, "grad_norm": 0.1928734928369522, "kl": 0.0780029296875, "learning_rate": 7.74173444980319e-06, "loss": 0.0007788483053445816, "memory(GiB)": 39.01, "reward": 0.4812494218349457, "reward_std": 0.049105387181043625, "rewards/VisualizationJSONCombinedORM/mean": 0.4812494218349457, "rewards/VisualizationJSONCombinedORM/std": 0.1526683121919632, "step": 464, "train_speed(iter/s)": 0.027809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 305.625, "completions/min_length": 243.0, "epoch": 0.38461538461538464, "grad_norm": 0.199244424700737, "kl": 0.058349609375, "learning_rate": 7.729649691645673e-06, "loss": 0.0005831755697727203, "memory(GiB)": 39.01, "reward": 0.488530695438385, "reward_std": 0.06298121064901352, "rewards/VisualizationJSONCombinedORM/mean": 0.488530695438385, "rewards/VisualizationJSONCombinedORM/std": 0.18468087911605835, "step": 465, "train_speed(iter/s)": 0.027829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 316.75, "completions/min_length": 237.0, "epoch": 0.3854425144747725, "grad_norm": 0.20910237729549408, "kl": 0.119384765625, "learning_rate": 7.717542174729597e-06, "loss": 0.0011934563517570496, "memory(GiB)": 39.01, "reward": 0.42111968994140625, "reward_std": 0.03668757528066635, "rewards/VisualizationJSONCombinedORM/mean": 0.42111968994140625, "rewards/VisualizationJSONCombinedORM/std": 0.13712169229984283, "step": 466, "train_speed(iter/s)": 0.027847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 331.9375, "completions/min_length": 279.0, "epoch": 0.38626964433416044, "grad_norm": 0.18577316403388977, "kl": 0.0648193359375, "learning_rate": 7.705412000002735e-06, "loss": 0.0006489306688308716, "memory(GiB)": 39.01, "reward": 0.5906088352203369, "reward_std": 0.14570313692092896, "rewards/VisualizationJSONCombinedORM/mean": 0.5906088352203369, "rewards/VisualizationJSONCombinedORM/std": 0.16738037765026093, "step": 467, "train_speed(iter/s)": 0.02786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 294.5625, "completions/min_length": 218.0, "epoch": 0.3870967741935484, "grad_norm": 0.180439293384552, "kl": 0.04583740234375, "learning_rate": 7.693259268601767e-06, "loss": 0.00045778602361679077, "memory(GiB)": 39.01, "reward": 0.6925169825553894, "reward_std": 0.1004219651222229, "rewards/VisualizationJSONCombinedORM/mean": 0.6925169825553894, "rewards/VisualizationJSONCombinedORM/std": 0.11592266708612442, "step": 468, "train_speed(iter/s)": 0.027881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 300.75, "completions/min_length": 252.0, "epoch": 0.3879239040529363, "grad_norm": 0.2124636173248291, "kl": 0.07421875, "learning_rate": 7.68108408185145e-06, "loss": 0.0007419213652610779, "memory(GiB)": 39.01, "reward": 0.36867523193359375, "reward_std": 0.08813057839870453, "rewards/VisualizationJSONCombinedORM/mean": 0.36867523193359375, "rewards/VisualizationJSONCombinedORM/std": 0.13624049723148346, "step": 469, "train_speed(iter/s)": 0.027904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 302.9375, "completions/min_length": 230.0, "epoch": 0.38875103391232424, "grad_norm": 0.20179712772369385, "kl": 0.0537109375, "learning_rate": 7.668886541263757e-06, "loss": 0.000537484884262085, "memory(GiB)": 39.01, "reward": 0.6720738410949707, "reward_std": 0.10454420745372772, "rewards/VisualizationJSONCombinedORM/mean": 0.6720738410949707, "rewards/VisualizationJSONCombinedORM/std": 0.10279063880443573, "step": 470, "train_speed(iter/s)": 0.027919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 332.75, "completions/min_length": 256.0, "epoch": 0.38957816377171217, "grad_norm": 0.18428190052509308, "kl": 0.0628662109375, "learning_rate": 7.656666748537047e-06, "loss": 0.0006282851099967957, "memory(GiB)": 39.01, "reward": 0.36098554730415344, "reward_std": 0.06182652339339256, "rewards/VisualizationJSONCombinedORM/mean": 0.36098554730415344, "rewards/VisualizationJSONCombinedORM/std": 0.17122061550617218, "step": 471, "train_speed(iter/s)": 0.027941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 333.25, "completions/min_length": 270.0, "epoch": 0.3904052936311001, "grad_norm": 0.17562192678451538, "kl": 0.0823974609375, "learning_rate": 7.644424805555199e-06, "loss": 0.0008227452635765076, "memory(GiB)": 39.01, "reward": 0.423114538192749, "reward_std": 0.08823642879724503, "rewards/VisualizationJSONCombinedORM/mean": 0.423114538192749, "rewards/VisualizationJSONCombinedORM/std": 0.1394711434841156, "step": 472, "train_speed(iter/s)": 0.027948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 276.3125, "completions/min_length": 207.0, "epoch": 0.391232423490488, "grad_norm": 0.17545130848884583, "kl": 0.066650390625, "learning_rate": 7.63216081438678e-06, "loss": 0.0006659552454948425, "memory(GiB)": 39.01, "reward": 0.5722158551216125, "reward_std": 0.08099034428596497, "rewards/VisualizationJSONCombinedORM/mean": 0.5722158551216125, "rewards/VisualizationJSONCombinedORM/std": 0.15373186767101288, "step": 473, "train_speed(iter/s)": 0.027966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 318.875, "completions/min_length": 271.0, "epoch": 0.3920595533498759, "grad_norm": 0.17991267144680023, "kl": 0.06964111328125, "learning_rate": 7.619874877284181e-06, "loss": 0.0006958469748497009, "memory(GiB)": 39.01, "reward": 0.40347084403038025, "reward_std": 0.10544557869434357, "rewards/VisualizationJSONCombinedORM/mean": 0.40347084403038025, "rewards/VisualizationJSONCombinedORM/std": 0.11104093492031097, "step": 474, "train_speed(iter/s)": 0.02799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 314.1875, "completions/min_length": 259.0, "epoch": 0.39288668320926384, "grad_norm": 0.20659835636615753, "kl": 0.089111328125, "learning_rate": 7.607567096682775e-06, "loss": 0.0008911974728107452, "memory(GiB)": 39.01, "reward": 0.3817349672317505, "reward_std": 0.09030750393867493, "rewards/VisualizationJSONCombinedORM/mean": 0.3817349672317505, "rewards/VisualizationJSONCombinedORM/std": 0.1816175878047943, "step": 475, "train_speed(iter/s)": 0.02801 }, { "epoch": 0.39288668320926384, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 375.9583333333333, "eval_completions/mean_length": 313.703125, "eval_completions/min_length": 263.375, "eval_kl": 0.076324462890625, "eval_loss": 0.0007685453747399151, "eval_reward": 0.47927001553277176, "eval_reward_std": 0.08758855378255248, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47927001553277176, "eval_rewards/VisualizationJSONCombinedORM/std": 0.08758855805111428, "eval_runtime": 317.4084, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 300.6875, "completions/min_length": 238.0, "epoch": 0.3937138130686518, "grad_norm": 0.27307695150375366, "kl": 0.0787353515625, "learning_rate": 7.595237575200053e-06, "loss": 0.0007874146103858948, "memory(GiB)": 39.01, "reward": 0.6007264256477356, "reward_std": 0.09238812327384949, "rewards/VisualizationJSONCombinedORM/mean": 0.6007264256477356, "rewards/VisualizationJSONCombinedORM/std": 0.10806584358215332, "step": 476, "train_speed(iter/s)": 0.027514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 307.75, "completions/min_length": 262.0, "epoch": 0.3945409429280397, "grad_norm": 0.17819392681121826, "kl": 0.0599365234375, "learning_rate": 7.5828864156347735e-06, "loss": 0.0005986951291561127, "memory(GiB)": 39.01, "reward": 0.6046620607376099, "reward_std": 0.11268135905265808, "rewards/VisualizationJSONCombinedORM/mean": 0.6046620607376099, "rewards/VisualizationJSONCombinedORM/std": 0.11886685341596603, "step": 477, "train_speed(iter/s)": 0.027528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 288.5625, "completions/min_length": 217.0, "epoch": 0.39536807278742764, "grad_norm": 0.22807136178016663, "kl": 0.06927490234375, "learning_rate": 7.570513720966108e-06, "loss": 0.0006945282220840454, "memory(GiB)": 39.01, "reward": 0.6182816624641418, "reward_std": 0.12289685010910034, "rewards/VisualizationJSONCombinedORM/mean": 0.6182816624641418, "rewards/VisualizationJSONCombinedORM/std": 0.12255778908729553, "step": 478, "train_speed(iter/s)": 0.027541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 324.9375, "completions/min_length": 276.0, "epoch": 0.39619520264681557, "grad_norm": 0.19047972559928894, "kl": 0.073486328125, "learning_rate": 7.5581195943527785e-06, "loss": 0.000734422355890274, "memory(GiB)": 39.01, "reward": 0.5857144594192505, "reward_std": 0.0848580002784729, "rewards/VisualizationJSONCombinedORM/mean": 0.5857144594192505, "rewards/VisualizationJSONCombinedORM/std": 0.15467898547649384, "step": 479, "train_speed(iter/s)": 0.027554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 334.4375, "completions/min_length": 268.0, "epoch": 0.3970223325062035, "grad_norm": 0.19329231977462769, "kl": 0.091796875, "learning_rate": 7.545704139132194e-06, "loss": 0.0009187068790197372, "memory(GiB)": 39.01, "reward": 0.6382875442504883, "reward_std": 0.1400892585515976, "rewards/VisualizationJSONCombinedORM/mean": 0.6382875442504883, "rewards/VisualizationJSONCombinedORM/std": 0.14246784150600433, "step": 480, "train_speed(iter/s)": 0.027564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 277.875, "completions/min_length": 232.0, "epoch": 0.3978494623655914, "grad_norm": 0.23383265733718872, "kl": 0.0762939453125, "learning_rate": 7.533267458819597e-06, "loss": 0.0007626134902238846, "memory(GiB)": 39.01, "reward": 0.4081999659538269, "reward_std": 0.1050296351313591, "rewards/VisualizationJSONCombinedORM/mean": 0.4081999659538269, "rewards/VisualizationJSONCombinedORM/std": 0.27577605843544006, "step": 481, "train_speed(iter/s)": 0.027586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 311.5, "completions/min_length": 240.0, "epoch": 0.3986765922249793, "grad_norm": 0.18737143278121948, "kl": 0.05389404296875, "learning_rate": 7.520809657107198e-06, "loss": 0.0005390942096710205, "memory(GiB)": 39.01, "reward": 0.36676424741744995, "reward_std": 0.1336137056350708, "rewards/VisualizationJSONCombinedORM/mean": 0.36676424741744995, "rewards/VisualizationJSONCombinedORM/std": 0.13137361407279968, "step": 482, "train_speed(iter/s)": 0.027603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 337.9375, "completions/min_length": 221.0, "epoch": 0.39950372208436724, "grad_norm": 0.21627895534038544, "kl": 0.05596923828125, "learning_rate": 7.508330837863305e-06, "loss": 0.0005601570010185242, "memory(GiB)": 39.01, "reward": 0.5587278604507446, "reward_std": 0.1736736297607422, "rewards/VisualizationJSONCombinedORM/mean": 0.5587278604507446, "rewards/VisualizationJSONCombinedORM/std": 0.18812507390975952, "step": 483, "train_speed(iter/s)": 0.027621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 330.625, "completions/min_length": 265.0, "epoch": 0.4003308519437552, "grad_norm": 0.18441464006900787, "kl": 0.0618896484375, "learning_rate": 7.4958311051314645e-06, "loss": 0.0006182491779327393, "memory(GiB)": 39.01, "reward": 0.6053735017776489, "reward_std": 0.11055776476860046, "rewards/VisualizationJSONCombinedORM/mean": 0.6053735017776489, "rewards/VisualizationJSONCombinedORM/std": 0.11749795824289322, "step": 484, "train_speed(iter/s)": 0.027638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 302.8125, "completions/min_length": 251.0, "epoch": 0.4011579818031431, "grad_norm": 0.1871975064277649, "kl": 0.10333251953125, "learning_rate": 7.483310563129592e-06, "loss": 0.001033572480082512, "memory(GiB)": 39.01, "reward": 0.5715786814689636, "reward_std": 0.1149250864982605, "rewards/VisualizationJSONCombinedORM/mean": 0.5715786814689636, "rewards/VisualizationJSONCombinedORM/std": 0.14940813183784485, "step": 485, "train_speed(iter/s)": 0.027657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 290.5, "completions/min_length": 225.0, "epoch": 0.40198511166253104, "grad_norm": 0.18403328955173492, "kl": 0.04766845703125, "learning_rate": 7.470769316249102e-06, "loss": 0.0004765167832374573, "memory(GiB)": 39.01, "reward": 0.5156192779541016, "reward_std": 0.18585029244422913, "rewards/VisualizationJSONCombinedORM/mean": 0.5156192779541016, "rewards/VisualizationJSONCombinedORM/std": 0.19520601630210876, "step": 486, "train_speed(iter/s)": 0.027676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 325.3125, "completions/min_length": 264.0, "epoch": 0.40281224152191897, "grad_norm": 0.19144362211227417, "kl": 0.081298828125, "learning_rate": 7.45820746905404e-06, "loss": 0.000811481848359108, "memory(GiB)": 39.01, "reward": 0.5152961611747742, "reward_std": 0.10404014587402344, "rewards/VisualizationJSONCombinedORM/mean": 0.5152961611747742, "rewards/VisualizationJSONCombinedORM/std": 0.14721670746803284, "step": 487, "train_speed(iter/s)": 0.02769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 307.5, "completions/min_length": 250.0, "epoch": 0.40363937138130684, "grad_norm": 0.18809223175048828, "kl": 0.06591796875, "learning_rate": 7.445625126280204e-06, "loss": 0.0006597880274057388, "memory(GiB)": 39.01, "reward": 0.647710919380188, "reward_std": 0.12782195210456848, "rewards/VisualizationJSONCombinedORM/mean": 0.647710919380188, "rewards/VisualizationJSONCombinedORM/std": 0.13916705548763275, "step": 488, "train_speed(iter/s)": 0.027709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 313.3125, "completions/min_length": 258.0, "epoch": 0.4044665012406948, "grad_norm": 0.19493062794208527, "kl": 0.0653076171875, "learning_rate": 7.4330223928342814e-06, "loss": 0.0006521232426166534, "memory(GiB)": 39.01, "reward": 0.506321132183075, "reward_std": 0.10164386034011841, "rewards/VisualizationJSONCombinedORM/mean": 0.506321132183075, "rewards/VisualizationJSONCombinedORM/std": 0.14735649526119232, "step": 489, "train_speed(iter/s)": 0.027723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 324.875, "completions/min_length": 228.0, "epoch": 0.4052936311000827, "grad_norm": 0.1670776903629303, "kl": 0.06219482421875, "learning_rate": 7.420399373792967e-06, "loss": 0.0006224103271961212, "memory(GiB)": 39.01, "reward": 0.3520440459251404, "reward_std": 0.05116996541619301, "rewards/VisualizationJSONCombinedORM/mean": 0.3520440459251404, "rewards/VisualizationJSONCombinedORM/std": 0.05046319589018822, "step": 490, "train_speed(iter/s)": 0.027733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 325.8125, "completions/min_length": 266.0, "epoch": 0.40612076095947064, "grad_norm": 0.2123895138502121, "kl": 0.06317138671875, "learning_rate": 7.407756174402088e-06, "loss": 0.0006331969052553177, "memory(GiB)": 39.01, "reward": 0.4099385142326355, "reward_std": 0.06825298815965652, "rewards/VisualizationJSONCombinedORM/mean": 0.4099385142326355, "rewards/VisualizationJSONCombinedORM/std": 0.12154654413461685, "step": 491, "train_speed(iter/s)": 0.027749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 301.6875, "completions/min_length": 238.0, "epoch": 0.40694789081885857, "grad_norm": 0.19867543876171112, "kl": 0.0653076171875, "learning_rate": 7.39509290007573e-06, "loss": 0.0006542876362800598, "memory(GiB)": 39.01, "reward": 0.5604161024093628, "reward_std": 0.08957931399345398, "rewards/VisualizationJSONCombinedORM/mean": 0.5604161024093628, "rewards/VisualizationJSONCombinedORM/std": 0.15425534546375275, "step": 492, "train_speed(iter/s)": 0.027762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 336.9375, "completions/min_length": 268.0, "epoch": 0.4077750206782465, "grad_norm": 0.1826765537261963, "kl": 0.07122802734375, "learning_rate": 7.382409656395353e-06, "loss": 0.0007127895951271057, "memory(GiB)": 39.01, "reward": 0.4944494366645813, "reward_std": 0.08051706850528717, "rewards/VisualizationJSONCombinedORM/mean": 0.4944494366645813, "rewards/VisualizationJSONCombinedORM/std": 0.08567006886005402, "step": 493, "train_speed(iter/s)": 0.02777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 351.3125, "completions/min_length": 254.0, "epoch": 0.40860215053763443, "grad_norm": 0.18735311925411224, "kl": 0.0714111328125, "learning_rate": 7.369706549108915e-06, "loss": 0.0007149428129196167, "memory(GiB)": 39.01, "reward": 0.5161744356155396, "reward_std": 0.11482160538434982, "rewards/VisualizationJSONCombinedORM/mean": 0.5161744356155396, "rewards/VisualizationJSONCombinedORM/std": 0.15156951546669006, "step": 494, "train_speed(iter/s)": 0.027783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 331.9375, "completions/min_length": 261.0, "epoch": 0.4094292803970223, "grad_norm": 0.1956755667924881, "kl": 0.068359375, "learning_rate": 7.3569836841299905e-06, "loss": 0.0006829351186752319, "memory(GiB)": 39.01, "reward": 0.523205041885376, "reward_std": 0.08481264114379883, "rewards/VisualizationJSONCombinedORM/mean": 0.523205041885376, "rewards/VisualizationJSONCombinedORM/std": 0.10898064821958542, "step": 495, "train_speed(iter/s)": 0.027797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 326.375, "completions/min_length": 268.0, "epoch": 0.41025641025641024, "grad_norm": 0.187386155128479, "kl": 0.06951904296875, "learning_rate": 7.34424116753688e-06, "loss": 0.0006944946944713593, "memory(GiB)": 39.01, "reward": 0.3998304009437561, "reward_std": 0.13949179649353027, "rewards/VisualizationJSONCombinedORM/mean": 0.3998304009437561, "rewards/VisualizationJSONCombinedORM/std": 0.22631531953811646, "step": 496, "train_speed(iter/s)": 0.027814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 321.875, "completions/min_length": 256.0, "epoch": 0.4110835401157982, "grad_norm": 0.22571782767772675, "kl": 0.0611572265625, "learning_rate": 7.33147910557174e-06, "loss": 0.0006118826568126678, "memory(GiB)": 39.01, "reward": 0.5303508043289185, "reward_std": 0.1135464757680893, "rewards/VisualizationJSONCombinedORM/mean": 0.5303508043289185, "rewards/VisualizationJSONCombinedORM/std": 0.14387816190719604, "step": 497, "train_speed(iter/s)": 0.027831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 330.9375, "completions/min_length": 271.0, "epoch": 0.4119106699751861, "grad_norm": 0.20945404469966888, "kl": 0.05938720703125, "learning_rate": 7.318697604639685e-06, "loss": 0.0005937293171882629, "memory(GiB)": 39.01, "reward": 0.561597466468811, "reward_std": 0.1091771125793457, "rewards/VisualizationJSONCombinedORM/mean": 0.561597466468811, "rewards/VisualizationJSONCombinedORM/std": 0.1206970363855362, "step": 498, "train_speed(iter/s)": 0.027847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 323.3125, "completions/min_length": 267.0, "epoch": 0.41273779983457404, "grad_norm": 0.19660025835037231, "kl": 0.063232421875, "learning_rate": 7.3058967713079025e-06, "loss": 0.0006322432309389114, "memory(GiB)": 39.01, "reward": 0.4750305414199829, "reward_std": 0.08769971132278442, "rewards/VisualizationJSONCombinedORM/mean": 0.4750305414199829, "rewards/VisualizationJSONCombinedORM/std": 0.15696190297603607, "step": 499, "train_speed(iter/s)": 0.027854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 326.0625, "completions/min_length": 240.0, "epoch": 0.41356492969396197, "grad_norm": 0.21139836311340332, "kl": 0.052001953125, "learning_rate": 7.293076712304765e-06, "loss": 0.0005199573934078217, "memory(GiB)": 39.01, "reward": 0.4706464409828186, "reward_std": 0.051583461463451385, "rewards/VisualizationJSONCombinedORM/mean": 0.4706464409828186, "rewards/VisualizationJSONCombinedORM/std": 0.27824482321739197, "step": 500, "train_speed(iter/s)": 0.027869 }, { "epoch": 0.41356492969396197, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 364.75, "eval_completions/mean_length": 317.5416666666667, "eval_completions/min_length": 267.8333333333333, "eval_kl": 0.051076253255208336, "eval_loss": 0.0005114407395012677, "eval_reward": 0.44644464055697125, "eval_reward_std": 0.08337124716490507, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44644464055697125, "eval_rewards/VisualizationJSONCombinedORM/std": 0.08337124933799107, "eval_runtime": 310.9475, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 315.3125, "completions/min_length": 272.0, "epoch": 0.4143920595533499, "grad_norm": 0.18460145592689514, "kl": 0.0465087890625, "learning_rate": 7.280237534518948e-06, "loss": 0.0004647448658943176, "memory(GiB)": 39.01, "reward": 0.695609986782074, "reward_std": 0.12792649865150452, "rewards/VisualizationJSONCombinedORM/mean": 0.695609986782074, "rewards/VisualizationJSONCombinedORM/std": 0.18682904541492462, "step": 501, "train_speed(iter/s)": 0.027409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 340.875, "completions/min_length": 272.0, "epoch": 0.4152191894127378, "grad_norm": 0.18793797492980957, "kl": 0.07659912109375, "learning_rate": 7.267379344998523e-06, "loss": 0.0007649920880794525, "memory(GiB)": 39.01, "reward": 0.44610047340393066, "reward_std": 0.08022057265043259, "rewards/VisualizationJSONCombinedORM/mean": 0.44610047340393066, "rewards/VisualizationJSONCombinedORM/std": 0.13603730499744415, "step": 502, "train_speed(iter/s)": 0.027417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 299.875, "completions/min_length": 241.0, "epoch": 0.4160463192721257, "grad_norm": 0.1951640099287033, "kl": 0.042724609375, "learning_rate": 7.2545022509500805e-06, "loss": 0.0004283487796783447, "memory(GiB)": 39.01, "reward": 0.44236183166503906, "reward_std": 0.06850796192884445, "rewards/VisualizationJSONCombinedORM/mean": 0.44236183166503906, "rewards/VisualizationJSONCombinedORM/std": 0.2138195037841797, "step": 503, "train_speed(iter/s)": 0.027437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 272.1875, "completions/min_length": 235.0, "epoch": 0.41687344913151364, "grad_norm": 0.19333404302597046, "kl": 0.06866455078125, "learning_rate": 7.241606359737826e-06, "loss": 0.00068654865026474, "memory(GiB)": 39.01, "reward": 0.348468154668808, "reward_std": 0.08605614304542542, "rewards/VisualizationJSONCombinedORM/mean": 0.348468154668808, "rewards/VisualizationJSONCombinedORM/std": 0.09084665030241013, "step": 504, "train_speed(iter/s)": 0.027458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 335.5, "completions/min_length": 298.0, "epoch": 0.41770057899090157, "grad_norm": 0.18980900943279266, "kl": 0.0770263671875, "learning_rate": 7.2286917788826926e-06, "loss": 0.0007700324058532715, "memory(GiB)": 39.01, "reward": 0.4966891407966614, "reward_std": 0.11901834607124329, "rewards/VisualizationJSONCombinedORM/mean": 0.4966891407966614, "rewards/VisualizationJSONCombinedORM/std": 0.1202818900346756, "step": 505, "train_speed(iter/s)": 0.027472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 280.9375, "completions/min_length": 222.0, "epoch": 0.4185277088502895, "grad_norm": 0.18610788881778717, "kl": 0.0579833984375, "learning_rate": 7.215758616061435e-06, "loss": 0.0005799736827611923, "memory(GiB)": 39.01, "reward": 0.495444655418396, "reward_std": 0.126094251871109, "rewards/VisualizationJSONCombinedORM/mean": 0.495444655418396, "rewards/VisualizationJSONCombinedORM/std": 0.21996602416038513, "step": 506, "train_speed(iter/s)": 0.027498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 344.6875, "completions/min_length": 287.0, "epoch": 0.41935483870967744, "grad_norm": 0.1727590709924698, "kl": 0.0416259765625, "learning_rate": 7.202806979105741e-06, "loss": 0.0004157647490501404, "memory(GiB)": 39.01, "reward": 0.6741436719894409, "reward_std": 0.10070987045764923, "rewards/VisualizationJSONCombinedORM/mean": 0.6741436719894409, "rewards/VisualizationJSONCombinedORM/std": 0.11840642243623734, "step": 507, "train_speed(iter/s)": 0.027518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 349.3125, "completions/min_length": 259.0, "epoch": 0.42018196856906537, "grad_norm": 0.17616432905197144, "kl": 0.04376220703125, "learning_rate": 7.189836976001328e-06, "loss": 0.00043690577149391174, "memory(GiB)": 39.01, "reward": 0.6576340794563293, "reward_std": 0.07834653556346893, "rewards/VisualizationJSONCombinedORM/mean": 0.6576340794563293, "rewards/VisualizationJSONCombinedORM/std": 0.07628475874662399, "step": 508, "train_speed(iter/s)": 0.027527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 316.0, "completions/min_length": 261.0, "epoch": 0.42100909842845324, "grad_norm": 0.21871240437030792, "kl": 0.04962158203125, "learning_rate": 7.176848714887042e-06, "loss": 0.0004962682723999023, "memory(GiB)": 39.01, "reward": 0.6333035230636597, "reward_std": 0.11942899227142334, "rewards/VisualizationJSONCombinedORM/mean": 0.6333035230636597, "rewards/VisualizationJSONCombinedORM/std": 0.12163574248552322, "step": 509, "train_speed(iter/s)": 0.027543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 339.375, "completions/min_length": 239.0, "epoch": 0.4218362282878412, "grad_norm": 0.18967792391777039, "kl": 0.0484619140625, "learning_rate": 7.163842304053956e-06, "loss": 0.00048379600048065186, "memory(GiB)": 39.01, "reward": 0.5218024253845215, "reward_std": 0.07859579473733902, "rewards/VisualizationJSONCombinedORM/mean": 0.5218024253845215, "rewards/VisualizationJSONCombinedORM/std": 0.32005852460861206, "step": 510, "train_speed(iter/s)": 0.027556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 304.75, "completions/min_length": 240.0, "epoch": 0.4226633581472291, "grad_norm": 0.20145782828330994, "kl": 0.0411376953125, "learning_rate": 7.150817851944473e-06, "loss": 0.00041121523827314377, "memory(GiB)": 39.01, "reward": 0.36675596237182617, "reward_std": 0.09163865447044373, "rewards/VisualizationJSONCombinedORM/mean": 0.36675596237182617, "rewards/VisualizationJSONCombinedORM/std": 0.16423769295215607, "step": 511, "train_speed(iter/s)": 0.027573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 333.25, "completions/min_length": 257.0, "epoch": 0.42349048800661704, "grad_norm": 0.20487546920776367, "kl": 0.0604248046875, "learning_rate": 7.137775467151411e-06, "loss": 0.0006041508167982101, "memory(GiB)": 39.01, "reward": 0.42875179648399353, "reward_std": 0.083351269364357, "rewards/VisualizationJSONCombinedORM/mean": 0.42875179648399353, "rewards/VisualizationJSONCombinedORM/std": 0.1875031739473343, "step": 512, "train_speed(iter/s)": 0.027584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 347.0, "completions/min_length": 275.0, "epoch": 0.42431761786600497, "grad_norm": 0.1795341819524765, "kl": 0.04583740234375, "learning_rate": 7.124715258417111e-06, "loss": 0.00045912712812423706, "memory(GiB)": 39.01, "reward": 0.49572688341140747, "reward_std": 0.07275637239217758, "rewards/VisualizationJSONCombinedORM/mean": 0.49572688341140747, "rewards/VisualizationJSONCombinedORM/std": 0.23126715421676636, "step": 513, "train_speed(iter/s)": 0.027602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 369.5625, "completions/min_length": 273.0, "epoch": 0.4251447477253929, "grad_norm": 0.1774129420518875, "kl": 0.04901123046875, "learning_rate": 7.111637334632515e-06, "loss": 0.000490281730890274, "memory(GiB)": 39.01, "reward": 0.45419830083847046, "reward_std": 0.09975806623697281, "rewards/VisualizationJSONCombinedORM/mean": 0.45419830083847046, "rewards/VisualizationJSONCombinedORM/std": 0.15878987312316895, "step": 514, "train_speed(iter/s)": 0.027616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 294.1875, "completions/min_length": 249.0, "epoch": 0.42597187758478083, "grad_norm": 0.18547233939170837, "kl": 0.05859375, "learning_rate": 7.098541804836272e-06, "loss": 0.0005855020135641098, "memory(GiB)": 39.01, "reward": 0.48441529273986816, "reward_std": 0.11379946023225784, "rewards/VisualizationJSONCombinedORM/mean": 0.48441529273986816, "rewards/VisualizationJSONCombinedORM/std": 0.11069298535585403, "step": 515, "train_speed(iter/s)": 0.027634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 323.375, "completions/min_length": 258.0, "epoch": 0.4267990074441687, "grad_norm": 0.19457024335861206, "kl": 0.0693359375, "learning_rate": 7.085428778213822e-06, "loss": 0.0006928145885467529, "memory(GiB)": 39.01, "reward": 0.30993422865867615, "reward_std": 0.04783390834927559, "rewards/VisualizationJSONCombinedORM/mean": 0.30993422865867615, "rewards/VisualizationJSONCombinedORM/std": 0.07219666987657547, "step": 516, "train_speed(iter/s)": 0.02765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 305.5625, "completions/min_length": 261.0, "epoch": 0.42762613730355664, "grad_norm": 0.18352000415325165, "kl": 0.04144287109375, "learning_rate": 7.072298364096486e-06, "loss": 0.0004146778956055641, "memory(GiB)": 39.01, "reward": 0.5416572093963623, "reward_std": 0.07314278930425644, "rewards/VisualizationJSONCombinedORM/mean": 0.5416572093963623, "rewards/VisualizationJSONCombinedORM/std": 0.1809975653886795, "step": 517, "train_speed(iter/s)": 0.027668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 319.0625, "completions/min_length": 262.0, "epoch": 0.4284532671629446, "grad_norm": 0.18674497306346893, "kl": 0.0538330078125, "learning_rate": 7.059150671960554e-06, "loss": 0.0005382746458053589, "memory(GiB)": 39.01, "reward": 0.47227340936660767, "reward_std": 0.07151126861572266, "rewards/VisualizationJSONCombinedORM/mean": 0.47227340936660767, "rewards/VisualizationJSONCombinedORM/std": 0.2163936048746109, "step": 518, "train_speed(iter/s)": 0.027681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 297.4375, "completions/min_length": 250.0, "epoch": 0.4292803970223325, "grad_norm": 0.20792391896247864, "kl": 0.06427001953125, "learning_rate": 7.0459858114263755e-06, "loss": 0.0006427615880966187, "memory(GiB)": 39.01, "reward": 0.5615308284759521, "reward_std": 0.1514376699924469, "rewards/VisualizationJSONCombinedORM/mean": 0.5615308284759521, "rewards/VisualizationJSONCombinedORM/std": 0.1670045107603073, "step": 519, "train_speed(iter/s)": 0.027691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 311.1875, "completions/min_length": 227.0, "epoch": 0.43010752688172044, "grad_norm": 0.17572802305221558, "kl": 0.03875732421875, "learning_rate": 7.032803892257443e-06, "loss": 0.00038738176226615906, "memory(GiB)": 39.01, "reward": 0.5045488476753235, "reward_std": 0.08115627616643906, "rewards/VisualizationJSONCombinedORM/mean": 0.5045488476753235, "rewards/VisualizationJSONCombinedORM/std": 0.27167290449142456, "step": 520, "train_speed(iter/s)": 0.027705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 291.125, "completions/min_length": 240.0, "epoch": 0.43093465674110837, "grad_norm": 0.20962963998317719, "kl": 0.052490234375, "learning_rate": 7.019605024359475e-06, "loss": 0.0005251020193099976, "memory(GiB)": 39.01, "reward": 0.35727590322494507, "reward_std": 0.06765937060117722, "rewards/VisualizationJSONCombinedORM/mean": 0.35727590322494507, "rewards/VisualizationJSONCombinedORM/std": 0.20068249106407166, "step": 521, "train_speed(iter/s)": 0.02772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 302.9375, "completions/min_length": 248.0, "epoch": 0.4317617866004963, "grad_norm": 0.17750303447246552, "kl": 0.05908203125, "learning_rate": 7.006389317779506e-06, "loss": 0.0005906093865633011, "memory(GiB)": 39.01, "reward": 0.5085335969924927, "reward_std": 0.0968494787812233, "rewards/VisualizationJSONCombinedORM/mean": 0.5085335969924927, "rewards/VisualizationJSONCombinedORM/std": 0.1438925713300705, "step": 522, "train_speed(iter/s)": 0.027735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 317.75, "completions/min_length": 244.0, "epoch": 0.4325889164598842, "grad_norm": 0.18293939530849457, "kl": 0.0745849609375, "learning_rate": 6.993156882704962e-06, "loss": 0.0007450468838214874, "memory(GiB)": 39.01, "reward": 0.4880343973636627, "reward_std": 0.1066262274980545, "rewards/VisualizationJSONCombinedORM/mean": 0.4880343973636627, "rewards/VisualizationJSONCombinedORM/std": 0.1428544670343399, "step": 523, "train_speed(iter/s)": 0.027748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 337.6875, "completions/min_length": 279.0, "epoch": 0.4334160463192721, "grad_norm": 0.185475692152977, "kl": 0.05511474609375, "learning_rate": 6.979907829462745e-06, "loss": 0.0005525164306163788, "memory(GiB)": 39.01, "reward": 0.538724422454834, "reward_std": 0.10281787812709808, "rewards/VisualizationJSONCombinedORM/mean": 0.538724422454834, "rewards/VisualizationJSONCombinedORM/std": 0.10777359455823898, "step": 524, "train_speed(iter/s)": 0.027762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 326.25, "completions/min_length": 261.0, "epoch": 0.43424317617866004, "grad_norm": 0.34079447388648987, "kl": 0.04833984375, "learning_rate": 6.966642268518313e-06, "loss": 0.00048433616757392883, "memory(GiB)": 39.01, "reward": 0.3745952248573303, "reward_std": 0.0776776522397995, "rewards/VisualizationJSONCombinedORM/mean": 0.3745952248573303, "rewards/VisualizationJSONCombinedORM/std": 0.08398263156414032, "step": 525, "train_speed(iter/s)": 0.027777 }, { "epoch": 0.43424317617866004, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 364.6666666666667, "eval_completions/mean_length": 306.4739583333333, "eval_completions/min_length": 256.3333333333333, "eval_kl": 0.057688395182291664, "eval_loss": 0.0005768369883298874, "eval_reward": 0.46784508662919205, "eval_reward_std": 0.07238149860252936, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46784508662919205, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07238150077561538, "eval_runtime": 310.4987, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 336.4375, "completions/min_length": 278.0, "epoch": 0.43507030603804797, "grad_norm": 0.1878584325313568, "kl": 0.0654296875, "learning_rate": 6.953360310474761e-06, "loss": 0.0006547849625349045, "memory(GiB)": 39.01, "reward": 0.34650254249572754, "reward_std": 0.06156287342309952, "rewards/VisualizationJSONCombinedORM/mean": 0.34650254249572754, "rewards/VisualizationJSONCombinedORM/std": 0.06764844059944153, "step": 526, "train_speed(iter/s)": 0.027339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 328.1875, "completions/min_length": 248.0, "epoch": 0.4358974358974359, "grad_norm": 0.18410663306713104, "kl": 0.05426025390625, "learning_rate": 6.940062066071891e-06, "loss": 0.0005434155464172363, "memory(GiB)": 39.01, "reward": 0.606711745262146, "reward_std": 0.07597076147794724, "rewards/VisualizationJSONCombinedORM/mean": 0.606711745262146, "rewards/VisualizationJSONCombinedORM/std": 0.24553333222866058, "step": 527, "train_speed(iter/s)": 0.027351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 283.8125, "completions/min_length": 248.0, "epoch": 0.43672456575682383, "grad_norm": 0.15995272994041443, "kl": 0.05712890625, "learning_rate": 6.9267476461853015e-06, "loss": 0.0005713775753974915, "memory(GiB)": 39.01, "reward": 0.5777660608291626, "reward_std": 0.09440559148788452, "rewards/VisualizationJSONCombinedORM/mean": 0.5777660608291626, "rewards/VisualizationJSONCombinedORM/std": 0.20025326311588287, "step": 528, "train_speed(iter/s)": 0.027372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 297.1875, "completions/min_length": 250.0, "epoch": 0.43755169561621177, "grad_norm": 0.18966218829154968, "kl": 0.052978515625, "learning_rate": 6.913417161825449e-06, "loss": 0.000529903918504715, "memory(GiB)": 39.01, "reward": 0.5323199033737183, "reward_std": 0.09088484942913055, "rewards/VisualizationJSONCombinedORM/mean": 0.5323199033737183, "rewards/VisualizationJSONCombinedORM/std": 0.09398660808801651, "step": 529, "train_speed(iter/s)": 0.027389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 281.1875, "completions/min_length": 231.0, "epoch": 0.43837882547559964, "grad_norm": 0.24856790900230408, "kl": 0.0594482421875, "learning_rate": 6.900070724136736e-06, "loss": 0.0005952641367912292, "memory(GiB)": 39.01, "reward": 0.5799624919891357, "reward_std": 0.10536840558052063, "rewards/VisualizationJSONCombinedORM/mean": 0.5799624919891357, "rewards/VisualizationJSONCombinedORM/std": 0.12308363616466522, "step": 530, "train_speed(iter/s)": 0.027399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 301.9375, "completions/min_length": 240.0, "epoch": 0.4392059553349876, "grad_norm": 0.18298612534999847, "kl": 0.06549072265625, "learning_rate": 6.8867084443965726e-06, "loss": 0.0006561386398971081, "memory(GiB)": 39.01, "reward": 0.3589436411857605, "reward_std": 0.028836701065301895, "rewards/VisualizationJSONCombinedORM/mean": 0.3589436411857605, "rewards/VisualizationJSONCombinedORM/std": 0.15269453823566437, "step": 531, "train_speed(iter/s)": 0.027409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 308.1875, "completions/min_length": 259.0, "epoch": 0.4400330851943755, "grad_norm": 0.1801951378583908, "kl": 0.04998779296875, "learning_rate": 6.8733304340144554e-06, "loss": 0.0005004927515983582, "memory(GiB)": 39.01, "reward": 0.5641641020774841, "reward_std": 0.09324578940868378, "rewards/VisualizationJSONCombinedORM/mean": 0.5641641020774841, "rewards/VisualizationJSONCombinedORM/std": 0.15403306484222412, "step": 532, "train_speed(iter/s)": 0.027429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 302.6875, "completions/min_length": 224.0, "epoch": 0.44086021505376344, "grad_norm": 0.2522233724594116, "kl": 0.06988525390625, "learning_rate": 6.859936804531039e-06, "loss": 0.0006983727216720581, "memory(GiB)": 39.01, "reward": 0.54460608959198, "reward_std": 0.13507448136806488, "rewards/VisualizationJSONCombinedORM/mean": 0.54460608959198, "rewards/VisualizationJSONCombinedORM/std": 0.18155287206172943, "step": 533, "train_speed(iter/s)": 0.027448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 277.125, "completions/min_length": 240.0, "epoch": 0.44168734491315137, "grad_norm": 0.17638549208641052, "kl": 0.07373046875, "learning_rate": 6.846527667617199e-06, "loss": 0.0007379595190286636, "memory(GiB)": 39.01, "reward": 0.7709707021713257, "reward_std": 0.1252036690711975, "rewards/VisualizationJSONCombinedORM/mean": 0.7709707021713257, "rewards/VisualizationJSONCombinedORM/std": 0.12589748203754425, "step": 534, "train_speed(iter/s)": 0.027458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 290.1875, "completions/min_length": 234.0, "epoch": 0.4425144747725393, "grad_norm": 0.20877963304519653, "kl": 0.05401611328125, "learning_rate": 6.8331031350731115e-06, "loss": 0.0005396883934736252, "memory(GiB)": 39.01, "reward": 0.516280472278595, "reward_std": 0.08755457401275635, "rewards/VisualizationJSONCombinedORM/mean": 0.516280472278595, "rewards/VisualizationJSONCombinedORM/std": 0.08741208165884018, "step": 535, "train_speed(iter/s)": 0.027474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 283.875, "completions/min_length": 205.0, "epoch": 0.44334160463192723, "grad_norm": 0.18018607795238495, "kl": 0.0543212890625, "learning_rate": 6.819663318827311e-06, "loss": 0.0005420297384262085, "memory(GiB)": 39.01, "reward": 0.5487919449806213, "reward_std": 0.06852003186941147, "rewards/VisualizationJSONCombinedORM/mean": 0.5487919449806213, "rewards/VisualizationJSONCombinedORM/std": 0.20305174589157104, "step": 536, "train_speed(iter/s)": 0.02749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 279.8125, "completions/min_length": 243.0, "epoch": 0.4441687344913151, "grad_norm": 0.18981003761291504, "kl": 0.0562744140625, "learning_rate": 6.806208330935766e-06, "loss": 0.0005619088187813759, "memory(GiB)": 39.01, "reward": 0.4239201843738556, "reward_std": 0.0938623696565628, "rewards/VisualizationJSONCombinedORM/mean": 0.4239201843738556, "rewards/VisualizationJSONCombinedORM/std": 0.15801702439785004, "step": 537, "train_speed(iter/s)": 0.02751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 337.9375, "completions/min_length": 252.0, "epoch": 0.44499586435070304, "grad_norm": 0.22263450920581818, "kl": 0.070068359375, "learning_rate": 6.792738283580935e-06, "loss": 0.0007021054625511169, "memory(GiB)": 39.01, "reward": 0.741485595703125, "reward_std": 0.09788302332162857, "rewards/VisualizationJSONCombinedORM/mean": 0.741485595703125, "rewards/VisualizationJSONCombinedORM/std": 0.09806852787733078, "step": 538, "train_speed(iter/s)": 0.027516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 299.625, "completions/min_length": 258.0, "epoch": 0.445822994210091, "grad_norm": 0.1967126876115799, "kl": 0.0579833984375, "learning_rate": 6.7792532890708415e-06, "loss": 0.0005795881152153015, "memory(GiB)": 39.01, "reward": 0.685592532157898, "reward_std": 0.11187349259853363, "rewards/VisualizationJSONCombinedORM/mean": 0.685592532157898, "rewards/VisualizationJSONCombinedORM/std": 0.1082582026720047, "step": 539, "train_speed(iter/s)": 0.027532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 309.875, "completions/min_length": 254.0, "epoch": 0.4466501240694789, "grad_norm": 0.1883794069290161, "kl": 0.05047607421875, "learning_rate": 6.765753459838129e-06, "loss": 0.000504830852150917, "memory(GiB)": 39.01, "reward": 0.4834911823272705, "reward_std": 0.07461290061473846, "rewards/VisualizationJSONCombinedORM/mean": 0.4834911823272705, "rewards/VisualizationJSONCombinedORM/std": 0.1229257881641388, "step": 540, "train_speed(iter/s)": 0.027549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 284.75, "completions/min_length": 250.0, "epoch": 0.44747725392886684, "grad_norm": 0.20135943591594696, "kl": 0.04876708984375, "learning_rate": 6.75223890843913e-06, "loss": 0.0004878547042608261, "memory(GiB)": 39.01, "reward": 0.6700663566589355, "reward_std": 0.1031600683927536, "rewards/VisualizationJSONCombinedORM/mean": 0.6700663566589355, "rewards/VisualizationJSONCombinedORM/std": 0.12375853955745697, "step": 541, "train_speed(iter/s)": 0.027564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 325.8125, "completions/min_length": 277.0, "epoch": 0.44830438378825477, "grad_norm": 0.20632950961589813, "kl": 0.05267333984375, "learning_rate": 6.738709747552921e-06, "loss": 0.0005268305540084839, "memory(GiB)": 39.01, "reward": 0.4546400308609009, "reward_std": 0.1173986941576004, "rewards/VisualizationJSONCombinedORM/mean": 0.4546400308609009, "rewards/VisualizationJSONCombinedORM/std": 0.20411300659179688, "step": 542, "train_speed(iter/s)": 0.027576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 327.6875, "completions/min_length": 246.0, "epoch": 0.4491315136476427, "grad_norm": 0.16849464178085327, "kl": 0.05047607421875, "learning_rate": 6.72516608998039e-06, "loss": 0.0005050972104072571, "memory(GiB)": 39.01, "reward": 0.6322634220123291, "reward_std": 0.11173851788043976, "rewards/VisualizationJSONCombinedORM/mean": 0.6322634220123291, "rewards/VisualizationJSONCombinedORM/std": 0.1453234702348709, "step": 543, "train_speed(iter/s)": 0.02759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 297.75, "completions/min_length": 248.0, "epoch": 0.44995864350703063, "grad_norm": 0.18202528357505798, "kl": 0.0498046875, "learning_rate": 6.7116080486432925e-06, "loss": 0.0004978030920028687, "memory(GiB)": 39.01, "reward": 0.4039992392063141, "reward_std": 0.05170377343893051, "rewards/VisualizationJSONCombinedORM/mean": 0.4039992392063141, "rewards/VisualizationJSONCombinedORM/std": 0.11655773222446442, "step": 544, "train_speed(iter/s)": 0.027604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 314.6875, "completions/min_length": 273.0, "epoch": 0.4507857733664185, "grad_norm": 0.17599518597126007, "kl": 0.06011962890625, "learning_rate": 6.698035736583307e-06, "loss": 0.0006019324064254761, "memory(GiB)": 39.01, "reward": 0.6811336874961853, "reward_std": 0.10266035050153732, "rewards/VisualizationJSONCombinedORM/mean": 0.6811336874961853, "rewards/VisualizationJSONCombinedORM/std": 0.11179658025503159, "step": 545, "train_speed(iter/s)": 0.027623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 321.0625, "completions/min_length": 253.0, "epoch": 0.45161290322580644, "grad_norm": 0.17929136753082275, "kl": 0.0767822265625, "learning_rate": 6.684449266961101e-06, "loss": 0.0007666796445846558, "memory(GiB)": 39.01, "reward": 0.5296638011932373, "reward_std": 0.08698482811450958, "rewards/VisualizationJSONCombinedORM/mean": 0.5296638011932373, "rewards/VisualizationJSONCombinedORM/std": 0.17508149147033691, "step": 546, "train_speed(iter/s)": 0.027625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 311.0, "completions/min_length": 228.0, "epoch": 0.45244003308519437, "grad_norm": 0.18212684988975525, "kl": 0.035400390625, "learning_rate": 6.670848753055376e-06, "loss": 0.00035396963357925415, "memory(GiB)": 39.01, "reward": 0.4645160436630249, "reward_std": 0.09119034558534622, "rewards/VisualizationJSONCombinedORM/mean": 0.4645160436630249, "rewards/VisualizationJSONCombinedORM/std": 0.16976749897003174, "step": 547, "train_speed(iter/s)": 0.027635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 318.5625, "completions/min_length": 258.0, "epoch": 0.4532671629445823, "grad_norm": 0.1746758073568344, "kl": 0.046142578125, "learning_rate": 6.657234308261937e-06, "loss": 0.0004612952470779419, "memory(GiB)": 39.01, "reward": 0.5147418975830078, "reward_std": 0.08748571574687958, "rewards/VisualizationJSONCombinedORM/mean": 0.5147418975830078, "rewards/VisualizationJSONCombinedORM/std": 0.2558539807796478, "step": 548, "train_speed(iter/s)": 0.027648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 297.25, "completions/min_length": 226.0, "epoch": 0.45409429280397023, "grad_norm": 0.19065023958683014, "kl": 0.0445556640625, "learning_rate": 6.643606046092732e-06, "loss": 0.00044706836342811584, "memory(GiB)": 39.01, "reward": 0.7219841480255127, "reward_std": 0.09357450902462006, "rewards/VisualizationJSONCombinedORM/mean": 0.7219841480255127, "rewards/VisualizationJSONCombinedORM/std": 0.09327060729265213, "step": 549, "train_speed(iter/s)": 0.027664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 324.5, "completions/min_length": 262.0, "epoch": 0.45492142266335817, "grad_norm": 0.17107515037059784, "kl": 0.0494384765625, "learning_rate": 6.629964080174915e-06, "loss": 0.0004941485822200775, "memory(GiB)": 39.01, "reward": 0.3400150537490845, "reward_std": 0.10400764644145966, "rewards/VisualizationJSONCombinedORM/mean": 0.3400150537490845, "rewards/VisualizationJSONCombinedORM/std": 0.15447460114955902, "step": 550, "train_speed(iter/s)": 0.027672 }, { "epoch": 0.45492142266335817, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 372.875, "eval_completions/mean_length": 319.78125, "eval_completions/min_length": 268.0, "eval_kl": 0.05389404296875, "eval_loss": 0.0005400342051871121, "eval_reward": 0.46853743493556976, "eval_reward_std": 0.07895731980291505, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46853743493556976, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07895732096706827, "eval_runtime": 315.7918, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 326.25, "completions/min_length": 266.0, "epoch": 0.4557485525227461, "grad_norm": 0.16897054016590118, "kl": 0.03826904296875, "learning_rate": 6.616308524249901e-06, "loss": 0.0003820061683654785, "memory(GiB)": 39.01, "reward": 0.7513003349304199, "reward_std": 0.06140174716711044, "rewards/VisualizationJSONCombinedORM/mean": 0.7513003349304199, "rewards/VisualizationJSONCombinedORM/std": 0.06271016597747803, "step": 551, "train_speed(iter/s)": 0.027255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 315.625, "completions/min_length": 236.0, "epoch": 0.456575682382134, "grad_norm": 0.1767035871744156, "kl": 0.05133056640625, "learning_rate": 6.602639492172406e-06, "loss": 0.000512346625328064, "memory(GiB)": 39.01, "reward": 0.4680127501487732, "reward_std": 0.07450222969055176, "rewards/VisualizationJSONCombinedORM/mean": 0.4680127501487732, "rewards/VisualizationJSONCombinedORM/std": 0.11980512738227844, "step": 552, "train_speed(iter/s)": 0.027268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 316.6875, "completions/min_length": 249.0, "epoch": 0.4574028122415219, "grad_norm": 0.16337831318378448, "kl": 0.05438232421875, "learning_rate": 6.588957097909509e-06, "loss": 0.0005443841218948364, "memory(GiB)": 39.01, "reward": 0.4128652811050415, "reward_std": 0.07358469814062119, "rewards/VisualizationJSONCombinedORM/mean": 0.4128652811050415, "rewards/VisualizationJSONCombinedORM/std": 0.08732789754867554, "step": 553, "train_speed(iter/s)": 0.027283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 312.875, "completions/min_length": 227.0, "epoch": 0.45822994210090984, "grad_norm": 0.19448958337306976, "kl": 0.05426025390625, "learning_rate": 6.575261455539699e-06, "loss": 0.0005434714257717133, "memory(GiB)": 39.01, "reward": 0.4564315974712372, "reward_std": 0.07755303382873535, "rewards/VisualizationJSONCombinedORM/mean": 0.4564315974712372, "rewards/VisualizationJSONCombinedORM/std": 0.12352339923381805, "step": 554, "train_speed(iter/s)": 0.027298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 330.25, "completions/min_length": 255.0, "epoch": 0.45905707196029777, "grad_norm": 0.17221812903881073, "kl": 0.05755615234375, "learning_rate": 6.561552679251919e-06, "loss": 0.0005774162709712982, "memory(GiB)": 39.01, "reward": 0.405953049659729, "reward_std": 0.08401309698820114, "rewards/VisualizationJSONCombinedORM/mean": 0.405953049659729, "rewards/VisualizationJSONCombinedORM/std": 0.18293273448944092, "step": 555, "train_speed(iter/s)": 0.02731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 317.875, "completions/min_length": 267.0, "epoch": 0.4598842018196857, "grad_norm": 0.1561187207698822, "kl": 0.0458984375, "learning_rate": 6.547830883344623e-06, "loss": 0.00045964494347572327, "memory(GiB)": 39.01, "reward": 0.4491046667098999, "reward_std": 0.059972502291202545, "rewards/VisualizationJSONCombinedORM/mean": 0.4491046667098999, "rewards/VisualizationJSONCombinedORM/std": 0.30814072489738464, "step": 556, "train_speed(iter/s)": 0.027322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 326.1875, "completions/min_length": 270.0, "epoch": 0.46071133167907363, "grad_norm": 0.1715637892484665, "kl": 0.05108642578125, "learning_rate": 6.534096182224809e-06, "loss": 0.0005111806094646454, "memory(GiB)": 39.01, "reward": 0.42881572246551514, "reward_std": 0.0658336728811264, "rewards/VisualizationJSONCombinedORM/mean": 0.42881572246551514, "rewards/VisualizationJSONCombinedORM/std": 0.25217384099960327, "step": 557, "train_speed(iter/s)": 0.027333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 298.625, "completions/min_length": 239.0, "epoch": 0.46153846153846156, "grad_norm": 0.1940048336982727, "kl": 0.05584716796875, "learning_rate": 6.520348690407083e-06, "loss": 0.0005589984357357025, "memory(GiB)": 39.01, "reward": 0.4498940110206604, "reward_std": 0.072357177734375, "rewards/VisualizationJSONCombinedORM/mean": 0.4498940110206604, "rewards/VisualizationJSONCombinedORM/std": 0.07078175991773605, "step": 558, "train_speed(iter/s)": 0.027345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 290.0625, "completions/min_length": 252.0, "epoch": 0.46236559139784944, "grad_norm": 0.17993523180484772, "kl": 0.045166015625, "learning_rate": 6.506588522512687e-06, "loss": 0.00045100972056388855, "memory(GiB)": 39.01, "reward": 0.4735718369483948, "reward_std": 0.07542650401592255, "rewards/VisualizationJSONCombinedORM/mean": 0.4735718369483948, "rewards/VisualizationJSONCombinedORM/std": 0.31519320607185364, "step": 559, "train_speed(iter/s)": 0.027359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 313.0, "completions/min_length": 261.0, "epoch": 0.46319272125723737, "grad_norm": 0.20107032358646393, "kl": 0.04693603515625, "learning_rate": 6.492815793268554e-06, "loss": 0.000469856895506382, "memory(GiB)": 39.01, "reward": 0.5319211483001709, "reward_std": 0.09170179814100266, "rewards/VisualizationJSONCombinedORM/mean": 0.5319211483001709, "rewards/VisualizationJSONCombinedORM/std": 0.1193820908665657, "step": 560, "train_speed(iter/s)": 0.02737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 327.5, "completions/min_length": 258.0, "epoch": 0.4640198511166253, "grad_norm": 0.17421305179595947, "kl": 0.04632568359375, "learning_rate": 6.4790306175063535e-06, "loss": 0.0004627890884876251, "memory(GiB)": 39.01, "reward": 0.6887590289115906, "reward_std": 0.09684941172599792, "rewards/VisualizationJSONCombinedORM/mean": 0.6887590289115906, "rewards/VisualizationJSONCombinedORM/std": 0.09611079841852188, "step": 561, "train_speed(iter/s)": 0.027382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 317.0625, "completions/min_length": 249.0, "epoch": 0.46484698097601324, "grad_norm": 0.19474203884601593, "kl": 0.05859375, "learning_rate": 6.46523311016152e-06, "loss": 0.0005864650011062622, "memory(GiB)": 39.01, "reward": 0.4970458745956421, "reward_std": 0.0897410586476326, "rewards/VisualizationJSONCombinedORM/mean": 0.4970458745956421, "rewards/VisualizationJSONCombinedORM/std": 0.21277876198291779, "step": 562, "train_speed(iter/s)": 0.027397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 323.8125, "completions/min_length": 245.0, "epoch": 0.46567411083540117, "grad_norm": 0.1790410578250885, "kl": 0.052001953125, "learning_rate": 6.451423386272312e-06, "loss": 0.0005209073424339294, "memory(GiB)": 39.01, "reward": 0.6038538813591003, "reward_std": 0.11235681176185608, "rewards/VisualizationJSONCombinedORM/mean": 0.6038538813591003, "rewards/VisualizationJSONCombinedORM/std": 0.13150212168693542, "step": 563, "train_speed(iter/s)": 0.027412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 319.125, "completions/min_length": 234.0, "epoch": 0.4665012406947891, "grad_norm": 0.20365570485591888, "kl": 0.06121826171875, "learning_rate": 6.437601560978841e-06, "loss": 0.0006124973297119141, "memory(GiB)": 39.01, "reward": 0.5082536339759827, "reward_std": 0.0744498148560524, "rewards/VisualizationJSONCombinedORM/mean": 0.5082536339759827, "rewards/VisualizationJSONCombinedORM/std": 0.18695062398910522, "step": 564, "train_speed(iter/s)": 0.027418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 346.875, "completions/min_length": 280.0, "epoch": 0.46732837055417703, "grad_norm": 0.17872639000415802, "kl": 0.0499267578125, "learning_rate": 6.423767749522116e-06, "loss": 0.0004982501268386841, "memory(GiB)": 39.01, "reward": 0.3577161431312561, "reward_std": 0.06571225821971893, "rewards/VisualizationJSONCombinedORM/mean": 0.3577161431312561, "rewards/VisualizationJSONCombinedORM/std": 0.0689706951379776, "step": 565, "train_speed(iter/s)": 0.027435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 331.125, "completions/min_length": 277.0, "epoch": 0.4681555004135649, "grad_norm": 0.17271751165390015, "kl": 0.0535888671875, "learning_rate": 6.409922067243083e-06, "loss": 0.0005353540182113647, "memory(GiB)": 39.01, "reward": 0.42186444997787476, "reward_std": 0.07147155702114105, "rewards/VisualizationJSONCombinedORM/mean": 0.42186444997787476, "rewards/VisualizationJSONCombinedORM/std": 0.21140563488006592, "step": 566, "train_speed(iter/s)": 0.027449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 311.9375, "completions/min_length": 246.0, "epoch": 0.46898263027295284, "grad_norm": 0.1822657287120819, "kl": 0.05340576171875, "learning_rate": 6.39606462958166e-06, "loss": 0.0005350224673748016, "memory(GiB)": 39.01, "reward": 0.6091663837432861, "reward_std": 0.07995344698429108, "rewards/VisualizationJSONCombinedORM/mean": 0.6091663837432861, "rewards/VisualizationJSONCombinedORM/std": 0.08893069624900818, "step": 567, "train_speed(iter/s)": 0.027471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 346.5, "completions/min_length": 275.0, "epoch": 0.46980976013234077, "grad_norm": 0.2140568196773529, "kl": 0.0418701171875, "learning_rate": 6.382195552075777e-06, "loss": 0.0004183948040008545, "memory(GiB)": 39.01, "reward": 0.6789796352386475, "reward_std": 0.10674548894166946, "rewards/VisualizationJSONCombinedORM/mean": 0.6789796352386475, "rewards/VisualizationJSONCombinedORM/std": 0.14131741225719452, "step": 568, "train_speed(iter/s)": 0.027484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 340.6875, "completions/min_length": 267.0, "epoch": 0.4706368899917287, "grad_norm": 0.20330725610256195, "kl": 0.04205322265625, "learning_rate": 6.368314950360416e-06, "loss": 0.000420302152633667, "memory(GiB)": 39.01, "reward": 0.4336939752101898, "reward_std": 0.11481940746307373, "rewards/VisualizationJSONCombinedORM/mean": 0.4336939752101898, "rewards/VisualizationJSONCombinedORM/std": 0.11471044272184372, "step": 569, "train_speed(iter/s)": 0.027498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 343.875, "completions/min_length": 251.0, "epoch": 0.47146401985111663, "grad_norm": 0.16910050809383392, "kl": 0.0465087890625, "learning_rate": 6.354422940166639e-06, "loss": 0.00046481192111968994, "memory(GiB)": 39.01, "reward": 0.5059798955917358, "reward_std": 0.0507650151848793, "rewards/VisualizationJSONCombinedORM/mean": 0.5059798955917358, "rewards/VisualizationJSONCombinedORM/std": 0.15615561604499817, "step": 570, "train_speed(iter/s)": 0.027508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 321.0, "completions/min_length": 220.0, "epoch": 0.47229114971050457, "grad_norm": 0.16848276555538177, "kl": 0.06591796875, "learning_rate": 6.3405196373206304e-06, "loss": 0.0006595402956008911, "memory(GiB)": 39.01, "reward": 0.7001379728317261, "reward_std": 0.1371425986289978, "rewards/VisualizationJSONCombinedORM/mean": 0.7001379728317261, "rewards/VisualizationJSONCombinedORM/std": 0.14123226702213287, "step": 571, "train_speed(iter/s)": 0.027521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 317.75, "completions/min_length": 246.0, "epoch": 0.4731182795698925, "grad_norm": 0.17315563559532166, "kl": 0.06744384765625, "learning_rate": 6.3266051577427264e-06, "loss": 0.0006747152656316757, "memory(GiB)": 39.01, "reward": 0.6858162879943848, "reward_std": 0.1116805225610733, "rewards/VisualizationJSONCombinedORM/mean": 0.6858162879943848, "rewards/VisualizationJSONCombinedORM/std": 0.11889238655567169, "step": 572, "train_speed(iter/s)": 0.027527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 337.9375, "completions/min_length": 230.0, "epoch": 0.4739454094292804, "grad_norm": 0.18387655913829803, "kl": 0.0576171875, "learning_rate": 6.312679617446453e-06, "loss": 0.0005762949585914612, "memory(GiB)": 39.01, "reward": 0.539249062538147, "reward_std": 0.11638686805963516, "rewards/VisualizationJSONCombinedORM/mean": 0.539249062538147, "rewards/VisualizationJSONCombinedORM/std": 0.12551791965961456, "step": 573, "train_speed(iter/s)": 0.02754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 350.125, "completions/min_length": 289.0, "epoch": 0.4747725392886683, "grad_norm": 0.1629205048084259, "kl": 0.03125, "learning_rate": 6.298743132537555e-06, "loss": 0.0003126300871372223, "memory(GiB)": 39.01, "reward": 0.6623882055282593, "reward_std": 0.09756041318178177, "rewards/VisualizationJSONCombinedORM/mean": 0.6623882055282593, "rewards/VisualizationJSONCombinedORM/std": 0.1314406841993332, "step": 574, "train_speed(iter/s)": 0.027555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 307.75, "completions/min_length": 241.0, "epoch": 0.47559966914805624, "grad_norm": 0.20005638897418976, "kl": 0.03448486328125, "learning_rate": 6.284795819213027e-06, "loss": 0.00034543126821517944, "memory(GiB)": 39.01, "reward": 0.4788733720779419, "reward_std": 0.06849963217973709, "rewards/VisualizationJSONCombinedORM/mean": 0.4788733720779419, "rewards/VisualizationJSONCombinedORM/std": 0.24459940195083618, "step": 575, "train_speed(iter/s)": 0.02757 }, { "epoch": 0.47559966914805624, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 395.8333333333333, "eval_completions/mean_length": 329.2447916666667, "eval_completions/min_length": 277.875, "eval_kl": 0.051493326822916664, "eval_loss": 0.000517145439516753, "eval_reward": 0.4463143286605676, "eval_reward_std": 0.08018473636669417, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4463143286605676, "eval_rewards/VisualizationJSONCombinedORM/std": 0.08018473722040653, "eval_runtime": 329.7227, "eval_samples_per_second": 0.073, "eval_steps_per_second": 0.009, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 382.3125, "completions/min_length": 330.0, "epoch": 0.47642679900744417, "grad_norm": 0.19501163065433502, "kl": 0.0556640625, "learning_rate": 6.270837793760148e-06, "loss": 0.0005568303167819977, "memory(GiB)": 39.01, "reward": 0.37328749895095825, "reward_std": 0.08197072893381119, "rewards/VisualizationJSONCombinedORM/mean": 0.37328749895095825, "rewards/VisualizationJSONCombinedORM/std": 0.12765896320343018, "step": 576, "train_speed(iter/s)": 0.027146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 356.8125, "completions/min_length": 284.0, "epoch": 0.4772539288668321, "grad_norm": 0.17193342745304108, "kl": 0.0545654296875, "learning_rate": 6.2568691725555144e-06, "loss": 0.0005464255809783936, "memory(GiB)": 39.01, "reward": 0.6708154678344727, "reward_std": 0.13627395033836365, "rewards/VisualizationJSONCombinedORM/mean": 0.6708154678344727, "rewards/VisualizationJSONCombinedORM/std": 0.14042750000953674, "step": 577, "train_speed(iter/s)": 0.027158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 342.625, "completions/min_length": 298.0, "epoch": 0.47808105872622003, "grad_norm": 0.18849223852157593, "kl": 0.03485107421875, "learning_rate": 6.242890072064058e-06, "loss": 0.00034787505865097046, "memory(GiB)": 39.01, "reward": 0.4462405741214752, "reward_std": 0.06261033564805984, "rewards/VisualizationJSONCombinedORM/mean": 0.4462405741214752, "rewards/VisualizationJSONCombinedORM/std": 0.172845259308815, "step": 578, "train_speed(iter/s)": 0.027171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 346.1875, "completions/min_length": 273.0, "epoch": 0.47890818858560796, "grad_norm": 0.17986439168453217, "kl": 0.0474853515625, "learning_rate": 6.228900608838091e-06, "loss": 0.00047388672828674316, "memory(GiB)": 39.01, "reward": 0.6353662014007568, "reward_std": 0.1090325266122818, "rewards/VisualizationJSONCombinedORM/mean": 0.6353662014007568, "rewards/VisualizationJSONCombinedORM/std": 0.1181330606341362, "step": 579, "train_speed(iter/s)": 0.027183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 324.125, "completions/min_length": 237.0, "epoch": 0.47973531844499584, "grad_norm": 0.18942466378211975, "kl": 0.0472412109375, "learning_rate": 6.21490089951632e-06, "loss": 0.00047313421964645386, "memory(GiB)": 39.01, "reward": 0.4507628381252289, "reward_std": 0.08446678519248962, "rewards/VisualizationJSONCombinedORM/mean": 0.4507628381252289, "rewards/VisualizationJSONCombinedORM/std": 0.24386803805828094, "step": 580, "train_speed(iter/s)": 0.027192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 292.9375, "completions/min_length": 227.0, "epoch": 0.48056244830438377, "grad_norm": 0.1831207573413849, "kl": 0.0479736328125, "learning_rate": 6.200891060822884e-06, "loss": 0.0004791431128978729, "memory(GiB)": 39.01, "reward": 0.2025776505470276, "reward_std": 0.026133932173252106, "rewards/VisualizationJSONCombinedORM/mean": 0.2025776505470276, "rewards/VisualizationJSONCombinedORM/std": 0.0331590473651886, "step": 581, "train_speed(iter/s)": 0.027203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 354.75, "completions/min_length": 261.0, "epoch": 0.4813895781637717, "grad_norm": 0.15773187577724457, "kl": 0.032012939453125, "learning_rate": 6.186871209566372e-06, "loss": 0.00032041221857070923, "memory(GiB)": 39.01, "reward": 0.30036336183547974, "reward_std": 0.034763842821121216, "rewards/VisualizationJSONCombinedORM/mean": 0.30036336183547974, "rewards/VisualizationJSONCombinedORM/std": 0.05114099755883217, "step": 582, "train_speed(iter/s)": 0.027207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 353.625, "completions/min_length": 276.0, "epoch": 0.48221670802315963, "grad_norm": 0.16108392179012299, "kl": 0.0465087890625, "learning_rate": 6.172841462638858e-06, "loss": 0.00046408548951148987, "memory(GiB)": 39.01, "reward": 0.5030379295349121, "reward_std": 0.11024483293294907, "rewards/VisualizationJSONCombinedORM/mean": 0.5030379295349121, "rewards/VisualizationJSONCombinedORM/std": 0.1272604912519455, "step": 583, "train_speed(iter/s)": 0.027223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 329.75, "completions/min_length": 240.0, "epoch": 0.48304383788254757, "grad_norm": 0.18883489072322845, "kl": 0.04229736328125, "learning_rate": 6.158801937014921e-06, "loss": 0.0004221983253955841, "memory(GiB)": 39.01, "reward": 0.43973249197006226, "reward_std": 0.11237135529518127, "rewards/VisualizationJSONCombinedORM/mean": 0.43973249197006226, "rewards/VisualizationJSONCombinedORM/std": 0.12736493349075317, "step": 584, "train_speed(iter/s)": 0.027241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 326.25, "completions/min_length": 261.0, "epoch": 0.4838709677419355, "grad_norm": 0.17431212961673737, "kl": 0.0582275390625, "learning_rate": 6.144752749750671e-06, "loss": 0.0005819275975227356, "memory(GiB)": 39.01, "reward": 0.6058076620101929, "reward_std": 0.13204166293144226, "rewards/VisualizationJSONCombinedORM/mean": 0.6058076620101929, "rewards/VisualizationJSONCombinedORM/std": 0.12833762168884277, "step": 585, "train_speed(iter/s)": 0.027249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 370.625, "completions/min_length": 292.0, "epoch": 0.48469809760132343, "grad_norm": 0.15807078778743744, "kl": 0.04351806640625, "learning_rate": 6.130694017982772e-06, "loss": 0.0004350505769252777, "memory(GiB)": 39.01, "reward": 0.4720135033130646, "reward_std": 0.0802900418639183, "rewards/VisualizationJSONCombinedORM/mean": 0.4720135033130646, "rewards/VisualizationJSONCombinedORM/std": 0.09900061786174774, "step": 586, "train_speed(iter/s)": 0.027256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 362.8125, "completions/min_length": 307.0, "epoch": 0.4855252274607113, "grad_norm": 0.1816565841436386, "kl": 0.0396728515625, "learning_rate": 6.1166258589274685e-06, "loss": 0.0003964044153690338, "memory(GiB)": 39.01, "reward": 0.4896053075790405, "reward_std": 0.0840197205543518, "rewards/VisualizationJSONCombinedORM/mean": 0.4896053075790405, "rewards/VisualizationJSONCombinedORM/std": 0.08636032044887543, "step": 587, "train_speed(iter/s)": 0.027266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 378.625, "completions/min_length": 304.0, "epoch": 0.48635235732009924, "grad_norm": 0.17744146287441254, "kl": 0.040496826171875, "learning_rate": 6.102548389879604e-06, "loss": 0.00040541961789131165, "memory(GiB)": 39.01, "reward": 0.5659228563308716, "reward_std": 0.10222768783569336, "rewards/VisualizationJSONCombinedORM/mean": 0.5659228563308716, "rewards/VisualizationJSONCombinedORM/std": 0.10002396255731583, "step": 588, "train_speed(iter/s)": 0.027277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 343.5625, "completions/min_length": 276.0, "epoch": 0.48717948717948717, "grad_norm": 0.18818360567092896, "kl": 0.0443115234375, "learning_rate": 6.088461728211642e-06, "loss": 0.00044347718358039856, "memory(GiB)": 39.01, "reward": 0.5521578788757324, "reward_std": 0.06424443423748016, "rewards/VisualizationJSONCombinedORM/mean": 0.5521578788757324, "rewards/VisualizationJSONCombinedORM/std": 0.09868481010198593, "step": 589, "train_speed(iter/s)": 0.027288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 348.9375, "completions/min_length": 295.0, "epoch": 0.4880066170388751, "grad_norm": 0.17349408566951752, "kl": 0.045654296875, "learning_rate": 6.0743659913727e-06, "loss": 0.0004558153450489044, "memory(GiB)": 39.01, "reward": 0.41356152296066284, "reward_std": 0.06835614144802094, "rewards/VisualizationJSONCombinedORM/mean": 0.41356152296066284, "rewards/VisualizationJSONCombinedORM/std": 0.08155057579278946, "step": 590, "train_speed(iter/s)": 0.027301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 351.6875, "completions/min_length": 294.0, "epoch": 0.48883374689826303, "grad_norm": 0.2591553330421448, "kl": 0.03515625, "learning_rate": 6.060261296887554e-06, "loss": 0.0003516077995300293, "memory(GiB)": 39.01, "reward": 0.7150377035140991, "reward_std": 0.06823220103979111, "rewards/VisualizationJSONCombinedORM/mean": 0.7150377035140991, "rewards/VisualizationJSONCombinedORM/std": 0.08139583468437195, "step": 591, "train_speed(iter/s)": 0.027311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 350.625, "completions/min_length": 294.0, "epoch": 0.48966087675765096, "grad_norm": 0.18927191197872162, "kl": 0.04736328125, "learning_rate": 6.046147762355666e-06, "loss": 0.00047380104660987854, "memory(GiB)": 39.01, "reward": 0.5605558156967163, "reward_std": 0.1244603842496872, "rewards/VisualizationJSONCombinedORM/mean": 0.5605558156967163, "rewards/VisualizationJSONCombinedORM/std": 0.16115182638168335, "step": 592, "train_speed(iter/s)": 0.027324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 346.625, "completions/min_length": 288.0, "epoch": 0.4904880066170389, "grad_norm": 0.16600759327411652, "kl": 0.05999755859375, "learning_rate": 6.0320255054501985e-06, "loss": 0.0006002858281135559, "memory(GiB)": 39.01, "reward": 0.5459524393081665, "reward_std": 0.08521769940853119, "rewards/VisualizationJSONCombinedORM/mean": 0.5459524393081665, "rewards/VisualizationJSONCombinedORM/std": 0.13391734659671783, "step": 593, "train_speed(iter/s)": 0.027339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 327.625, "completions/min_length": 260.0, "epoch": 0.4913151364764268, "grad_norm": 0.18217602372169495, "kl": 0.06201171875, "learning_rate": 6.017894643917049e-06, "loss": 0.0006195604801177979, "memory(GiB)": 39.01, "reward": 0.7171612977981567, "reward_std": 0.07776787877082825, "rewards/VisualizationJSONCombinedORM/mean": 0.7171612977981567, "rewards/VisualizationJSONCombinedORM/std": 0.08006706088781357, "step": 594, "train_speed(iter/s)": 0.027348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 322.375, "completions/min_length": 240.0, "epoch": 0.4921422663358147, "grad_norm": 0.20111939311027527, "kl": 0.04559326171875, "learning_rate": 6.003755295573849e-06, "loss": 0.0004565012641251087, "memory(GiB)": 39.01, "reward": 0.5791586637496948, "reward_std": 0.1003120094537735, "rewards/VisualizationJSONCombinedORM/mean": 0.5791586637496948, "rewards/VisualizationJSONCombinedORM/std": 0.18026600778102875, "step": 595, "train_speed(iter/s)": 0.027364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 340.0, "completions/min_length": 270.0, "epoch": 0.49296939619520264, "grad_norm": 0.169367715716362, "kl": 0.0703125, "learning_rate": 5.98960757830899e-06, "loss": 0.0007040053606033325, "memory(GiB)": 39.01, "reward": 0.502296507358551, "reward_std": 0.062403880059719086, "rewards/VisualizationJSONCombinedORM/mean": 0.502296507358551, "rewards/VisualizationJSONCombinedORM/std": 0.20882254838943481, "step": 596, "train_speed(iter/s)": 0.027378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 341.8125, "completions/min_length": 242.0, "epoch": 0.49379652605459057, "grad_norm": 0.19264079630374908, "kl": 0.05230712890625, "learning_rate": 5.975451610080643e-06, "loss": 0.0005229748785495758, "memory(GiB)": 39.01, "reward": 0.4350605905056, "reward_std": 0.09948823601007462, "rewards/VisualizationJSONCombinedORM/mean": 0.4350605905056, "rewards/VisualizationJSONCombinedORM/std": 0.12673026323318481, "step": 597, "train_speed(iter/s)": 0.02739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 364.9375, "completions/min_length": 291.0, "epoch": 0.4946236559139785, "grad_norm": 0.16399839520454407, "kl": 0.06396484375, "learning_rate": 5.961287508915769e-06, "loss": 0.0006386451423168182, "memory(GiB)": 39.01, "reward": 0.467833936214447, "reward_std": 0.08349832892417908, "rewards/VisualizationJSONCombinedORM/mean": 0.467833936214447, "rewards/VisualizationJSONCombinedORM/std": 0.1955864280462265, "step": 598, "train_speed(iter/s)": 0.027408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 331.875, "completions/min_length": 256.0, "epoch": 0.49545078577336643, "grad_norm": 0.15333959460258484, "kl": 0.03704833984375, "learning_rate": 5.947115392909142e-06, "loss": 0.0003692731261253357, "memory(GiB)": 39.01, "reward": 0.38559913635253906, "reward_std": 0.05175918713212013, "rewards/VisualizationJSONCombinedORM/mean": 0.38559913635253906, "rewards/VisualizationJSONCombinedORM/std": 0.12996447086334229, "step": 599, "train_speed(iter/s)": 0.027418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 344.4375, "completions/min_length": 272.0, "epoch": 0.49627791563275436, "grad_norm": 0.19653986394405365, "kl": 0.039794921875, "learning_rate": 5.932935380222358e-06, "loss": 0.00039764493703842163, "memory(GiB)": 39.01, "reward": 0.37660300731658936, "reward_std": 0.07179658114910126, "rewards/VisualizationJSONCombinedORM/mean": 0.37660300731658936, "rewards/VisualizationJSONCombinedORM/std": 0.08396521955728531, "step": 600, "train_speed(iter/s)": 0.027424 }, { "epoch": 0.49627791563275436, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 393.0833333333333, "eval_completions/mean_length": 329.9427083333333, "eval_completions/min_length": 280.25, "eval_kl": 0.043263753255208336, "eval_loss": 0.00043291723704896867, "eval_reward": 0.4138169201711814, "eval_reward_std": 0.06346090002140652, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4138169201711814, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06346090246612827, "eval_runtime": 328.0289, "eval_samples_per_second": 0.073, "eval_steps_per_second": 0.009, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 347.6875, "completions/min_length": 267.0, "epoch": 0.49710504549214224, "grad_norm": 0.17152473330497742, "kl": 0.05426025390625, "learning_rate": 5.918747589082853e-06, "loss": 0.0005428418517112732, "memory(GiB)": 39.01, "reward": 0.49226364493370056, "reward_std": 0.1252761036157608, "rewards/VisualizationJSONCombinedORM/mean": 0.49226364493370056, "rewards/VisualizationJSONCombinedORM/std": 0.1282554417848587, "step": 601, "train_speed(iter/s)": 0.02703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 321.875, "completions/min_length": 269.0, "epoch": 0.49793217535153017, "grad_norm": 0.1729784607887268, "kl": 0.0528564453125, "learning_rate": 5.904552137782917e-06, "loss": 0.0005291029810905457, "memory(GiB)": 39.01, "reward": 0.3078084886074066, "reward_std": 0.05834585800766945, "rewards/VisualizationJSONCombinedORM/mean": 0.3078084886074066, "rewards/VisualizationJSONCombinedORM/std": 0.05737452581524849, "step": 602, "train_speed(iter/s)": 0.027044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 329.25, "completions/min_length": 289.0, "epoch": 0.4987593052109181, "grad_norm": 0.21500447392463684, "kl": 0.04547119140625, "learning_rate": 5.8903491446787094e-06, "loss": 0.00045484304428100586, "memory(GiB)": 39.01, "reward": 0.39690452814102173, "reward_std": 0.08237336575984955, "rewards/VisualizationJSONCombinedORM/mean": 0.39690452814102173, "rewards/VisualizationJSONCombinedORM/std": 0.09795661270618439, "step": 603, "train_speed(iter/s)": 0.02706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 311.0, "completions/min_length": 266.0, "epoch": 0.49958643507030603, "grad_norm": 0.17241892218589783, "kl": 0.04046630859375, "learning_rate": 5.876138728189268e-06, "loss": 0.0004045069217681885, "memory(GiB)": 39.01, "reward": 0.43651139736175537, "reward_std": 0.09200241416692734, "rewards/VisualizationJSONCombinedORM/mean": 0.43651139736175537, "rewards/VisualizationJSONCombinedORM/std": 0.09121375530958176, "step": 604, "train_speed(iter/s)": 0.027077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 363.5, "completions/min_length": 264.0, "epoch": 0.5004135649296939, "grad_norm": 0.16265258193016052, "kl": 0.035064697265625, "learning_rate": 5.861921006795522e-06, "loss": 0.00035068392753601074, "memory(GiB)": 39.01, "reward": 0.5542746186256409, "reward_std": 0.08968294411897659, "rewards/VisualizationJSONCombinedORM/mean": 0.5542746186256409, "rewards/VisualizationJSONCombinedORM/std": 0.12340617179870605, "step": 605, "train_speed(iter/s)": 0.027083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 296.375, "completions/min_length": 249.0, "epoch": 0.5012406947890818, "grad_norm": 0.17435547709465027, "kl": 0.04498291015625, "learning_rate": 5.8476960990393085e-06, "loss": 0.00044987350702285767, "memory(GiB)": 39.01, "reward": 0.36820271611213684, "reward_std": 0.06955357640981674, "rewards/VisualizationJSONCombinedORM/mean": 0.36820271611213684, "rewards/VisualizationJSONCombinedORM/std": 0.15885475277900696, "step": 606, "train_speed(iter/s)": 0.0271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 330.125, "completions/min_length": 256.0, "epoch": 0.5020678246484698, "grad_norm": 0.17337189614772797, "kl": 0.05169677734375, "learning_rate": 5.833464123522384e-06, "loss": 0.000517725944519043, "memory(GiB)": 39.01, "reward": 0.23600462079048157, "reward_std": 0.055182311683893204, "rewards/VisualizationJSONCombinedORM/mean": 0.23600462079048157, "rewards/VisualizationJSONCombinedORM/std": 0.05588867887854576, "step": 607, "train_speed(iter/s)": 0.027111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 302.875, "completions/min_length": 245.0, "epoch": 0.5028949545078577, "grad_norm": 0.1747315526008606, "kl": 0.03204345703125, "learning_rate": 5.819225198905429e-06, "loss": 0.0003204420208930969, "memory(GiB)": 39.01, "reward": 0.6169562935829163, "reward_std": 0.07052305340766907, "rewards/VisualizationJSONCombinedORM/mean": 0.6169562935829163, "rewards/VisualizationJSONCombinedORM/std": 0.13245894014835358, "step": 608, "train_speed(iter/s)": 0.027123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 317.5625, "completions/min_length": 246.0, "epoch": 0.5037220843672456, "grad_norm": 0.1791042983531952, "kl": 0.04852294921875, "learning_rate": 5.804979443907065e-06, "loss": 0.0004853494465351105, "memory(GiB)": 39.01, "reward": 0.3700868487358093, "reward_std": 0.050494663417339325, "rewards/VisualizationJSONCombinedORM/mean": 0.3700868487358093, "rewards/VisualizationJSONCombinedORM/std": 0.07026553899049759, "step": 609, "train_speed(iter/s)": 0.027129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 331.8125, "completions/min_length": 259.0, "epoch": 0.5045492142266336, "grad_norm": 0.18008971214294434, "kl": 0.036407470703125, "learning_rate": 5.790726977302862e-06, "loss": 0.00036414433270692825, "memory(GiB)": 39.01, "reward": 0.47269997000694275, "reward_std": 0.0719161257147789, "rewards/VisualizationJSONCombinedORM/mean": 0.47269997000694275, "rewards/VisualizationJSONCombinedORM/std": 0.12577854096889496, "step": 610, "train_speed(iter/s)": 0.027144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 286.625, "completions/min_length": 237.0, "epoch": 0.5053763440860215, "grad_norm": 0.1954774409532547, "kl": 0.0377197265625, "learning_rate": 5.7764679179243485e-06, "loss": 0.0003769993782043457, "memory(GiB)": 39.01, "reward": 0.5835720300674438, "reward_std": 0.0677647590637207, "rewards/VisualizationJSONCombinedORM/mean": 0.5835720300674438, "rewards/VisualizationJSONCombinedORM/std": 0.273546427488327, "step": 611, "train_speed(iter/s)": 0.027157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 313.75, "completions/min_length": 252.0, "epoch": 0.5062034739454094, "grad_norm": 0.17247851192951202, "kl": 0.0482177734375, "learning_rate": 5.762202384658021e-06, "loss": 0.00048152171075344086, "memory(GiB)": 39.01, "reward": 0.3469552993774414, "reward_std": 0.0764760673046112, "rewards/VisualizationJSONCombinedORM/mean": 0.3469552993774414, "rewards/VisualizationJSONCombinedORM/std": 0.09926189482212067, "step": 612, "train_speed(iter/s)": 0.027168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 306.8125, "completions/min_length": 241.0, "epoch": 0.5070306038047974, "grad_norm": 0.17687928676605225, "kl": 0.03997802734375, "learning_rate": 5.747930496444356e-06, "loss": 0.000400446355342865, "memory(GiB)": 39.01, "reward": 0.4366337060928345, "reward_std": 0.046604856848716736, "rewards/VisualizationJSONCombinedORM/mean": 0.4366337060928345, "rewards/VisualizationJSONCombinedORM/std": 0.21946647763252258, "step": 613, "train_speed(iter/s)": 0.027185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 343.9375, "completions/min_length": 241.0, "epoch": 0.5078577336641853, "grad_norm": 0.19374088943004608, "kl": 0.0548095703125, "learning_rate": 5.733652372276809e-06, "loss": 0.0005474984645843506, "memory(GiB)": 39.01, "reward": 0.5046683549880981, "reward_std": 0.12869802117347717, "rewards/VisualizationJSONCombinedORM/mean": 0.5046683549880981, "rewards/VisualizationJSONCombinedORM/std": 0.1429625302553177, "step": 614, "train_speed(iter/s)": 0.027202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 293.0625, "completions/min_length": 231.0, "epoch": 0.5086848635235732, "grad_norm": 0.19361869990825653, "kl": 0.033721923828125, "learning_rate": 5.719368131200834e-06, "loss": 0.0003372989594936371, "memory(GiB)": 39.01, "reward": 0.6864579319953918, "reward_std": 0.13037613034248352, "rewards/VisualizationJSONCombinedORM/mean": 0.6864579319953918, "rewards/VisualizationJSONCombinedORM/std": 0.13264714181423187, "step": 615, "train_speed(iter/s)": 0.027218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 328.0, "completions/min_length": 252.0, "epoch": 0.5095119933829612, "grad_norm": 0.1861712783575058, "kl": 0.04754638671875, "learning_rate": 5.705077892312881e-06, "loss": 0.0004764338955283165, "memory(GiB)": 39.01, "reward": 0.5342310070991516, "reward_std": 0.10789857059717178, "rewards/VisualizationJSONCombinedORM/mean": 0.5342310070991516, "rewards/VisualizationJSONCombinedORM/std": 0.13269545137882233, "step": 616, "train_speed(iter/s)": 0.02723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 303.375, "completions/min_length": 246.0, "epoch": 0.5103391232423491, "grad_norm": 0.19382888078689575, "kl": 0.04925537109375, "learning_rate": 5.690781774759412e-06, "loss": 0.000493466854095459, "memory(GiB)": 39.01, "reward": 0.4049810469150543, "reward_std": 0.08559894561767578, "rewards/VisualizationJSONCombinedORM/mean": 0.4049810469150543, "rewards/VisualizationJSONCombinedORM/std": 0.12599915266036987, "step": 617, "train_speed(iter/s)": 0.027248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 343.75, "completions/min_length": 270.0, "epoch": 0.511166253101737, "grad_norm": 0.17280294001102448, "kl": 0.0714111328125, "learning_rate": 5.676479897735899e-06, "loss": 0.0007137376815080643, "memory(GiB)": 39.01, "reward": 0.31651145219802856, "reward_std": 0.05359148606657982, "rewards/VisualizationJSONCombinedORM/mean": 0.31651145219802856, "rewards/VisualizationJSONCombinedORM/std": 0.07166261225938797, "step": 618, "train_speed(iter/s)": 0.027259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 327.3125, "completions/min_length": 260.0, "epoch": 0.5119933829611248, "grad_norm": 0.20046333968639374, "kl": 0.04302978515625, "learning_rate": 5.662172380485835e-06, "loss": 0.0004307851195335388, "memory(GiB)": 39.01, "reward": 0.5377075672149658, "reward_std": 0.11889015883207321, "rewards/VisualizationJSONCombinedORM/mean": 0.5377075672149658, "rewards/VisualizationJSONCombinedORM/std": 0.1674303263425827, "step": 619, "train_speed(iter/s)": 0.02727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 318.75, "completions/min_length": 230.0, "epoch": 0.5128205128205128, "grad_norm": 0.1708124279975891, "kl": 0.05950927734375, "learning_rate": 5.647859342299743e-06, "loss": 0.0005941744893789291, "memory(GiB)": 39.01, "reward": 0.6064522862434387, "reward_std": 0.08289472758769989, "rewards/VisualizationJSONCombinedORM/mean": 0.6064522862434387, "rewards/VisualizationJSONCombinedORM/std": 0.11055532842874527, "step": 620, "train_speed(iter/s)": 0.027284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 313.4375, "completions/min_length": 242.0, "epoch": 0.5136476426799007, "grad_norm": 0.17268334329128265, "kl": 0.07275390625, "learning_rate": 5.63354090251417e-06, "loss": 0.0007274523377418518, "memory(GiB)": 39.01, "reward": 0.5521043539047241, "reward_std": 0.09765832126140594, "rewards/VisualizationJSONCombinedORM/mean": 0.5521043539047241, "rewards/VisualizationJSONCombinedORM/std": 0.11236704140901566, "step": 621, "train_speed(iter/s)": 0.027296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 342.5625, "completions/min_length": 254.0, "epoch": 0.5144747725392886, "grad_norm": 0.18071657419204712, "kl": 0.0462646484375, "learning_rate": 5.619217180510706e-06, "loss": 0.00046320632100105286, "memory(GiB)": 39.01, "reward": 0.5113449692726135, "reward_std": 0.07793738692998886, "rewards/VisualizationJSONCombinedORM/mean": 0.5113449692726135, "rewards/VisualizationJSONCombinedORM/std": 0.1259976178407669, "step": 622, "train_speed(iter/s)": 0.027309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 302.625, "completions/min_length": 235.0, "epoch": 0.5153019023986766, "grad_norm": 0.18573413789272308, "kl": 0.06561279296875, "learning_rate": 5.60488829571498e-06, "loss": 0.0006577447056770325, "memory(GiB)": 39.01, "reward": 0.4180312752723694, "reward_std": 0.08152061700820923, "rewards/VisualizationJSONCombinedORM/mean": 0.4180312752723694, "rewards/VisualizationJSONCombinedORM/std": 0.14035233855247498, "step": 623, "train_speed(iter/s)": 0.027316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 331.125, "completions/min_length": 264.0, "epoch": 0.5161290322580645, "grad_norm": 0.18411800265312195, "kl": 0.0751953125, "learning_rate": 5.590554367595666e-06, "loss": 0.0007539242506027222, "memory(GiB)": 39.01, "reward": 0.39135825634002686, "reward_std": 0.09691844880580902, "rewards/VisualizationJSONCombinedORM/mean": 0.39135825634002686, "rewards/VisualizationJSONCombinedORM/std": 0.09430479258298874, "step": 624, "train_speed(iter/s)": 0.027325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 333.75, "completions/min_length": 267.0, "epoch": 0.5169561621174524, "grad_norm": 0.1698664277791977, "kl": 0.06915283203125, "learning_rate": 5.576215515663489e-06, "loss": 0.0006909742951393127, "memory(GiB)": 39.01, "reward": 0.6880975961685181, "reward_std": 0.09897129237651825, "rewards/VisualizationJSONCombinedORM/mean": 0.6880975961685181, "rewards/VisualizationJSONCombinedORM/std": 0.10580127686262131, "step": 625, "train_speed(iter/s)": 0.027336 }, { "epoch": 0.5169561621174524, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 380.4166666666667, "eval_completions/mean_length": 316.6875, "eval_completions/min_length": 258.9583333333333, "eval_kl": 0.057891845703125, "eval_loss": 0.0005806113476864994, "eval_reward": 0.48637986555695534, "eval_reward_std": 0.07824947230983526, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.48637986555695534, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07824947553065915, "eval_runtime": 320.036, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 312.625, "completions/min_length": 260.0, "epoch": 0.5177832919768404, "grad_norm": 0.17396870255470276, "kl": 0.03741455078125, "learning_rate": 5.561871859470222e-06, "loss": 0.00037419795989990234, "memory(GiB)": 39.01, "reward": 0.661750078201294, "reward_std": 0.12450379133224487, "rewards/VisualizationJSONCombinedORM/mean": 0.661750078201294, "rewards/VisualizationJSONCombinedORM/std": 0.1267261803150177, "step": 626, "train_speed(iter/s)": 0.026973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 294.375, "completions/min_length": 239.0, "epoch": 0.5186104218362283, "grad_norm": 0.19704623520374298, "kl": 0.0689697265625, "learning_rate": 5.5475235186076985e-06, "loss": 0.0006897468119859695, "memory(GiB)": 39.01, "reward": 0.486698716878891, "reward_std": 0.09735895693302155, "rewards/VisualizationJSONCombinedORM/mean": 0.486698716878891, "rewards/VisualizationJSONCombinedORM/std": 0.0995403453707695, "step": 627, "train_speed(iter/s)": 0.026984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 317.9375, "completions/min_length": 257.0, "epoch": 0.5194375516956162, "grad_norm": 0.17160406708717346, "kl": 0.049072265625, "learning_rate": 5.53317061270681e-06, "loss": 0.000490386039018631, "memory(GiB)": 39.01, "reward": 0.6012015342712402, "reward_std": 0.08893634378910065, "rewards/VisualizationJSONCombinedORM/mean": 0.6012015342712402, "rewards/VisualizationJSONCombinedORM/std": 0.16694872081279755, "step": 628, "train_speed(iter/s)": 0.026996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 319.4375, "completions/min_length": 244.0, "epoch": 0.5202646815550042, "grad_norm": 0.20503097772598267, "kl": 0.0675048828125, "learning_rate": 5.51881326143651e-06, "loss": 0.0006759446114301682, "memory(GiB)": 39.01, "reward": 0.5606397390365601, "reward_std": 0.13758884370326996, "rewards/VisualizationJSONCombinedORM/mean": 0.5606397390365601, "rewards/VisualizationJSONCombinedORM/std": 0.13964568078517914, "step": 629, "train_speed(iter/s)": 0.027006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 323.4375, "completions/min_length": 243.0, "epoch": 0.5210918114143921, "grad_norm": 0.17631658911705017, "kl": 0.04864501953125, "learning_rate": 5.504451584502813e-06, "loss": 0.0004870481789112091, "memory(GiB)": 39.01, "reward": 0.7018367648124695, "reward_std": 0.10840842127799988, "rewards/VisualizationJSONCombinedORM/mean": 0.7018367648124695, "rewards/VisualizationJSONCombinedORM/std": 0.154098242521286, "step": 630, "train_speed(iter/s)": 0.027017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 307.4375, "completions/min_length": 233.0, "epoch": 0.52191894127378, "grad_norm": 0.17974627017974854, "kl": 0.06658935546875, "learning_rate": 5.490085701647805e-06, "loss": 0.0006653442978858948, "memory(GiB)": 39.01, "reward": 0.6551070213317871, "reward_std": 0.10039927065372467, "rewards/VisualizationJSONCombinedORM/mean": 0.6551070213317871, "rewards/VisualizationJSONCombinedORM/std": 0.11087481677532196, "step": 631, "train_speed(iter/s)": 0.027022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 320.625, "completions/min_length": 253.0, "epoch": 0.522746071133168, "grad_norm": 0.19461293518543243, "kl": 0.07257080078125, "learning_rate": 5.47571573264863e-06, "loss": 0.0007261745631694794, "memory(GiB)": 39.01, "reward": 0.40746915340423584, "reward_std": 0.0828380286693573, "rewards/VisualizationJSONCombinedORM/mean": 0.40746915340423584, "rewards/VisualizationJSONCombinedORM/std": 0.12044678628444672, "step": 632, "train_speed(iter/s)": 0.02703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 317.625, "completions/min_length": 247.0, "epoch": 0.5235732009925558, "grad_norm": 0.18320788443088531, "kl": 0.0863037109375, "learning_rate": 5.46134179731651e-06, "loss": 0.0008616875857114792, "memory(GiB)": 39.01, "reward": 0.5876585245132446, "reward_std": 0.10678640753030777, "rewards/VisualizationJSONCombinedORM/mean": 0.5876585245132446, "rewards/VisualizationJSONCombinedORM/std": 0.12081913650035858, "step": 633, "train_speed(iter/s)": 0.027043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 310.0, "completions/min_length": 247.0, "epoch": 0.5244003308519437, "grad_norm": 0.1756642460823059, "kl": 0.043701171875, "learning_rate": 5.446964015495734e-06, "loss": 0.000436149537563324, "memory(GiB)": 39.01, "reward": 0.5919947624206543, "reward_std": 0.11471063643693924, "rewards/VisualizationJSONCombinedORM/mean": 0.5919947624206543, "rewards/VisualizationJSONCombinedORM/std": 0.15230369567871094, "step": 634, "train_speed(iter/s)": 0.027057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 297.6875, "completions/min_length": 239.0, "epoch": 0.5252274607113316, "grad_norm": 0.22489076852798462, "kl": 0.05609130859375, "learning_rate": 5.432582507062658e-06, "loss": 0.000561937689781189, "memory(GiB)": 39.01, "reward": 0.6816614866256714, "reward_std": 0.14332672953605652, "rewards/VisualizationJSONCombinedORM/mean": 0.6816614866256714, "rewards/VisualizationJSONCombinedORM/std": 0.1388152539730072, "step": 635, "train_speed(iter/s)": 0.027065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 332.1875, "completions/min_length": 286.0, "epoch": 0.5260545905707196, "grad_norm": 0.18479421734809875, "kl": 0.0576171875, "learning_rate": 5.418197391924712e-06, "loss": 0.0005762912333011627, "memory(GiB)": 39.01, "reward": 0.6851605176925659, "reward_std": 0.12822309136390686, "rewards/VisualizationJSONCombinedORM/mean": 0.6851605176925659, "rewards/VisualizationJSONCombinedORM/std": 0.12955285608768463, "step": 636, "train_speed(iter/s)": 0.027072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 320.8125, "completions/min_length": 271.0, "epoch": 0.5268817204301075, "grad_norm": 0.18806946277618408, "kl": 0.05743408203125, "learning_rate": 5.4038087900193985e-06, "loss": 0.0005743391811847687, "memory(GiB)": 39.01, "reward": 0.5333905220031738, "reward_std": 0.13232894241809845, "rewards/VisualizationJSONCombinedORM/mean": 0.5333905220031738, "rewards/VisualizationJSONCombinedORM/std": 0.1532086879014969, "step": 637, "train_speed(iter/s)": 0.02708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 321.8125, "completions/min_length": 239.0, "epoch": 0.5277088502894954, "grad_norm": 0.17516881227493286, "kl": 0.05255126953125, "learning_rate": 5.3894168213132865e-06, "loss": 0.0005251504480838776, "memory(GiB)": 39.01, "reward": 0.6707932949066162, "reward_std": 0.10488291084766388, "rewards/VisualizationJSONCombinedORM/mean": 0.6707932949066162, "rewards/VisualizationJSONCombinedORM/std": 0.1298818141222, "step": 638, "train_speed(iter/s)": 0.027092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 309.1875, "completions/min_length": 250.0, "epoch": 0.5285359801488834, "grad_norm": 0.18007723987102509, "kl": 0.0802001953125, "learning_rate": 5.375021605801023e-06, "loss": 0.0007998496294021606, "memory(GiB)": 39.01, "reward": 0.2937185764312744, "reward_std": 0.04618797078728676, "rewards/VisualizationJSONCombinedORM/mean": 0.2937185764312744, "rewards/VisualizationJSONCombinedORM/std": 0.04846681281924248, "step": 639, "train_speed(iter/s)": 0.027098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 356.875, "completions/min_length": 270.0, "epoch": 0.5293631100082713, "grad_norm": 0.15458324551582336, "kl": 0.042236328125, "learning_rate": 5.3606232635043185e-06, "loss": 0.00042262859642505646, "memory(GiB)": 39.01, "reward": 0.6388571262359619, "reward_std": 0.10394762456417084, "rewards/VisualizationJSONCombinedORM/mean": 0.6388571262359619, "rewards/VisualizationJSONCombinedORM/std": 0.1333397775888443, "step": 640, "train_speed(iter/s)": 0.027104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 315.5625, "completions/min_length": 236.0, "epoch": 0.5301902398676592, "grad_norm": 0.17517147958278656, "kl": 0.06243896484375, "learning_rate": 5.346221914470959e-06, "loss": 0.0006244555115699768, "memory(GiB)": 39.01, "reward": 0.6398583650588989, "reward_std": 0.09813663363456726, "rewards/VisualizationJSONCombinedORM/mean": 0.6398583650588989, "rewards/VisualizationJSONCombinedORM/std": 0.16231423616409302, "step": 641, "train_speed(iter/s)": 0.027118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 329.5625, "completions/min_length": 275.0, "epoch": 0.5310173697270472, "grad_norm": 0.1784866750240326, "kl": 0.06744384765625, "learning_rate": 5.331817678773796e-06, "loss": 0.0006749816238880157, "memory(GiB)": 39.01, "reward": 0.5206102132797241, "reward_std": 0.07239468395709991, "rewards/VisualizationJSONCombinedORM/mean": 0.5206102132797241, "rewards/VisualizationJSONCombinedORM/std": 0.21540401875972748, "step": 642, "train_speed(iter/s)": 0.027125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 312.9375, "completions/min_length": 229.0, "epoch": 0.5318444995864351, "grad_norm": 0.22100761532783508, "kl": 0.06304931640625, "learning_rate": 5.317410676509752e-06, "loss": 0.0006303805857896805, "memory(GiB)": 39.01, "reward": 0.30999577045440674, "reward_std": 0.06270073354244232, "rewards/VisualizationJSONCombinedORM/mean": 0.30999577045440674, "rewards/VisualizationJSONCombinedORM/std": 0.06711394339799881, "step": 643, "train_speed(iter/s)": 0.02714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 346.125, "completions/min_length": 291.0, "epoch": 0.532671629445823, "grad_norm": 0.18108543753623962, "kl": 0.07330322265625, "learning_rate": 5.303001027798813e-06, "loss": 0.0007338970899581909, "memory(GiB)": 39.01, "reward": 0.5181233882904053, "reward_std": 0.09184472262859344, "rewards/VisualizationJSONCombinedORM/mean": 0.5181233882904053, "rewards/VisualizationJSONCombinedORM/std": 0.09469900280237198, "step": 644, "train_speed(iter/s)": 0.027154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 322.75, "completions/min_length": 225.0, "epoch": 0.533498759305211, "grad_norm": 0.18283981084823608, "kl": 0.042236328125, "learning_rate": 5.288588852783031e-06, "loss": 0.0004233457148075104, "memory(GiB)": 39.01, "reward": 0.7321548461914062, "reward_std": 0.08587665855884552, "rewards/VisualizationJSONCombinedORM/mean": 0.7321548461914062, "rewards/VisualizationJSONCombinedORM/std": 0.10277298837900162, "step": 645, "train_speed(iter/s)": 0.027167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 298.75, "completions/min_length": 229.0, "epoch": 0.5343258891645989, "grad_norm": 0.16540877521038055, "kl": 0.0615234375, "learning_rate": 5.274174271625522e-06, "loss": 0.0006148926913738251, "memory(GiB)": 39.01, "reward": 0.6334187984466553, "reward_std": 0.10012688487768173, "rewards/VisualizationJSONCombinedORM/mean": 0.6334187984466553, "rewards/VisualizationJSONCombinedORM/std": 0.09968066960573196, "step": 646, "train_speed(iter/s)": 0.027175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 310.8125, "completions/min_length": 235.0, "epoch": 0.5351530190239868, "grad_norm": 0.16134892404079437, "kl": 0.054931640625, "learning_rate": 5.259757404509463e-06, "loss": 0.000550098717212677, "memory(GiB)": 39.01, "reward": 0.600437581539154, "reward_std": 0.11888270080089569, "rewards/VisualizationJSONCombinedORM/mean": 0.600437581539154, "rewards/VisualizationJSONCombinedORM/std": 0.12419936060905457, "step": 647, "train_speed(iter/s)": 0.027186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 315.0625, "completions/min_length": 254.0, "epoch": 0.5359801488833746, "grad_norm": 0.17267711460590363, "kl": 0.0777587890625, "learning_rate": 5.245338371637091e-06, "loss": 0.0007772510871291161, "memory(GiB)": 39.01, "reward": 0.4102120101451874, "reward_std": 0.07338766753673553, "rewards/VisualizationJSONCombinedORM/mean": 0.4102120101451874, "rewards/VisualizationJSONCombinedORM/std": 0.14618328213691711, "step": 648, "train_speed(iter/s)": 0.027199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 313.25, "completions/min_length": 254.0, "epoch": 0.5368072787427626, "grad_norm": 0.19222284853458405, "kl": 0.04730224609375, "learning_rate": 5.230917293228699e-06, "loss": 0.00047282129526138306, "memory(GiB)": 39.01, "reward": 0.3712560534477234, "reward_std": 0.04085186868906021, "rewards/VisualizationJSONCombinedORM/mean": 0.3712560534477234, "rewards/VisualizationJSONCombinedORM/std": 0.04048441722989082, "step": 649, "train_speed(iter/s)": 0.02721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 307.4375, "completions/min_length": 217.0, "epoch": 0.5376344086021505, "grad_norm": 0.1807480752468109, "kl": 0.06707763671875, "learning_rate": 5.216494289521637e-06, "loss": 0.000669136643409729, "memory(GiB)": 39.01, "reward": 0.44205933809280396, "reward_std": 0.11526401340961456, "rewards/VisualizationJSONCombinedORM/mean": 0.44205933809280396, "rewards/VisualizationJSONCombinedORM/std": 0.12575457990169525, "step": 650, "train_speed(iter/s)": 0.027225 }, { "epoch": 0.5376344086021505, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 383.2916666666667, "eval_completions/mean_length": 318.6197916666667, "eval_completions/min_length": 266.9166666666667, "eval_kl": 0.050104777018229164, "eval_loss": 0.0005034608766436577, "eval_reward": 0.46673880827923614, "eval_reward_std": 0.07277199043892324, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46673880827923614, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07277199005087216, "eval_runtime": 322.5546, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.009, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 291.3125, "completions/min_length": 222.0, "epoch": 0.5384615384615384, "grad_norm": 0.18539179861545563, "kl": 0.06890869140625, "learning_rate": 5.2020694807693015e-06, "loss": 0.0006888397037982941, "memory(GiB)": 39.01, "reward": 0.7228487730026245, "reward_std": 0.11362152546644211, "rewards/VisualizationJSONCombinedORM/mean": 0.7228487730026245, "rewards/VisualizationJSONCombinedORM/std": 0.1293218433856964, "step": 651, "train_speed(iter/s)": 0.026874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 302.875, "completions/min_length": 241.0, "epoch": 0.5392886683209264, "grad_norm": 0.17719414830207825, "kl": 0.04705810546875, "learning_rate": 5.18764298724015e-06, "loss": 0.0004697442054748535, "memory(GiB)": 39.01, "reward": 0.4946707487106323, "reward_std": 0.11233416199684143, "rewards/VisualizationJSONCombinedORM/mean": 0.4946707487106323, "rewards/VisualizationJSONCombinedORM/std": 0.11092167347669601, "step": 652, "train_speed(iter/s)": 0.026885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 326.875, "completions/min_length": 240.0, "epoch": 0.5401157981803143, "grad_norm": 0.173005610704422, "kl": 0.0416259765625, "learning_rate": 5.173214929216677e-06, "loss": 0.000416044145822525, "memory(GiB)": 39.01, "reward": 0.5740730166435242, "reward_std": 0.07498432695865631, "rewards/VisualizationJSONCombinedORM/mean": 0.5740730166435242, "rewards/VisualizationJSONCombinedORM/std": 0.24594691395759583, "step": 653, "train_speed(iter/s)": 0.026897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 291.75, "completions/min_length": 236.0, "epoch": 0.5409429280397022, "grad_norm": 0.17467503249645233, "kl": 0.06890869140625, "learning_rate": 5.158785426994423e-06, "loss": 0.0006896834820508957, "memory(GiB)": 39.01, "reward": 0.47290706634521484, "reward_std": 0.06448210775852203, "rewards/VisualizationJSONCombinedORM/mean": 0.47290706634521484, "rewards/VisualizationJSONCombinedORM/std": 0.21244972944259644, "step": 654, "train_speed(iter/s)": 0.026911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 310.9375, "completions/min_length": 239.0, "epoch": 0.5417700578990902, "grad_norm": 0.16236664354801178, "kl": 0.05120849609375, "learning_rate": 5.144354600880974e-06, "loss": 0.0005129575729370117, "memory(GiB)": 39.01, "reward": 0.5152550935745239, "reward_std": 0.0967630073428154, "rewards/VisualizationJSONCombinedORM/mean": 0.5152550935745239, "rewards/VisualizationJSONCombinedORM/std": 0.09969876706600189, "step": 655, "train_speed(iter/s)": 0.026927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 309.0, "completions/min_length": 259.0, "epoch": 0.5425971877584781, "grad_norm": 0.1618887186050415, "kl": 0.0462646484375, "learning_rate": 5.129922571194949e-06, "loss": 0.00046284496784210205, "memory(GiB)": 39.01, "reward": 0.6981891989707947, "reward_std": 0.12459969520568848, "rewards/VisualizationJSONCombinedORM/mean": 0.6981891989707947, "rewards/VisualizationJSONCombinedORM/std": 0.13238631188869476, "step": 656, "train_speed(iter/s)": 0.026939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 311.8125, "completions/min_length": 245.0, "epoch": 0.543424317617866, "grad_norm": 0.24163055419921875, "kl": 0.05047607421875, "learning_rate": 5.115489458265006e-06, "loss": 0.0005041211843490601, "memory(GiB)": 39.01, "reward": 0.3824876546859741, "reward_std": 0.09976936876773834, "rewards/VisualizationJSONCombinedORM/mean": 0.3824876546859741, "rewards/VisualizationJSONCombinedORM/std": 0.09834657609462738, "step": 657, "train_speed(iter/s)": 0.026946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 332.125, "completions/min_length": 251.0, "epoch": 0.544251447477254, "grad_norm": 0.16861550509929657, "kl": 0.0426025390625, "learning_rate": 5.101055382428831e-06, "loss": 0.00042590871453285217, "memory(GiB)": 39.01, "reward": 0.6380776166915894, "reward_std": 0.10747124254703522, "rewards/VisualizationJSONCombinedORM/mean": 0.6380776166915894, "rewards/VisualizationJSONCombinedORM/std": 0.13250049948692322, "step": 658, "train_speed(iter/s)": 0.026956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 329.375, "completions/min_length": 258.0, "epoch": 0.5450785773366419, "grad_norm": 0.18384085595607758, "kl": 0.05743408203125, "learning_rate": 5.086620464032143e-06, "loss": 0.0005727857351303101, "memory(GiB)": 39.01, "reward": 0.704930305480957, "reward_std": 0.12978509068489075, "rewards/VisualizationJSONCombinedORM/mean": 0.704930305480957, "rewards/VisualizationJSONCombinedORM/std": 0.14295485615730286, "step": 659, "train_speed(iter/s)": 0.026968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 297.25, "completions/min_length": 238.0, "epoch": 0.5459057071960298, "grad_norm": 0.168453186750412, "kl": 0.03656005859375, "learning_rate": 5.07218482342768e-06, "loss": 0.0003648996353149414, "memory(GiB)": 39.01, "reward": 0.6858714818954468, "reward_std": 0.08289801329374313, "rewards/VisualizationJSONCombinedORM/mean": 0.6858714818954468, "rewards/VisualizationJSONCombinedORM/std": 0.09345098584890366, "step": 660, "train_speed(iter/s)": 0.02698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 327.875, "completions/min_length": 262.0, "epoch": 0.5467328370554178, "grad_norm": 0.16225817799568176, "kl": 0.0426025390625, "learning_rate": 5.057748580974204e-06, "loss": 0.00042633339762687683, "memory(GiB)": 39.01, "reward": 0.7758417129516602, "reward_std": 0.06254850327968597, "rewards/VisualizationJSONCombinedORM/mean": 0.7758417129516602, "rewards/VisualizationJSONCombinedORM/std": 0.09025049954652786, "step": 661, "train_speed(iter/s)": 0.02699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 325.625, "completions/min_length": 266.0, "epoch": 0.5475599669148056, "grad_norm": 0.17779570817947388, "kl": 0.04931640625, "learning_rate": 5.043311857035499e-06, "loss": 0.0004934929311275482, "memory(GiB)": 39.01, "reward": 0.534653902053833, "reward_std": 0.058907344937324524, "rewards/VisualizationJSONCombinedORM/mean": 0.534653902053833, "rewards/VisualizationJSONCombinedORM/std": 0.20089410245418549, "step": 662, "train_speed(iter/s)": 0.027002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 298.0625, "completions/min_length": 242.0, "epoch": 0.5483870967741935, "grad_norm": 0.17983378469944, "kl": 0.073486328125, "learning_rate": 5.0288747719793584e-06, "loss": 0.0007355287671089172, "memory(GiB)": 39.01, "reward": 0.4904405176639557, "reward_std": 0.12107819318771362, "rewards/VisualizationJSONCombinedORM/mean": 0.4904405176639557, "rewards/VisualizationJSONCombinedORM/std": 0.192356675863266, "step": 663, "train_speed(iter/s)": 0.027008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 294.5625, "completions/min_length": 236.0, "epoch": 0.5492142266335814, "grad_norm": 0.13910618424415588, "kl": 0.029205322265625, "learning_rate": 5.014437446176588e-06, "loss": 0.00029231607913970947, "memory(GiB)": 39.01, "reward": 0.654962420463562, "reward_std": 0.09897518157958984, "rewards/VisualizationJSONCombinedORM/mean": 0.654962420463562, "rewards/VisualizationJSONCombinedORM/std": 0.10576522350311279, "step": 664, "train_speed(iter/s)": 0.02702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 295.3125, "completions/min_length": 240.0, "epoch": 0.5500413564929694, "grad_norm": 0.19383539259433746, "kl": 0.02734375, "learning_rate": 5e-06, "loss": 0.00027213990688323975, "memory(GiB)": 39.01, "reward": 0.5265774726867676, "reward_std": 0.10458657145500183, "rewards/VisualizationJSONCombinedORM/mean": 0.5265774726867676, "rewards/VisualizationJSONCombinedORM/std": 0.13442686200141907, "step": 665, "train_speed(iter/s)": 0.027031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 307.875, "completions/min_length": 242.0, "epoch": 0.5508684863523573, "grad_norm": 0.17686212062835693, "kl": 0.058349609375, "learning_rate": 4.985562553823413e-06, "loss": 0.0005839243531227112, "memory(GiB)": 39.01, "reward": 0.6046504974365234, "reward_std": 0.0909055843949318, "rewards/VisualizationJSONCombinedORM/mean": 0.6046504974365234, "rewards/VisualizationJSONCombinedORM/std": 0.1085582971572876, "step": 666, "train_speed(iter/s)": 0.02704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 313.5, "completions/min_length": 254.0, "epoch": 0.5516956162117452, "grad_norm": 0.16958320140838623, "kl": 0.0511474609375, "learning_rate": 4.971125228020643e-06, "loss": 0.0005109570920467377, "memory(GiB)": 39.01, "reward": 0.7205912470817566, "reward_std": 0.086489237844944, "rewards/VisualizationJSONCombinedORM/mean": 0.7205912470817566, "rewards/VisualizationJSONCombinedORM/std": 0.08663684874773026, "step": 667, "train_speed(iter/s)": 0.027051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 335.75, "completions/min_length": 266.0, "epoch": 0.5525227460711332, "grad_norm": 0.17685551941394806, "kl": 0.04833984375, "learning_rate": 4.956688142964501e-06, "loss": 0.00048267096281051636, "memory(GiB)": 39.01, "reward": 0.5840121507644653, "reward_std": 0.07776634395122528, "rewards/VisualizationJSONCombinedORM/mean": 0.5840121507644653, "rewards/VisualizationJSONCombinedORM/std": 0.1670054942369461, "step": 668, "train_speed(iter/s)": 0.027061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 297.75, "completions/min_length": 247.0, "epoch": 0.5533498759305211, "grad_norm": 0.16916011273860931, "kl": 0.03948974609375, "learning_rate": 4.942251419025797e-06, "loss": 0.0003946702927350998, "memory(GiB)": 39.01, "reward": 0.7130435109138489, "reward_std": 0.07246337831020355, "rewards/VisualizationJSONCombinedORM/mean": 0.7130435109138489, "rewards/VisualizationJSONCombinedORM/std": 0.07391514629125595, "step": 669, "train_speed(iter/s)": 0.027073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 319.0, "completions/min_length": 247.0, "epoch": 0.554177005789909, "grad_norm": 0.19807960093021393, "kl": 0.05352783203125, "learning_rate": 4.927815176572322e-06, "loss": 0.000535118393599987, "memory(GiB)": 39.01, "reward": 0.35917913913726807, "reward_std": 0.07262025028467178, "rewards/VisualizationJSONCombinedORM/mean": 0.35917913913726807, "rewards/VisualizationJSONCombinedORM/std": 0.07851105183362961, "step": 670, "train_speed(iter/s)": 0.027085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 331.4375, "completions/min_length": 224.0, "epoch": 0.555004135649297, "grad_norm": 0.19058841466903687, "kl": 0.051025390625, "learning_rate": 4.913379535967859e-06, "loss": 0.0005103964358568192, "memory(GiB)": 39.01, "reward": 0.3269246220588684, "reward_std": 0.08635742962360382, "rewards/VisualizationJSONCombinedORM/mean": 0.3269246220588684, "rewards/VisualizationJSONCombinedORM/std": 0.11395876109600067, "step": 671, "train_speed(iter/s)": 0.027096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 289.5, "completions/min_length": 231.0, "epoch": 0.5558312655086849, "grad_norm": 0.16083937883377075, "kl": 0.030364990234375, "learning_rate": 4.898944617571169e-06, "loss": 0.00030338019132614136, "memory(GiB)": 39.01, "reward": 0.5222383737564087, "reward_std": 0.06460092961788177, "rewards/VisualizationJSONCombinedORM/mean": 0.5222383737564087, "rewards/VisualizationJSONCombinedORM/std": 0.14874044060707092, "step": 672, "train_speed(iter/s)": 0.027106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 329.9375, "completions/min_length": 276.0, "epoch": 0.5566583953680728, "grad_norm": 0.17990411818027496, "kl": 0.0406494140625, "learning_rate": 4.8845105417349955e-06, "loss": 0.0004066266119480133, "memory(GiB)": 39.01, "reward": 0.5329511165618896, "reward_std": 0.11191883683204651, "rewards/VisualizationJSONCombinedORM/mean": 0.5329511165618896, "rewards/VisualizationJSONCombinedORM/std": 0.11202225089073181, "step": 673, "train_speed(iter/s)": 0.027116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 332.5, "completions/min_length": 237.0, "epoch": 0.5574855252274608, "grad_norm": 0.18800169229507446, "kl": 0.04425048828125, "learning_rate": 4.8700774288050515e-06, "loss": 0.0004423670470714569, "memory(GiB)": 39.01, "reward": 0.517743706703186, "reward_std": 0.07818888127803802, "rewards/VisualizationJSONCombinedORM/mean": 0.517743706703186, "rewards/VisualizationJSONCombinedORM/std": 0.1684056669473648, "step": 674, "train_speed(iter/s)": 0.027124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 336.375, "completions/min_length": 240.0, "epoch": 0.5583126550868487, "grad_norm": 0.167744979262352, "kl": 0.04302978515625, "learning_rate": 4.855645399119028e-06, "loss": 0.0004310682415962219, "memory(GiB)": 39.01, "reward": 0.650854229927063, "reward_std": 0.12319633364677429, "rewards/VisualizationJSONCombinedORM/mean": 0.650854229927063, "rewards/VisualizationJSONCombinedORM/std": 0.12208806723356247, "step": 675, "train_speed(iter/s)": 0.027135 }, { "epoch": 0.5583126550868487, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 367.5833333333333, "eval_completions/mean_length": 310.9427083333333, "eval_completions/min_length": 258.7083333333333, "eval_kl": 0.0480194091796875, "eval_loss": 0.000482986361021176, "eval_reward": 0.45771812833845615, "eval_reward_std": 0.07384090769725542, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45771812833845615, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07384090668832262, "eval_runtime": 312.6688, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 318.5, "completions/min_length": 249.0, "epoch": 0.5591397849462365, "grad_norm": 0.1957070678472519, "kl": 0.03973388671875, "learning_rate": 4.841214573005578e-06, "loss": 0.0003976374864578247, "memory(GiB)": 39.01, "reward": 0.6088285446166992, "reward_std": 0.14201539754867554, "rewards/VisualizationJSONCombinedORM/mean": 0.6088285446166992, "rewards/VisualizationJSONCombinedORM/std": 0.14107081294059753, "step": 676, "train_speed(iter/s)": 0.026811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 300.4375, "completions/min_length": 236.0, "epoch": 0.5599669148056244, "grad_norm": 0.17783810198307037, "kl": 0.0347900390625, "learning_rate": 4.826785070783326e-06, "loss": 0.0003476962447166443, "memory(GiB)": 39.01, "reward": 0.5297040343284607, "reward_std": 0.0869755744934082, "rewards/VisualizationJSONCombinedORM/mean": 0.5297040343284607, "rewards/VisualizationJSONCombinedORM/std": 0.1710461676120758, "step": 677, "train_speed(iter/s)": 0.026825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 303.6875, "completions/min_length": 226.0, "epoch": 0.5607940446650124, "grad_norm": 0.18154136836528778, "kl": 0.041748046875, "learning_rate": 4.8123570127598514e-06, "loss": 0.00041741877794265747, "memory(GiB)": 39.01, "reward": 0.26408475637435913, "reward_std": 0.035387322306632996, "rewards/VisualizationJSONCombinedORM/mean": 0.26408475637435913, "rewards/VisualizationJSONCombinedORM/std": 0.0978817492723465, "step": 678, "train_speed(iter/s)": 0.026837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 308.625, "completions/min_length": 242.0, "epoch": 0.5616211745244003, "grad_norm": 0.15753068029880524, "kl": 0.039794921875, "learning_rate": 4.797930519230699e-06, "loss": 0.0003978833556175232, "memory(GiB)": 39.01, "reward": 0.6824887990951538, "reward_std": 0.10801628232002258, "rewards/VisualizationJSONCombinedORM/mean": 0.6824887990951538, "rewards/VisualizationJSONCombinedORM/std": 0.13052956759929657, "step": 679, "train_speed(iter/s)": 0.026846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 306.5625, "completions/min_length": 251.0, "epoch": 0.5624483043837882, "grad_norm": 0.20162633061408997, "kl": 0.041839599609375, "learning_rate": 4.783505710478366e-06, "loss": 0.000418882817029953, "memory(GiB)": 39.01, "reward": 0.532463550567627, "reward_std": 0.09572000801563263, "rewards/VisualizationJSONCombinedORM/mean": 0.532463550567627, "rewards/VisualizationJSONCombinedORM/std": 0.10399055480957031, "step": 680, "train_speed(iter/s)": 0.026862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 289.375, "completions/min_length": 233.0, "epoch": 0.5632754342431762, "grad_norm": 0.16864904761314392, "kl": 0.05621337890625, "learning_rate": 4.7690827067713035e-06, "loss": 0.0005621463060379028, "memory(GiB)": 39.01, "reward": 0.5390952229499817, "reward_std": 0.10660607367753983, "rewards/VisualizationJSONCombinedORM/mean": 0.5390952229499817, "rewards/VisualizationJSONCombinedORM/std": 0.22712263464927673, "step": 681, "train_speed(iter/s)": 0.026875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 289.625, "completions/min_length": 239.0, "epoch": 0.5641025641025641, "grad_norm": 0.17119552195072174, "kl": 0.04095458984375, "learning_rate": 4.75466162836291e-06, "loss": 0.00040828902274370193, "memory(GiB)": 39.01, "reward": 0.6632382869720459, "reward_std": 0.1137123703956604, "rewards/VisualizationJSONCombinedORM/mean": 0.6632382869720459, "rewards/VisualizationJSONCombinedORM/std": 0.11286807060241699, "step": 682, "train_speed(iter/s)": 0.026888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 303.0625, "completions/min_length": 249.0, "epoch": 0.564929693961952, "grad_norm": 0.16165092587471008, "kl": 0.0352783203125, "learning_rate": 4.740242595490537e-06, "loss": 0.0003511030226945877, "memory(GiB)": 39.01, "reward": 0.5066663026809692, "reward_std": 0.05083434656262398, "rewards/VisualizationJSONCombinedORM/mean": 0.5066663026809692, "rewards/VisualizationJSONCombinedORM/std": 0.13605259358882904, "step": 683, "train_speed(iter/s)": 0.026898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 317.75, "completions/min_length": 256.0, "epoch": 0.56575682382134, "grad_norm": 0.17220641672611237, "kl": 0.03466796875, "learning_rate": 4.725825728374479e-06, "loss": 0.00034622102975845337, "memory(GiB)": 39.01, "reward": 0.7113417387008667, "reward_std": 0.10412628948688507, "rewards/VisualizationJSONCombinedORM/mean": 0.7113417387008667, "rewards/VisualizationJSONCombinedORM/std": 0.12257516384124756, "step": 684, "train_speed(iter/s)": 0.026911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 306.875, "completions/min_length": 241.0, "epoch": 0.5665839536807279, "grad_norm": 0.1723683625459671, "kl": 0.0384521484375, "learning_rate": 4.711411147216969e-06, "loss": 0.0003845691680908203, "memory(GiB)": 39.01, "reward": 0.6538636684417725, "reward_std": 0.09057874977588654, "rewards/VisualizationJSONCombinedORM/mean": 0.6538636684417725, "rewards/VisualizationJSONCombinedORM/std": 0.14514890313148499, "step": 685, "train_speed(iter/s)": 0.026918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 337.125, "completions/min_length": 265.0, "epoch": 0.5674110835401158, "grad_norm": 0.15550270676612854, "kl": 0.037811279296875, "learning_rate": 4.696998972201189e-06, "loss": 0.0003778040409088135, "memory(GiB)": 39.01, "reward": 0.5381402969360352, "reward_std": 0.06209126114845276, "rewards/VisualizationJSONCombinedORM/mean": 0.5381402969360352, "rewards/VisualizationJSONCombinedORM/std": 0.19674807786941528, "step": 686, "train_speed(iter/s)": 0.026927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 315.75, "completions/min_length": 236.0, "epoch": 0.5682382133995038, "grad_norm": 0.1693613976240158, "kl": 0.034515380859375, "learning_rate": 4.6825893234902485e-06, "loss": 0.0003451928496360779, "memory(GiB)": 39.01, "reward": 0.5486926436424255, "reward_std": 0.0567917674779892, "rewards/VisualizationJSONCombinedORM/mean": 0.5486926436424255, "rewards/VisualizationJSONCombinedORM/std": 0.07312964648008347, "step": 687, "train_speed(iter/s)": 0.026939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 313.0625, "completions/min_length": 274.0, "epoch": 0.5690653432588917, "grad_norm": 0.21979081630706787, "kl": 0.0465087890625, "learning_rate": 4.668182321226205e-06, "loss": 0.00046473927795886993, "memory(GiB)": 39.01, "reward": 0.6458298563957214, "reward_std": 0.13464708626270294, "rewards/VisualizationJSONCombinedORM/mean": 0.6458298563957214, "rewards/VisualizationJSONCombinedORM/std": 0.13411667943000793, "step": 688, "train_speed(iter/s)": 0.02695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 317.9375, "completions/min_length": 232.0, "epoch": 0.5698924731182796, "grad_norm": 0.14197880029678345, "kl": 0.0704345703125, "learning_rate": 4.653778085529043e-06, "loss": 0.0007049143314361572, "memory(GiB)": 39.01, "reward": 0.4883829951286316, "reward_std": 0.06577118486166, "rewards/VisualizationJSONCombinedORM/mean": 0.4883829951286316, "rewards/VisualizationJSONCombinedORM/std": 0.29062679409980774, "step": 689, "train_speed(iter/s)": 0.026955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 347.1875, "completions/min_length": 272.0, "epoch": 0.5707196029776674, "grad_norm": 0.1601010262966156, "kl": 0.0863037109375, "learning_rate": 4.639376736495683e-06, "loss": 0.0008627660572528839, "memory(GiB)": 39.01, "reward": 0.43008938431739807, "reward_std": 0.07572565972805023, "rewards/VisualizationJSONCombinedORM/mean": 0.43008938431739807, "rewards/VisualizationJSONCombinedORM/std": 0.17209890484809875, "step": 690, "train_speed(iter/s)": 0.026961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 311.75, "completions/min_length": 250.0, "epoch": 0.5715467328370554, "grad_norm": 0.1873917430639267, "kl": 0.0457763671875, "learning_rate": 4.624978394198978e-06, "loss": 0.00045709311962127686, "memory(GiB)": 39.01, "reward": 0.30651864409446716, "reward_std": 0.05888194963335991, "rewards/VisualizationJSONCombinedORM/mean": 0.30651864409446716, "rewards/VisualizationJSONCombinedORM/std": 0.1182817742228508, "step": 691, "train_speed(iter/s)": 0.026975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 310.4375, "completions/min_length": 255.0, "epoch": 0.5723738626964433, "grad_norm": 0.18235838413238525, "kl": 0.04571533203125, "learning_rate": 4.610583178686715e-06, "loss": 0.0004569888114929199, "memory(GiB)": 39.01, "reward": 0.44124072790145874, "reward_std": 0.08376715332269669, "rewards/VisualizationJSONCombinedORM/mean": 0.44124072790145874, "rewards/VisualizationJSONCombinedORM/std": 0.13022080063819885, "step": 692, "train_speed(iter/s)": 0.02699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 294.5625, "completions/min_length": 237.0, "epoch": 0.5732009925558312, "grad_norm": 0.1405756026506424, "kl": 0.05303955078125, "learning_rate": 4.596191209980604e-06, "loss": 0.0005293413996696472, "memory(GiB)": 39.01, "reward": 0.49145573377609253, "reward_std": 0.05967699736356735, "rewards/VisualizationJSONCombinedORM/mean": 0.49145573377609253, "rewards/VisualizationJSONCombinedORM/std": 0.09092236310243607, "step": 693, "train_speed(iter/s)": 0.027005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 298.25, "completions/min_length": 236.0, "epoch": 0.5740281224152192, "grad_norm": 0.1914595067501068, "kl": 0.0452880859375, "learning_rate": 4.58180260807529e-06, "loss": 0.0004538223147392273, "memory(GiB)": 39.01, "reward": 0.6414347887039185, "reward_std": 0.1220984160900116, "rewards/VisualizationJSONCombinedORM/mean": 0.6414347887039185, "rewards/VisualizationJSONCombinedORM/std": 0.12389993667602539, "step": 694, "train_speed(iter/s)": 0.02701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 302.0625, "completions/min_length": 247.0, "epoch": 0.5748552522746071, "grad_norm": 0.15534597635269165, "kl": 0.03802490234375, "learning_rate": 4.567417492937344e-06, "loss": 0.00038051605224609375, "memory(GiB)": 39.01, "reward": 0.5609328746795654, "reward_std": 0.08807903528213501, "rewards/VisualizationJSONCombinedORM/mean": 0.5609328746795654, "rewards/VisualizationJSONCombinedORM/std": 0.16417990624904633, "step": 695, "train_speed(iter/s)": 0.027022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 319.5, "completions/min_length": 259.0, "epoch": 0.575682382133995, "grad_norm": 0.19184432923793793, "kl": 0.03729248046875, "learning_rate": 4.553035984504269e-06, "loss": 0.00037414440885186195, "memory(GiB)": 39.01, "reward": 0.414498895406723, "reward_std": 0.07441607117652893, "rewards/VisualizationJSONCombinedORM/mean": 0.414498895406723, "rewards/VisualizationJSONCombinedORM/std": 0.10341918468475342, "step": 696, "train_speed(iter/s)": 0.027036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 311.75, "completions/min_length": 247.0, "epoch": 0.576509511993383, "grad_norm": 0.17472881078720093, "kl": 0.033966064453125, "learning_rate": 4.53865820268349e-06, "loss": 0.00033993087708950043, "memory(GiB)": 39.01, "reward": 0.7038142085075378, "reward_std": 0.09947121143341064, "rewards/VisualizationJSONCombinedORM/mean": 0.7038142085075378, "rewards/VisualizationJSONCombinedORM/std": 0.09914425760507584, "step": 697, "train_speed(iter/s)": 0.027045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 312.9375, "completions/min_length": 255.0, "epoch": 0.5773366418527709, "grad_norm": 0.18967510759830475, "kl": 0.034088134765625, "learning_rate": 4.524284267351372e-06, "loss": 0.00034143775701522827, "memory(GiB)": 39.01, "reward": 0.6808557510375977, "reward_std": 0.10658062994480133, "rewards/VisualizationJSONCombinedORM/mean": 0.6808557510375977, "rewards/VisualizationJSONCombinedORM/std": 0.10411670804023743, "step": 698, "train_speed(iter/s)": 0.027059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 323.4375, "completions/min_length": 252.0, "epoch": 0.5781637717121588, "grad_norm": 0.18413466215133667, "kl": 0.033355712890625, "learning_rate": 4.509914298352197e-06, "loss": 0.00033324211835861206, "memory(GiB)": 39.01, "reward": 0.37132728099823, "reward_std": 0.05261163413524628, "rewards/VisualizationJSONCombinedORM/mean": 0.37132728099823, "rewards/VisualizationJSONCombinedORM/std": 0.10354439169168472, "step": 699, "train_speed(iter/s)": 0.027071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 276.8125, "completions/min_length": 217.0, "epoch": 0.5789909015715468, "grad_norm": 0.1911332905292511, "kl": 0.04541015625, "learning_rate": 4.4955484154971875e-06, "loss": 0.0004543401300907135, "memory(GiB)": 39.01, "reward": 0.5602086186408997, "reward_std": 0.11158397793769836, "rewards/VisualizationJSONCombinedORM/mean": 0.5602086186408997, "rewards/VisualizationJSONCombinedORM/std": 0.11274952441453934, "step": 700, "train_speed(iter/s)": 0.027085 }, { "epoch": 0.5789909015715468, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 384.8333333333333, "eval_completions/mean_length": 323.3177083333333, "eval_completions/min_length": 268.2083333333333, "eval_kl": 0.046641031901041664, "eval_loss": 0.0004680730926338583, "eval_reward": 0.458034207423528, "eval_reward_std": 0.07626213137215625, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.458034207423528, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0762621316825971, "eval_runtime": 323.424, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.009, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 322.125, "completions/min_length": 245.0, "epoch": 0.5798180314309347, "grad_norm": 0.1795215755701065, "kl": 0.048583984375, "learning_rate": 4.4811867385634916e-06, "loss": 0.00048592686653137207, "memory(GiB)": 39.01, "reward": 0.4449877142906189, "reward_std": 0.09704525768756866, "rewards/VisualizationJSONCombinedORM/mean": 0.4449877142906189, "rewards/VisualizationJSONCombinedORM/std": 0.1263332962989807, "step": 701, "train_speed(iter/s)": 0.026761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 299.375, "completions/min_length": 247.0, "epoch": 0.5806451612903226, "grad_norm": 0.15984833240509033, "kl": 0.035919189453125, "learning_rate": 4.4668293872931904e-06, "loss": 0.00035881251096725464, "memory(GiB)": 39.01, "reward": 0.4617137014865875, "reward_std": 0.07668576389551163, "rewards/VisualizationJSONCombinedORM/mean": 0.4617137014865875, "rewards/VisualizationJSONCombinedORM/std": 0.07738348841667175, "step": 702, "train_speed(iter/s)": 0.026772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 319.25, "completions/min_length": 242.0, "epoch": 0.5814722911497106, "grad_norm": 0.17283020913600922, "kl": 0.03424072265625, "learning_rate": 4.452476481392302e-06, "loss": 0.000342655461281538, "memory(GiB)": 39.01, "reward": 0.3824649453163147, "reward_std": 0.04655318707227707, "rewards/VisualizationJSONCombinedORM/mean": 0.3824649453163147, "rewards/VisualizationJSONCombinedORM/std": 0.06893063336610794, "step": 703, "train_speed(iter/s)": 0.026785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 323.875, "completions/min_length": 241.0, "epoch": 0.5822994210090985, "grad_norm": 0.17683255672454834, "kl": 0.05255126953125, "learning_rate": 4.438128140529779e-06, "loss": 0.0005255714058876038, "memory(GiB)": 39.01, "reward": 0.5693560838699341, "reward_std": 0.10521242022514343, "rewards/VisualizationJSONCombinedORM/mean": 0.5693560838699341, "rewards/VisualizationJSONCombinedORM/std": 0.1777133345603943, "step": 704, "train_speed(iter/s)": 0.0268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 322.875, "completions/min_length": 268.0, "epoch": 0.5831265508684863, "grad_norm": 0.1803356558084488, "kl": 0.0521240234375, "learning_rate": 4.4237844843365126e-06, "loss": 0.0005219839513301849, "memory(GiB)": 39.01, "reward": 0.5310289859771729, "reward_std": 0.10793884098529816, "rewards/VisualizationJSONCombinedORM/mean": 0.5310289859771729, "rewards/VisualizationJSONCombinedORM/std": 0.2434508353471756, "step": 705, "train_speed(iter/s)": 0.02681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 309.0625, "completions/min_length": 244.0, "epoch": 0.5839536807278742, "grad_norm": 0.19138649106025696, "kl": 0.0496826171875, "learning_rate": 4.409445632404334e-06, "loss": 0.0004981271922588348, "memory(GiB)": 39.01, "reward": 0.5357307195663452, "reward_std": 0.132486030459404, "rewards/VisualizationJSONCombinedORM/mean": 0.5357307195663452, "rewards/VisualizationJSONCombinedORM/std": 0.16352513432502747, "step": 706, "train_speed(iter/s)": 0.026826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 309.5625, "completions/min_length": 240.0, "epoch": 0.5847808105872622, "grad_norm": 0.16979998350143433, "kl": 0.057373046875, "learning_rate": 4.395111704285021e-06, "loss": 0.0005734339356422424, "memory(GiB)": 39.01, "reward": 0.6110154390335083, "reward_std": 0.0930948406457901, "rewards/VisualizationJSONCombinedORM/mean": 0.6110154390335083, "rewards/VisualizationJSONCombinedORM/std": 0.1379791796207428, "step": 707, "train_speed(iter/s)": 0.026837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 312.0, "completions/min_length": 245.0, "epoch": 0.5856079404466501, "grad_norm": 0.19572283327579498, "kl": 0.049560546875, "learning_rate": 4.380782819489295e-06, "loss": 0.0004952959716320038, "memory(GiB)": 39.01, "reward": 0.4972788691520691, "reward_std": 0.07966504991054535, "rewards/VisualizationJSONCombinedORM/mean": 0.4972788691520691, "rewards/VisualizationJSONCombinedORM/std": 0.14344705641269684, "step": 708, "train_speed(iter/s)": 0.026844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 307.125, "completions/min_length": 248.0, "epoch": 0.586435070306038, "grad_norm": 0.1837088018655777, "kl": 0.05987548828125, "learning_rate": 4.366459097485832e-06, "loss": 0.0005981400609016418, "memory(GiB)": 39.01, "reward": 0.41457778215408325, "reward_std": 0.07383047789335251, "rewards/VisualizationJSONCombinedORM/mean": 0.41457778215408325, "rewards/VisualizationJSONCombinedORM/std": 0.17684894800186157, "step": 709, "train_speed(iter/s)": 0.026858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 310.375, "completions/min_length": 230.0, "epoch": 0.587262200165426, "grad_norm": 0.18184104561805725, "kl": 0.04217529296875, "learning_rate": 4.352140657700259e-06, "loss": 0.0004205554723739624, "memory(GiB)": 39.01, "reward": 0.4629880487918854, "reward_std": 0.0460008941590786, "rewards/VisualizationJSONCombinedORM/mean": 0.4629880487918854, "rewards/VisualizationJSONCombinedORM/std": 0.13303615152835846, "step": 710, "train_speed(iter/s)": 0.026873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 293.125, "completions/min_length": 253.0, "epoch": 0.5880893300248139, "grad_norm": 0.1897808015346527, "kl": 0.06524658203125, "learning_rate": 4.3378276195141665e-06, "loss": 0.0006519295275211334, "memory(GiB)": 39.01, "reward": 0.558233380317688, "reward_std": 0.12253482639789581, "rewards/VisualizationJSONCombinedORM/mean": 0.558233380317688, "rewards/VisualizationJSONCombinedORM/std": 0.11971059441566467, "step": 711, "train_speed(iter/s)": 0.026879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 349.875, "completions/min_length": 291.0, "epoch": 0.5889164598842018, "grad_norm": 0.1645144820213318, "kl": 0.04193115234375, "learning_rate": 4.323520102264103e-06, "loss": 0.00041969865560531616, "memory(GiB)": 39.09, "reward": 0.6336236000061035, "reward_std": 0.1262136548757553, "rewards/VisualizationJSONCombinedORM/mean": 0.6336236000061035, "rewards/VisualizationJSONCombinedORM/std": 0.13967643678188324, "step": 712, "train_speed(iter/s)": 0.026883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 326.125, "completions/min_length": 256.0, "epoch": 0.5897435897435898, "grad_norm": 0.14415891468524933, "kl": 0.03045654296875, "learning_rate": 4.309218225240591e-06, "loss": 0.0003046691417694092, "memory(GiB)": 39.09, "reward": 0.34923696517944336, "reward_std": 0.05380465090274811, "rewards/VisualizationJSONCombinedORM/mean": 0.34923696517944336, "rewards/VisualizationJSONCombinedORM/std": 0.054866377264261246, "step": 713, "train_speed(iter/s)": 0.026895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 308.75, "completions/min_length": 224.0, "epoch": 0.5905707196029777, "grad_norm": 0.19774273037910461, "kl": 0.06768798828125, "learning_rate": 4.29492210768712e-06, "loss": 0.0006777793169021606, "memory(GiB)": 39.09, "reward": 0.41259220242500305, "reward_std": 0.049191415309906006, "rewards/VisualizationJSONCombinedORM/mean": 0.41259220242500305, "rewards/VisualizationJSONCombinedORM/std": 0.1933918595314026, "step": 714, "train_speed(iter/s)": 0.026906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 324.3125, "completions/min_length": 240.0, "epoch": 0.5913978494623656, "grad_norm": 0.1551990807056427, "kl": 0.0601806640625, "learning_rate": 4.280631868799169e-06, "loss": 0.0006021708250045776, "memory(GiB)": 39.09, "reward": 0.7628326416015625, "reward_std": 0.09940119087696075, "rewards/VisualizationJSONCombinedORM/mean": 0.7628326416015625, "rewards/VisualizationJSONCombinedORM/std": 0.10049088299274445, "step": 715, "train_speed(iter/s)": 0.026912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 311.3125, "completions/min_length": 270.0, "epoch": 0.5922249793217536, "grad_norm": 0.19794510304927826, "kl": 0.087158203125, "learning_rate": 4.266347627723192e-06, "loss": 0.000870727002620697, "memory(GiB)": 39.09, "reward": 0.5602335333824158, "reward_std": 0.19434578716754913, "rewards/VisualizationJSONCombinedORM/mean": 0.5602335333824158, "rewards/VisualizationJSONCombinedORM/std": 0.20292501151561737, "step": 716, "train_speed(iter/s)": 0.026923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 299.0, "completions/min_length": 248.0, "epoch": 0.5930521091811415, "grad_norm": 0.3601945638656616, "kl": 0.06341552734375, "learning_rate": 4.252069503555645e-06, "loss": 0.0006323046982288361, "memory(GiB)": 39.09, "reward": 0.5354035496711731, "reward_std": 0.11031370609998703, "rewards/VisualizationJSONCombinedORM/mean": 0.5354035496711731, "rewards/VisualizationJSONCombinedORM/std": 0.17601728439331055, "step": 717, "train_speed(iter/s)": 0.026935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 311.75, "completions/min_length": 250.0, "epoch": 0.5938792390405294, "grad_norm": 0.15902557969093323, "kl": 0.08294677734375, "learning_rate": 4.23779761534198e-06, "loss": 0.0008304715156555176, "memory(GiB)": 39.09, "reward": 0.6481614112854004, "reward_std": 0.10651085525751114, "rewards/VisualizationJSONCombinedORM/mean": 0.6481614112854004, "rewards/VisualizationJSONCombinedORM/std": 0.1494549661874771, "step": 718, "train_speed(iter/s)": 0.026944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 291.4375, "completions/min_length": 243.0, "epoch": 0.5947063688999172, "grad_norm": 0.18431776762008667, "kl": 0.0867919921875, "learning_rate": 4.223532082075652e-06, "loss": 0.0008681900799274445, "memory(GiB)": 39.09, "reward": 0.5592706203460693, "reward_std": 0.10854686796665192, "rewards/VisualizationJSONCombinedORM/mean": 0.5592706203460693, "rewards/VisualizationJSONCombinedORM/std": 0.20964060723781586, "step": 719, "train_speed(iter/s)": 0.026956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 318.9375, "completions/min_length": 250.0, "epoch": 0.5955334987593052, "grad_norm": 0.17478755116462708, "kl": 0.0758056640625, "learning_rate": 4.20927302269714e-06, "loss": 0.0007572099566459656, "memory(GiB)": 39.09, "reward": 0.6618261337280273, "reward_std": 0.09975790977478027, "rewards/VisualizationJSONCombinedORM/mean": 0.6618261337280273, "rewards/VisualizationJSONCombinedORM/std": 0.10627532005310059, "step": 720, "train_speed(iter/s)": 0.026967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 329.6875, "completions/min_length": 275.0, "epoch": 0.5963606286186931, "grad_norm": 0.17208650708198547, "kl": 0.04864501953125, "learning_rate": 4.195020556092935e-06, "loss": 0.0004873499274253845, "memory(GiB)": 39.09, "reward": 0.37795400619506836, "reward_std": 0.06274686753749847, "rewards/VisualizationJSONCombinedORM/mean": 0.37795400619506836, "rewards/VisualizationJSONCombinedORM/std": 0.10297439992427826, "step": 721, "train_speed(iter/s)": 0.026979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 279.3125, "completions/min_length": 233.0, "epoch": 0.597187758478081, "grad_norm": 0.17713592946529388, "kl": 0.031036376953125, "learning_rate": 4.180774801094572e-06, "loss": 0.0003106622025370598, "memory(GiB)": 39.09, "reward": 0.5971652269363403, "reward_std": 0.07095459848642349, "rewards/VisualizationJSONCombinedORM/mean": 0.5971652269363403, "rewards/VisualizationJSONCombinedORM/std": 0.21003402769565582, "step": 722, "train_speed(iter/s)": 0.026991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 298.5625, "completions/min_length": 225.0, "epoch": 0.598014888337469, "grad_norm": 0.16154006123542786, "kl": 0.0416259765625, "learning_rate": 4.166535876477616e-06, "loss": 0.0004164166748523712, "memory(GiB)": 39.09, "reward": 0.49131858348846436, "reward_std": 0.039978619664907455, "rewards/VisualizationJSONCombinedORM/mean": 0.49131858348846436, "rewards/VisualizationJSONCombinedORM/std": 0.2859432101249695, "step": 723, "train_speed(iter/s)": 0.027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 306.0, "completions/min_length": 239.0, "epoch": 0.5988420181968569, "grad_norm": 0.20368532836437225, "kl": 0.0438232421875, "learning_rate": 4.152303900960692e-06, "loss": 0.0004387013614177704, "memory(GiB)": 39.09, "reward": 0.43220070004463196, "reward_std": 0.08606990426778793, "rewards/VisualizationJSONCombinedORM/mean": 0.43220070004463196, "rewards/VisualizationJSONCombinedORM/std": 0.192823588848114, "step": 724, "train_speed(iter/s)": 0.02701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 277.8125, "completions/min_length": 211.0, "epoch": 0.5996691480562448, "grad_norm": 0.170429527759552, "kl": 0.04052734375, "learning_rate": 4.1380789932044794e-06, "loss": 0.0004052594304084778, "memory(GiB)": 39.09, "reward": 0.5863388180732727, "reward_std": 0.08881151676177979, "rewards/VisualizationJSONCombinedORM/mean": 0.5863388180732727, "rewards/VisualizationJSONCombinedORM/std": 0.14570242166519165, "step": 725, "train_speed(iter/s)": 0.027022 }, { "epoch": 0.5996691480562448, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 358.7083333333333, "eval_completions/mean_length": 300.5885416666667, "eval_completions/min_length": 253.33333333333334, "eval_kl": 0.041508992513020836, "eval_loss": 0.00041578957461752, "eval_reward": 0.445145230119427, "eval_reward_std": 0.07809206711438794, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.445145230119427, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07809207122772932, "eval_runtime": 307.6304, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 295.25, "completions/min_length": 237.0, "epoch": 0.6004962779156328, "grad_norm": 0.18037673830986023, "kl": 0.05694580078125, "learning_rate": 4.123861271810735e-06, "loss": 0.0005697272717952728, "memory(GiB)": 39.09, "reward": 0.6966122984886169, "reward_std": 0.0906350314617157, "rewards/VisualizationJSONCombinedORM/mean": 0.6966122984886169, "rewards/VisualizationJSONCombinedORM/std": 0.08919517695903778, "step": 726, "train_speed(iter/s)": 0.026726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 331.6875, "completions/min_length": 262.0, "epoch": 0.6013234077750207, "grad_norm": 0.19271254539489746, "kl": 0.035400390625, "learning_rate": 4.109650855321291e-06, "loss": 0.0003539472818374634, "memory(GiB)": 39.09, "reward": 0.36826184391975403, "reward_std": 0.08800595998764038, "rewards/VisualizationJSONCombinedORM/mean": 0.36826184391975403, "rewards/VisualizationJSONCombinedORM/std": 0.1428963989019394, "step": 727, "train_speed(iter/s)": 0.026734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 278.9375, "completions/min_length": 220.0, "epoch": 0.6021505376344086, "grad_norm": 0.165066659450531, "kl": 0.03558349609375, "learning_rate": 4.095447862217084e-06, "loss": 0.0003565177321434021, "memory(GiB)": 39.09, "reward": 0.5567970275878906, "reward_std": 0.09215635061264038, "rewards/VisualizationJSONCombinedORM/mean": 0.5567970275878906, "rewards/VisualizationJSONCombinedORM/std": 0.1051534116268158, "step": 728, "train_speed(iter/s)": 0.026746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 303.75, "completions/min_length": 218.0, "epoch": 0.6029776674937966, "grad_norm": 0.17741596698760986, "kl": 0.0531005859375, "learning_rate": 4.081252410917148e-06, "loss": 0.0005312226712703705, "memory(GiB)": 39.09, "reward": 0.4284502863883972, "reward_std": 0.05270426720380783, "rewards/VisualizationJSONCombinedORM/mean": 0.4284502863883972, "rewards/VisualizationJSONCombinedORM/std": 0.20880523324012756, "step": 729, "train_speed(iter/s)": 0.026758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 290.5625, "completions/min_length": 246.0, "epoch": 0.6038047973531845, "grad_norm": 0.168808251619339, "kl": 0.0467529296875, "learning_rate": 4.067064619777645e-06, "loss": 0.0004669204354286194, "memory(GiB)": 39.09, "reward": 0.5451758503913879, "reward_std": 0.08897773176431656, "rewards/VisualizationJSONCombinedORM/mean": 0.5451758503913879, "rewards/VisualizationJSONCombinedORM/std": 0.11377625167369843, "step": 730, "train_speed(iter/s)": 0.026769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 283.25, "completions/min_length": 243.0, "epoch": 0.6046319272125724, "grad_norm": 0.16219264268875122, "kl": 0.021759033203125, "learning_rate": 4.05288460709086e-06, "loss": 0.0002173781394958496, "memory(GiB)": 39.09, "reward": 0.5488399267196655, "reward_std": 0.0515841543674469, "rewards/VisualizationJSONCombinedORM/mean": 0.5488399267196655, "rewards/VisualizationJSONCombinedORM/std": 0.05641086772084236, "step": 731, "train_speed(iter/s)": 0.026783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 286.0625, "completions/min_length": 244.0, "epoch": 0.6054590570719603, "grad_norm": 0.17076437175273895, "kl": 0.06597900390625, "learning_rate": 4.038712491084234e-06, "loss": 0.0006591975688934326, "memory(GiB)": 39.09, "reward": 0.5766642093658447, "reward_std": 0.1769811511039734, "rewards/VisualizationJSONCombinedORM/mean": 0.5766642093658447, "rewards/VisualizationJSONCombinedORM/std": 0.1886696070432663, "step": 732, "train_speed(iter/s)": 0.026794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 304.0625, "completions/min_length": 235.0, "epoch": 0.6062861869313482, "grad_norm": 0.17327800393104553, "kl": 0.02484130859375, "learning_rate": 4.02454838991936e-06, "loss": 0.0002492610365152359, "memory(GiB)": 39.09, "reward": 0.6504430770874023, "reward_std": 0.07002369314432144, "rewards/VisualizationJSONCombinedORM/mean": 0.6504430770874023, "rewards/VisualizationJSONCombinedORM/std": 0.13022620975971222, "step": 733, "train_speed(iter/s)": 0.026806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 311.4375, "completions/min_length": 234.0, "epoch": 0.6071133167907361, "grad_norm": 0.18576715886592865, "kl": 0.03936767578125, "learning_rate": 4.0103924216910104e-06, "loss": 0.00039351359009742737, "memory(GiB)": 39.09, "reward": 0.44165217876434326, "reward_std": 0.06270535290241241, "rewards/VisualizationJSONCombinedORM/mean": 0.44165217876434326, "rewards/VisualizationJSONCombinedORM/std": 0.21480686962604523, "step": 734, "train_speed(iter/s)": 0.026821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 297.1875, "completions/min_length": 244.0, "epoch": 0.607940446650124, "grad_norm": 0.17532195150852203, "kl": 0.030059814453125, "learning_rate": 3.996244704426153e-06, "loss": 0.0003003925085067749, "memory(GiB)": 39.09, "reward": 0.6928845047950745, "reward_std": 0.07497477531433105, "rewards/VisualizationJSONCombinedORM/mean": 0.6928845047950745, "rewards/VisualizationJSONCombinedORM/std": 0.07837599515914917, "step": 735, "train_speed(iter/s)": 0.026832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 300.8125, "completions/min_length": 232.0, "epoch": 0.608767576509512, "grad_norm": 0.21361663937568665, "kl": 0.05023193359375, "learning_rate": 3.982105356082951e-06, "loss": 0.0005026273429393768, "memory(GiB)": 39.09, "reward": 0.4798383116722107, "reward_std": 0.0683751106262207, "rewards/VisualizationJSONCombinedORM/mean": 0.4798383116722107, "rewards/VisualizationJSONCombinedORM/std": 0.21072719991207123, "step": 736, "train_speed(iter/s)": 0.026845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 308.1875, "completions/min_length": 222.0, "epoch": 0.6095947063688999, "grad_norm": 0.19744718074798584, "kl": 0.04766845703125, "learning_rate": 3.967974494549803e-06, "loss": 0.00047656893730163574, "memory(GiB)": 39.09, "reward": 0.3446863293647766, "reward_std": 0.0722118392586708, "rewards/VisualizationJSONCombinedORM/mean": 0.3446863293647766, "rewards/VisualizationJSONCombinedORM/std": 0.17230597138404846, "step": 737, "train_speed(iter/s)": 0.026856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 322.1875, "completions/min_length": 263.0, "epoch": 0.6104218362282878, "grad_norm": 0.18455351889133453, "kl": 0.038330078125, "learning_rate": 3.953852237644337e-06, "loss": 0.0003837607800960541, "memory(GiB)": 39.09, "reward": 0.49278607964515686, "reward_std": 0.05062493681907654, "rewards/VisualizationJSONCombinedORM/mean": 0.49278607964515686, "rewards/VisualizationJSONCombinedORM/std": 0.2296493500471115, "step": 738, "train_speed(iter/s)": 0.02686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 316.375, "completions/min_length": 268.0, "epoch": 0.6112489660876758, "grad_norm": 0.1820392608642578, "kl": 0.0350341796875, "learning_rate": 3.939738703112447e-06, "loss": 0.00035047903656959534, "memory(GiB)": 39.09, "reward": 0.7757720947265625, "reward_std": 0.11063431203365326, "rewards/VisualizationJSONCombinedORM/mean": 0.7757720947265625, "rewards/VisualizationJSONCombinedORM/std": 0.13928046822547913, "step": 739, "train_speed(iter/s)": 0.026873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 305.6875, "completions/min_length": 235.0, "epoch": 0.6120760959470637, "grad_norm": 0.1708388328552246, "kl": 0.022552490234375, "learning_rate": 3.925634008627299e-06, "loss": 0.00022586435079574585, "memory(GiB)": 39.09, "reward": 0.3926669657230377, "reward_std": 0.05743188410997391, "rewards/VisualizationJSONCombinedORM/mean": 0.3926669657230377, "rewards/VisualizationJSONCombinedORM/std": 0.17159894108772278, "step": 740, "train_speed(iter/s)": 0.026883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 329.375, "completions/min_length": 248.0, "epoch": 0.6129032258064516, "grad_norm": 0.22879332304000854, "kl": 0.0418701171875, "learning_rate": 3.911538271788359e-06, "loss": 0.00041799992322921753, "memory(GiB)": 39.09, "reward": 0.530933141708374, "reward_std": 0.1077217310667038, "rewards/VisualizationJSONCombinedORM/mean": 0.530933141708374, "rewards/VisualizationJSONCombinedORM/std": 0.1057758629322052, "step": 741, "train_speed(iter/s)": 0.026887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 288.5625, "completions/min_length": 229.0, "epoch": 0.6137303556658396, "grad_norm": 0.1901131123304367, "kl": 0.038970947265625, "learning_rate": 3.897451610120399e-06, "loss": 0.00038976967334747314, "memory(GiB)": 39.09, "reward": 0.562605619430542, "reward_std": 0.08959943801164627, "rewards/VisualizationJSONCombinedORM/mean": 0.562605619430542, "rewards/VisualizationJSONCombinedORM/std": 0.1321038454771042, "step": 742, "train_speed(iter/s)": 0.026893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 302.375, "completions/min_length": 232.0, "epoch": 0.6145574855252275, "grad_norm": 0.21129243075847626, "kl": 0.0498046875, "learning_rate": 3.883374141072534e-06, "loss": 0.0004991553723812103, "memory(GiB)": 39.09, "reward": 0.39280766248703003, "reward_std": 0.10041357576847076, "rewards/VisualizationJSONCombinedORM/mean": 0.39280766248703003, "rewards/VisualizationJSONCombinedORM/std": 0.19760116934776306, "step": 743, "train_speed(iter/s)": 0.026903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 293.5, "completions/min_length": 228.0, "epoch": 0.6153846153846154, "grad_norm": 0.18073615431785583, "kl": 0.04205322265625, "learning_rate": 3.869305982017229e-06, "loss": 0.0004199296236038208, "memory(GiB)": 39.09, "reward": 0.5975728034973145, "reward_std": 0.08680896461009979, "rewards/VisualizationJSONCombinedORM/mean": 0.5975728034973145, "rewards/VisualizationJSONCombinedORM/std": 0.15211905539035797, "step": 744, "train_speed(iter/s)": 0.026911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 314.625, "completions/min_length": 253.0, "epoch": 0.6162117452440034, "grad_norm": 0.1759830266237259, "kl": 0.0631103515625, "learning_rate": 3.855247250249331e-06, "loss": 0.0006313808262348175, "memory(GiB)": 39.09, "reward": 0.3431816101074219, "reward_std": 0.040249601006507874, "rewards/VisualizationJSONCombinedORM/mean": 0.3431816101074219, "rewards/VisualizationJSONCombinedORM/std": 0.10161489993333817, "step": 745, "train_speed(iter/s)": 0.026919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 289.0625, "completions/min_length": 246.0, "epoch": 0.6170388751033913, "grad_norm": 0.17615538835525513, "kl": 0.0325927734375, "learning_rate": 3.84119806298508e-06, "loss": 0.0003252774477005005, "memory(GiB)": 39.09, "reward": 0.5190376043319702, "reward_std": 0.04998776316642761, "rewards/VisualizationJSONCombinedORM/mean": 0.5190376043319702, "rewards/VisualizationJSONCombinedORM/std": 0.27265891432762146, "step": 746, "train_speed(iter/s)": 0.026932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 327.6875, "completions/min_length": 246.0, "epoch": 0.6178660049627791, "grad_norm": 0.1894257515668869, "kl": 0.04742431640625, "learning_rate": 3.827158537361144e-06, "loss": 0.0004745200276374817, "memory(GiB)": 39.09, "reward": 0.41238895058631897, "reward_std": 0.0831519290804863, "rewards/VisualizationJSONCombinedORM/mean": 0.41238895058631897, "rewards/VisualizationJSONCombinedORM/std": 0.09486260265111923, "step": 747, "train_speed(iter/s)": 0.026945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 305.6875, "completions/min_length": 229.0, "epoch": 0.618693134822167, "grad_norm": 0.19610245525836945, "kl": 0.1160888671875, "learning_rate": 3.8131287904336288e-06, "loss": 0.0011625252664089203, "memory(GiB)": 39.09, "reward": 0.29328757524490356, "reward_std": 0.041803956031799316, "rewards/VisualizationJSONCombinedORM/mean": 0.29328757524490356, "rewards/VisualizationJSONCombinedORM/std": 0.09426872432231903, "step": 748, "train_speed(iter/s)": 0.026954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 310.0625, "completions/min_length": 252.0, "epoch": 0.619520264681555, "grad_norm": 0.18105171620845795, "kl": 0.0556640625, "learning_rate": 3.7991089391771185e-06, "loss": 0.0005568042397499084, "memory(GiB)": 39.09, "reward": 0.6013686060905457, "reward_std": 0.0834193229675293, "rewards/VisualizationJSONCombinedORM/mean": 0.6013686060905457, "rewards/VisualizationJSONCombinedORM/std": 0.08705799281597137, "step": 749, "train_speed(iter/s)": 0.026966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 236.5, "completions/min_length": 200.0, "epoch": 0.6203473945409429, "grad_norm": 0.18477380275726318, "kl": 0.04852294921875, "learning_rate": 3.7850991004836813e-06, "loss": 0.0004852898418903351, "memory(GiB)": 39.09, "reward": 0.33984890580177307, "reward_std": 0.09820716083049774, "rewards/VisualizationJSONCombinedORM/mean": 0.33984890580177307, "rewards/VisualizationJSONCombinedORM/std": 0.1193234771490097, "step": 750, "train_speed(iter/s)": 0.026978 }, { "epoch": 0.6203473945409429, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 356.1666666666667, "eval_completions/mean_length": 299.3958333333333, "eval_completions/min_length": 249.58333333333334, "eval_kl": 0.039454142252604164, "eval_loss": 0.0003963485360145569, "eval_reward": 0.4070987496525049, "eval_reward_std": 0.06651404740599294, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4070987496525049, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06651404694033165, "eval_runtime": 305.7653, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 323.875, "completions/min_length": 208.0, "epoch": 0.6211745244003308, "grad_norm": 0.1707889288663864, "kl": 0.032196044921875, "learning_rate": 3.7710993911619093e-06, "loss": 0.00032144784927368164, "memory(GiB)": 39.09, "reward": 0.3230123221874237, "reward_std": 0.04633169621229172, "rewards/VisualizationJSONCombinedORM/mean": 0.3230123221874237, "rewards/VisualizationJSONCombinedORM/std": 0.1753392219543457, "step": 751, "train_speed(iter/s)": 0.026694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 306.3125, "completions/min_length": 228.0, "epoch": 0.6220016542597188, "grad_norm": 0.18581990897655487, "kl": 0.04327392578125, "learning_rate": 3.757109927935943e-06, "loss": 0.00043259933590888977, "memory(GiB)": 39.09, "reward": 0.5244652628898621, "reward_std": 0.05688120052218437, "rewards/VisualizationJSONCombinedORM/mean": 0.5244652628898621, "rewards/VisualizationJSONCombinedORM/std": 0.2298223078250885, "step": 752, "train_speed(iter/s)": 0.026706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 287.875, "completions/min_length": 232.0, "epoch": 0.6228287841191067, "grad_norm": 0.2167138159275055, "kl": 0.0482177734375, "learning_rate": 3.743130827444487e-06, "loss": 0.00048273801803588867, "memory(GiB)": 39.09, "reward": 0.34440043568611145, "reward_std": 0.05412295088171959, "rewards/VisualizationJSONCombinedORM/mean": 0.34440043568611145, "rewards/VisualizationJSONCombinedORM/std": 0.15545323491096497, "step": 753, "train_speed(iter/s)": 0.026719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 297.5625, "completions/min_length": 211.0, "epoch": 0.6236559139784946, "grad_norm": 0.18416966497898102, "kl": 0.03399658203125, "learning_rate": 3.7291622062398523e-06, "loss": 0.00033984333276748657, "memory(GiB)": 39.09, "reward": 0.5385359525680542, "reward_std": 0.07585489004850388, "rewards/VisualizationJSONCombinedORM/mean": 0.5385359525680542, "rewards/VisualizationJSONCombinedORM/std": 0.16062213480472565, "step": 754, "train_speed(iter/s)": 0.026731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 308.6875, "completions/min_length": 245.0, "epoch": 0.6244830438378826, "grad_norm": 0.18790186941623688, "kl": 0.05426025390625, "learning_rate": 3.7152041807869744e-06, "loss": 0.0005430206656455994, "memory(GiB)": 39.09, "reward": 0.5409228801727295, "reward_std": 0.11351431906223297, "rewards/VisualizationJSONCombinedORM/mean": 0.5409228801727295, "rewards/VisualizationJSONCombinedORM/std": 0.166110098361969, "step": 755, "train_speed(iter/s)": 0.026744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 272.5, "completions/min_length": 232.0, "epoch": 0.6253101736972705, "grad_norm": 0.21130062639713287, "kl": 0.0390625, "learning_rate": 3.7012568674624473e-06, "loss": 0.0003909692168235779, "memory(GiB)": 39.09, "reward": 0.5182757377624512, "reward_std": 0.09078778326511383, "rewards/VisualizationJSONCombinedORM/mean": 0.5182757377624512, "rewards/VisualizationJSONCombinedORM/std": 0.09323818981647491, "step": 756, "train_speed(iter/s)": 0.026758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 317.125, "completions/min_length": 230.0, "epoch": 0.6261373035566584, "grad_norm": 0.20023176074028015, "kl": 0.0386962890625, "learning_rate": 3.6873203825535473e-06, "loss": 0.0003870539367198944, "memory(GiB)": 39.09, "reward": 0.5453386306762695, "reward_std": 0.08449876308441162, "rewards/VisualizationJSONCombinedORM/mean": 0.5453386306762695, "rewards/VisualizationJSONCombinedORM/std": 0.12232367694377899, "step": 757, "train_speed(iter/s)": 0.026763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 305.375, "completions/min_length": 236.0, "epoch": 0.6269644334160464, "grad_norm": 0.18292027711868286, "kl": 0.0390625, "learning_rate": 3.673394842257275e-06, "loss": 0.0003893151879310608, "memory(GiB)": 39.09, "reward": 0.49313998222351074, "reward_std": 0.05280131846666336, "rewards/VisualizationJSONCombinedORM/mean": 0.49313998222351074, "rewards/VisualizationJSONCombinedORM/std": 0.1366647332906723, "step": 758, "train_speed(iter/s)": 0.02677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 291.875, "completions/min_length": 199.0, "epoch": 0.6277915632754343, "grad_norm": 0.1713920533657074, "kl": 0.03558349609375, "learning_rate": 3.659480362679371e-06, "loss": 0.00035633891820907593, "memory(GiB)": 39.09, "reward": 0.5496589541435242, "reward_std": 0.07701592892408371, "rewards/VisualizationJSONCombinedORM/mean": 0.5496589541435242, "rewards/VisualizationJSONCombinedORM/std": 0.2573074996471405, "step": 759, "train_speed(iter/s)": 0.026777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 277.0, "completions/min_length": 239.0, "epoch": 0.6286186931348222, "grad_norm": 0.19294561445713043, "kl": 0.032379150390625, "learning_rate": 3.6455770598333633e-06, "loss": 0.0003242567181587219, "memory(GiB)": 39.09, "reward": 0.7403438091278076, "reward_std": 0.08873221278190613, "rewards/VisualizationJSONCombinedORM/mean": 0.7403438091278076, "rewards/VisualizationJSONCombinedORM/std": 0.09417212754487991, "step": 760, "train_speed(iter/s)": 0.026785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 279.625, "completions/min_length": 234.0, "epoch": 0.62944582299421, "grad_norm": 0.1691361367702484, "kl": 0.02783203125, "learning_rate": 3.6316850496395863e-06, "loss": 0.0002780333161354065, "memory(GiB)": 39.09, "reward": 0.7133947610855103, "reward_std": 0.08348262310028076, "rewards/VisualizationJSONCombinedORM/mean": 0.7133947610855103, "rewards/VisualizationJSONCombinedORM/std": 0.10268975794315338, "step": 761, "train_speed(iter/s)": 0.026796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 290.9375, "completions/min_length": 247.0, "epoch": 0.630272952853598, "grad_norm": 0.19315141439437866, "kl": 0.078369140625, "learning_rate": 3.6178044479242256e-06, "loss": 0.0007828250527381897, "memory(GiB)": 39.09, "reward": 0.46818429231643677, "reward_std": 0.11395551264286041, "rewards/VisualizationJSONCombinedORM/mean": 0.46818429231643677, "rewards/VisualizationJSONCombinedORM/std": 0.21796740591526031, "step": 762, "train_speed(iter/s)": 0.026802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 296.625, "completions/min_length": 232.0, "epoch": 0.6311000827129859, "grad_norm": 0.20620976388454437, "kl": 0.0919189453125, "learning_rate": 3.603935370418342e-06, "loss": 0.0009195283055305481, "memory(GiB)": 39.09, "reward": 0.39011532068252563, "reward_std": 0.07769772410392761, "rewards/VisualizationJSONCombinedORM/mean": 0.39011532068252563, "rewards/VisualizationJSONCombinedORM/std": 0.13886606693267822, "step": 763, "train_speed(iter/s)": 0.026816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 301.6875, "completions/min_length": 246.0, "epoch": 0.6319272125723738, "grad_norm": 0.2026158571243286, "kl": 0.0966796875, "learning_rate": 3.59007793275692e-06, "loss": 0.0009662844240665436, "memory(GiB)": 39.09, "reward": 0.5031659603118896, "reward_std": 0.09976208209991455, "rewards/VisualizationJSONCombinedORM/mean": 0.5031659603118896, "rewards/VisualizationJSONCombinedORM/std": 0.16107124090194702, "step": 764, "train_speed(iter/s)": 0.026826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 289.75, "completions/min_length": 239.0, "epoch": 0.6327543424317618, "grad_norm": 0.2107073813676834, "kl": 0.03851318359375, "learning_rate": 3.5762322504778846e-06, "loss": 0.00038420408964157104, "memory(GiB)": 39.09, "reward": 0.517541766166687, "reward_std": 0.06670857220888138, "rewards/VisualizationJSONCombinedORM/mean": 0.517541766166687, "rewards/VisualizationJSONCombinedORM/std": 0.20264354348182678, "step": 765, "train_speed(iter/s)": 0.026837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 299.0, "completions/min_length": 232.0, "epoch": 0.6335814722911497, "grad_norm": 0.22611282765865326, "kl": 0.08984375, "learning_rate": 3.5623984390211597e-06, "loss": 0.0008981227874755859, "memory(GiB)": 39.09, "reward": 0.30566883087158203, "reward_std": 0.05412028357386589, "rewards/VisualizationJSONCombinedORM/mean": 0.30566883087158203, "rewards/VisualizationJSONCombinedORM/std": 0.15811415016651154, "step": 766, "train_speed(iter/s)": 0.026849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 291.5, "completions/min_length": 225.0, "epoch": 0.6344086021505376, "grad_norm": 0.21709448099136353, "kl": 0.08709716796875, "learning_rate": 3.5485766137276894e-06, "loss": 0.0008706040680408478, "memory(GiB)": 39.09, "reward": 0.3901812434196472, "reward_std": 0.0782947689294815, "rewards/VisualizationJSONCombinedORM/mean": 0.3901812434196472, "rewards/VisualizationJSONCombinedORM/std": 0.20947623252868652, "step": 767, "train_speed(iter/s)": 0.026861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 270.1875, "completions/min_length": 226.0, "epoch": 0.6352357320099256, "grad_norm": 0.14012396335601807, "kl": 0.056396484375, "learning_rate": 3.5347668898384805e-06, "loss": 0.0005629025399684906, "memory(GiB)": 39.09, "reward": 0.40872400999069214, "reward_std": 0.07673820853233337, "rewards/VisualizationJSONCombinedORM/mean": 0.40872400999069214, "rewards/VisualizationJSONCombinedORM/std": 0.0794001966714859, "step": 768, "train_speed(iter/s)": 0.026872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 290.1875, "completions/min_length": 242.0, "epoch": 0.6360628618693135, "grad_norm": 0.20477113127708435, "kl": 0.05596923828125, "learning_rate": 3.5209693824936486e-06, "loss": 0.0005600824952125549, "memory(GiB)": 39.09, "reward": 0.675133228302002, "reward_std": 0.11214213073253632, "rewards/VisualizationJSONCombinedORM/mean": 0.675133228302002, "rewards/VisualizationJSONCombinedORM/std": 0.11159557849168777, "step": 769, "train_speed(iter/s)": 0.026887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 290.9375, "completions/min_length": 226.0, "epoch": 0.6368899917287014, "grad_norm": 0.18342682719230652, "kl": 0.03643798828125, "learning_rate": 3.5071842067314453e-06, "loss": 0.0003644227981567383, "memory(GiB)": 39.09, "reward": 0.2499551773071289, "reward_std": 0.03726818040013313, "rewards/VisualizationJSONCombinedORM/mean": 0.2499551773071289, "rewards/VisualizationJSONCombinedORM/std": 0.04904038459062576, "step": 770, "train_speed(iter/s)": 0.026898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 293.875, "completions/min_length": 235.0, "epoch": 0.6377171215880894, "grad_norm": 0.2304990291595459, "kl": 0.06109619140625, "learning_rate": 3.4934114774873153e-06, "loss": 0.0006112046539783478, "memory(GiB)": 39.09, "reward": 0.36727848649024963, "reward_std": 0.10840161889791489, "rewards/VisualizationJSONCombinedORM/mean": 0.36727848649024963, "rewards/VisualizationJSONCombinedORM/std": 0.17857417464256287, "step": 771, "train_speed(iter/s)": 0.026905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 277.0, "completions/min_length": 226.0, "epoch": 0.6385442514474773, "grad_norm": 0.15937539935112, "kl": 0.02447509765625, "learning_rate": 3.4796513095929178e-06, "loss": 0.00024427473545074463, "memory(GiB)": 39.09, "reward": 0.7036239504814148, "reward_std": 0.07580849528312683, "rewards/VisualizationJSONCombinedORM/mean": 0.7036239504814148, "rewards/VisualizationJSONCombinedORM/std": 0.09206691384315491, "step": 772, "train_speed(iter/s)": 0.026916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 284.1875, "completions/min_length": 245.0, "epoch": 0.6393713813068652, "grad_norm": 0.17034153640270233, "kl": 0.0357666015625, "learning_rate": 3.4659038177751918e-06, "loss": 0.00035768747329711914, "memory(GiB)": 39.09, "reward": 0.7188844084739685, "reward_std": 0.09742911159992218, "rewards/VisualizationJSONCombinedORM/mean": 0.7188844084739685, "rewards/VisualizationJSONCombinedORM/std": 0.09580183029174805, "step": 773, "train_speed(iter/s)": 0.026927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 273.0, "completions/min_length": 209.0, "epoch": 0.6401985111662531, "grad_norm": 0.19060292840003967, "kl": 0.1571044921875, "learning_rate": 3.4521691166553777e-06, "loss": 0.0015686601400375366, "memory(GiB)": 39.09, "reward": 0.3281056880950928, "reward_std": 0.045088991522789, "rewards/VisualizationJSONCombinedORM/mean": 0.3281056880950928, "rewards/VisualizationJSONCombinedORM/std": 0.06510844826698303, "step": 774, "train_speed(iter/s)": 0.026937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 285.4375, "completions/min_length": 235.0, "epoch": 0.6410256410256411, "grad_norm": 0.18128417432308197, "kl": 0.03076171875, "learning_rate": 3.438447320748082e-06, "loss": 0.00030757486820220947, "memory(GiB)": 39.09, "reward": 0.5113586187362671, "reward_std": 0.09254130721092224, "rewards/VisualizationJSONCombinedORM/mean": 0.5113586187362671, "rewards/VisualizationJSONCombinedORM/std": 0.1778956800699234, "step": 775, "train_speed(iter/s)": 0.026951 }, { "epoch": 0.6410256410256411, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 342.1666666666667, "eval_completions/mean_length": 290.015625, "eval_completions/min_length": 243.79166666666666, "eval_kl": 0.046284993489583336, "eval_loss": 0.0004632634518202394, "eval_reward": 0.45996410710116226, "eval_reward_std": 0.06501429070097704, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45996410710116226, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06501429163229962, "eval_runtime": 298.0799, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 307.25, "completions/min_length": 258.0, "epoch": 0.6418527708850289, "grad_norm": 0.22048896551132202, "kl": 0.044677734375, "learning_rate": 3.424738544460302e-06, "loss": 0.00044615939259529114, "memory(GiB)": 39.09, "reward": 0.4006553590297699, "reward_std": 0.05757874995470047, "rewards/VisualizationJSONCombinedORM/mean": 0.4006553590297699, "rewards/VisualizationJSONCombinedORM/std": 0.18093933165073395, "step": 776, "train_speed(iter/s)": 0.026681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 292.4375, "completions/min_length": 234.0, "epoch": 0.6426799007444168, "grad_norm": 0.20305271446704865, "kl": 0.076904296875, "learning_rate": 3.4110429020904924e-06, "loss": 0.0007687155157327652, "memory(GiB)": 39.09, "reward": 0.5581347942352295, "reward_std": 0.05735781788825989, "rewards/VisualizationJSONCombinedORM/mean": 0.5581347942352295, "rewards/VisualizationJSONCombinedORM/std": 0.2009611427783966, "step": 777, "train_speed(iter/s)": 0.026692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 283.875, "completions/min_length": 229.0, "epoch": 0.6435070306038048, "grad_norm": 0.1914573758840561, "kl": 0.042724609375, "learning_rate": 3.3973605078275955e-06, "loss": 0.0004273653030395508, "memory(GiB)": 39.09, "reward": 0.46174582839012146, "reward_std": 0.1919097602367401, "rewards/VisualizationJSONCombinedORM/mean": 0.46174582839012146, "rewards/VisualizationJSONCombinedORM/std": 0.2463812530040741, "step": 778, "train_speed(iter/s)": 0.026703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 282.1875, "completions/min_length": 228.0, "epoch": 0.6443341604631927, "grad_norm": 0.15894676744937897, "kl": 0.02294921875, "learning_rate": 3.3836914757501023e-06, "loss": 0.00022954493761062622, "memory(GiB)": 39.09, "reward": 0.7582126259803772, "reward_std": 0.07734626531600952, "rewards/VisualizationJSONCombinedORM/mean": 0.7582126259803772, "rewards/VisualizationJSONCombinedORM/std": 0.0789400041103363, "step": 779, "train_speed(iter/s)": 0.026715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 294.4375, "completions/min_length": 230.0, "epoch": 0.6451612903225806, "grad_norm": 0.2018371820449829, "kl": 0.06048583984375, "learning_rate": 3.3700359198250854e-06, "loss": 0.0006041601300239563, "memory(GiB)": 39.09, "reward": 0.6191701292991638, "reward_std": 0.11758744716644287, "rewards/VisualizationJSONCombinedORM/mean": 0.6191701292991638, "rewards/VisualizationJSONCombinedORM/std": 0.1318751871585846, "step": 780, "train_speed(iter/s)": 0.02673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 271.6875, "completions/min_length": 225.0, "epoch": 0.6459884201819686, "grad_norm": 0.15011301636695862, "kl": 0.03216552734375, "learning_rate": 3.356393953907271e-06, "loss": 0.00032201409339904785, "memory(GiB)": 39.09, "reward": 0.5762444138526917, "reward_std": 0.11626891791820526, "rewards/VisualizationJSONCombinedORM/mean": 0.5762444138526917, "rewards/VisualizationJSONCombinedORM/std": 0.11728287488222122, "step": 781, "train_speed(iter/s)": 0.026742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 267.5625, "completions/min_length": 237.0, "epoch": 0.6468155500413565, "grad_norm": 0.19064442813396454, "kl": 0.048828125, "learning_rate": 3.342765691738064e-06, "loss": 0.0004893802106380463, "memory(GiB)": 39.09, "reward": 0.4905385971069336, "reward_std": 0.1182744950056076, "rewards/VisualizationJSONCombinedORM/mean": 0.4905385971069336, "rewards/VisualizationJSONCombinedORM/std": 0.11459913104772568, "step": 782, "train_speed(iter/s)": 0.02675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 292.0625, "completions/min_length": 233.0, "epoch": 0.6476426799007444, "grad_norm": 0.16492865979671478, "kl": 0.05877685546875, "learning_rate": 3.3291512469446253e-06, "loss": 0.000587979331612587, "memory(GiB)": 39.09, "reward": 0.6886979341506958, "reward_std": 0.08477776497602463, "rewards/VisualizationJSONCombinedORM/mean": 0.6886979341506958, "rewards/VisualizationJSONCombinedORM/std": 0.14542649686336517, "step": 783, "train_speed(iter/s)": 0.026762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 295.0, "completions/min_length": 228.0, "epoch": 0.6484698097601324, "grad_norm": 0.23342563211917877, "kl": 0.051513671875, "learning_rate": 3.3155507330389004e-06, "loss": 0.000516306608915329, "memory(GiB)": 39.09, "reward": 0.41963642835617065, "reward_std": 0.09517356753349304, "rewards/VisualizationJSONCombinedORM/mean": 0.41963642835617065, "rewards/VisualizationJSONCombinedORM/std": 0.12914329767227173, "step": 784, "train_speed(iter/s)": 0.026766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 286.3125, "completions/min_length": 231.0, "epoch": 0.6492969396195203, "grad_norm": 0.15186381340026855, "kl": 0.027374267578125, "learning_rate": 3.301964263416693e-06, "loss": 0.0002742260694503784, "memory(GiB)": 39.09, "reward": 0.4966971278190613, "reward_std": 0.07528349757194519, "rewards/VisualizationJSONCombinedORM/mean": 0.4966971278190613, "rewards/VisualizationJSONCombinedORM/std": 0.19342873990535736, "step": 785, "train_speed(iter/s)": 0.026775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 276.25, "completions/min_length": 226.0, "epoch": 0.6501240694789082, "grad_norm": 0.1664363294839859, "kl": 0.03131103515625, "learning_rate": 3.2883919513567096e-06, "loss": 0.0003123655915260315, "memory(GiB)": 39.09, "reward": 0.5149900913238525, "reward_std": 0.06535505503416061, "rewards/VisualizationJSONCombinedORM/mean": 0.5149900913238525, "rewards/VisualizationJSONCombinedORM/std": 0.19730396568775177, "step": 786, "train_speed(iter/s)": 0.026791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 300.9375, "completions/min_length": 222.0, "epoch": 0.6509511993382961, "grad_norm": 0.18420541286468506, "kl": 0.06402587890625, "learning_rate": 3.2748339100196105e-06, "loss": 0.0006410703063011169, "memory(GiB)": 39.09, "reward": 0.5279021859169006, "reward_std": 0.11719633638858795, "rewards/VisualizationJSONCombinedORM/mean": 0.5279021859169006, "rewards/VisualizationJSONCombinedORM/std": 0.13432103395462036, "step": 787, "train_speed(iter/s)": 0.026803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 265.375, "completions/min_length": 214.0, "epoch": 0.6517783291976841, "grad_norm": 0.19627797603607178, "kl": 0.040069580078125, "learning_rate": 3.2612902524470803e-06, "loss": 0.0004018116742372513, "memory(GiB)": 39.09, "reward": 0.7113218903541565, "reward_std": 0.10910598933696747, "rewards/VisualizationJSONCombinedORM/mean": 0.7113218903541565, "rewards/VisualizationJSONCombinedORM/std": 0.10614072531461716, "step": 788, "train_speed(iter/s)": 0.026815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 319.5, "completions/min_length": 251.0, "epoch": 0.652605459057072, "grad_norm": 0.22105123102664948, "kl": 0.06121826171875, "learning_rate": 3.2477610915608705e-06, "loss": 0.0006122365593910217, "memory(GiB)": 39.09, "reward": 0.4962902069091797, "reward_std": 0.05287439376115799, "rewards/VisualizationJSONCombinedORM/mean": 0.4962902069091797, "rewards/VisualizationJSONCombinedORM/std": 0.1736578494310379, "step": 789, "train_speed(iter/s)": 0.026818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 288.375, "completions/min_length": 232.0, "epoch": 0.6534325889164598, "grad_norm": 0.1989174485206604, "kl": 0.0516357421875, "learning_rate": 3.2342465401618715e-06, "loss": 0.0005165114998817444, "memory(GiB)": 39.09, "reward": 0.5620536208152771, "reward_std": 0.0627017617225647, "rewards/VisualizationJSONCombinedORM/mean": 0.5620536208152771, "rewards/VisualizationJSONCombinedORM/std": 0.20713309943675995, "step": 790, "train_speed(iter/s)": 0.026825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 275.125, "completions/min_length": 234.0, "epoch": 0.6542597187758478, "grad_norm": 0.19599786400794983, "kl": 0.04193115234375, "learning_rate": 3.220746710929159e-06, "loss": 0.00041880644857883453, "memory(GiB)": 39.09, "reward": 0.49390077590942383, "reward_std": 0.10775954276323318, "rewards/VisualizationJSONCombinedORM/mean": 0.49390077590942383, "rewards/VisualizationJSONCombinedORM/std": 0.13983970880508423, "step": 791, "train_speed(iter/s)": 0.026837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 273.0625, "completions/min_length": 217.0, "epoch": 0.6550868486352357, "grad_norm": 0.18508976697921753, "kl": 0.03582763671875, "learning_rate": 3.207261716419067e-06, "loss": 0.0003572404384613037, "memory(GiB)": 39.09, "reward": 0.4430641829967499, "reward_std": 0.06158452481031418, "rewards/VisualizationJSONCombinedORM/mean": 0.4430641829967499, "rewards/VisualizationJSONCombinedORM/std": 0.07629889249801636, "step": 792, "train_speed(iter/s)": 0.026844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 275.8125, "completions/min_length": 233.0, "epoch": 0.6559139784946236, "grad_norm": 0.23488833010196686, "kl": 0.033935546875, "learning_rate": 3.1937916690642356e-06, "loss": 0.00033883750438690186, "memory(GiB)": 39.09, "reward": 0.7728770971298218, "reward_std": 0.0667916089296341, "rewards/VisualizationJSONCombinedORM/mean": 0.7728770971298218, "rewards/VisualizationJSONCombinedORM/std": 0.08101696521043777, "step": 793, "train_speed(iter/s)": 0.026857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 284.375, "completions/min_length": 209.0, "epoch": 0.6567411083540116, "grad_norm": 0.2100040316581726, "kl": 0.0601806640625, "learning_rate": 3.180336681172691e-06, "loss": 0.00060267373919487, "memory(GiB)": 39.09, "reward": 0.4309711456298828, "reward_std": 0.11038069427013397, "rewards/VisualizationJSONCombinedORM/mean": 0.4309711456298828, "rewards/VisualizationJSONCombinedORM/std": 0.11254175007343292, "step": 794, "train_speed(iter/s)": 0.026871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 300.5625, "completions/min_length": 239.0, "epoch": 0.6575682382133995, "grad_norm": 0.17543943226337433, "kl": 0.03094482421875, "learning_rate": 3.1668968649268905e-06, "loss": 0.0003084372729063034, "memory(GiB)": 39.09, "reward": 0.6995436549186707, "reward_std": 0.07224959135055542, "rewards/VisualizationJSONCombinedORM/mean": 0.6995436549186707, "rewards/VisualizationJSONCombinedORM/std": 0.13325294852256775, "step": 795, "train_speed(iter/s)": 0.026877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 263.375, "completions/min_length": 232.0, "epoch": 0.6583953680727874, "grad_norm": 0.18731875717639923, "kl": 0.12078857421875, "learning_rate": 3.153472332382803e-06, "loss": 0.0012092366814613342, "memory(GiB)": 39.09, "reward": 0.6436992883682251, "reward_std": 0.09139847755432129, "rewards/VisualizationJSONCombinedORM/mean": 0.6436992883682251, "rewards/VisualizationJSONCombinedORM/std": 0.14923912286758423, "step": 796, "train_speed(iter/s)": 0.026883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 315.5625, "completions/min_length": 242.0, "epoch": 0.6592224979321754, "grad_norm": 0.17318949103355408, "kl": 0.0595703125, "learning_rate": 3.1400631954689626e-06, "loss": 0.0005957633256912231, "memory(GiB)": 39.09, "reward": 0.5059018135070801, "reward_std": 0.06493178009986877, "rewards/VisualizationJSONCombinedORM/mean": 0.5059018135070801, "rewards/VisualizationJSONCombinedORM/std": 0.155953511595726, "step": 797, "train_speed(iter/s)": 0.026894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 313.75, "completions/min_length": 225.0, "epoch": 0.6600496277915633, "grad_norm": 0.1710783839225769, "kl": 0.02618408203125, "learning_rate": 3.1266695659855462e-06, "loss": 0.00026212818920612335, "memory(GiB)": 39.09, "reward": 0.5480714440345764, "reward_std": 0.06331533193588257, "rewards/VisualizationJSONCombinedORM/mean": 0.5480714440345764, "rewards/VisualizationJSONCombinedORM/std": 0.21610547602176666, "step": 798, "train_speed(iter/s)": 0.026902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 318.375, "completions/min_length": 248.0, "epoch": 0.6608767576509512, "grad_norm": 0.1660020500421524, "kl": 0.0662841796875, "learning_rate": 3.1132915556034283e-06, "loss": 0.0006631594151258469, "memory(GiB)": 39.09, "reward": 0.6736794710159302, "reward_std": 0.07513132691383362, "rewards/VisualizationJSONCombinedORM/mean": 0.6736794710159302, "rewards/VisualizationJSONCombinedORM/std": 0.07444041222333908, "step": 799, "train_speed(iter/s)": 0.026911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 282.625, "completions/min_length": 232.0, "epoch": 0.6617038875103392, "grad_norm": 0.18191008269786835, "kl": 0.04412841796875, "learning_rate": 3.099929275863266e-06, "loss": 0.00044182687997817993, "memory(GiB)": 39.09, "reward": 0.27479082345962524, "reward_std": 0.03464386239647865, "rewards/VisualizationJSONCombinedORM/mean": 0.27479082345962524, "rewards/VisualizationJSONCombinedORM/std": 0.05745643377304077, "step": 800, "train_speed(iter/s)": 0.026923 }, { "epoch": 0.6617038875103392, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 354.7916666666667, "eval_completions/mean_length": 295.6979166666667, "eval_completions/min_length": 249.29166666666666, "eval_kl": 0.046473185221354164, "eval_loss": 0.00046709179878234863, "eval_reward": 0.4426754868278901, "eval_reward_std": 0.07158579025417566, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4426754868278901, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07158579064222674, "eval_runtime": 305.4226, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 295.3125, "completions/min_length": 251.0, "epoch": 0.6625310173697271, "grad_norm": 0.2271556258201599, "kl": 0.043701171875, "learning_rate": 3.0865828381745515e-06, "loss": 0.00043667852878570557, "memory(GiB)": 39.09, "reward": 0.44713258743286133, "reward_std": 0.08156894892454147, "rewards/VisualizationJSONCombinedORM/mean": 0.44713258743286133, "rewards/VisualizationJSONCombinedORM/std": 0.08436431735754013, "step": 801, "train_speed(iter/s)": 0.026661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 295.625, "completions/min_length": 234.0, "epoch": 0.663358147229115, "grad_norm": 0.16824352741241455, "kl": 0.0462646484375, "learning_rate": 3.0732523538146997e-06, "loss": 0.0004631616175174713, "memory(GiB)": 39.09, "reward": 0.688427746295929, "reward_std": 0.07620745897293091, "rewards/VisualizationJSONCombinedORM/mean": 0.688427746295929, "rewards/VisualizationJSONCombinedORM/std": 0.11600523442029953, "step": 802, "train_speed(iter/s)": 0.026671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 298.25, "completions/min_length": 240.0, "epoch": 0.664185277088503, "grad_norm": 0.16207566857337952, "kl": 0.0439453125, "learning_rate": 3.05993793392811e-06, "loss": 0.0004390031099319458, "memory(GiB)": 39.09, "reward": 0.6117581129074097, "reward_std": 0.11586374044418335, "rewards/VisualizationJSONCombinedORM/mean": 0.6117581129074097, "rewards/VisualizationJSONCombinedORM/std": 0.11314307898283005, "step": 803, "train_speed(iter/s)": 0.026681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 301.6875, "completions/min_length": 262.0, "epoch": 0.6650124069478908, "grad_norm": 0.18129697442054749, "kl": 0.02435302734375, "learning_rate": 3.0466396895252405e-06, "loss": 0.00024404376745224, "memory(GiB)": 39.09, "reward": 0.5749490261077881, "reward_std": 0.05777958780527115, "rewards/VisualizationJSONCombinedORM/mean": 0.5749490261077881, "rewards/VisualizationJSONCombinedORM/std": 0.09923860430717468, "step": 804, "train_speed(iter/s)": 0.026693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 310.1875, "completions/min_length": 254.0, "epoch": 0.6658395368072787, "grad_norm": 0.16386543214321136, "kl": 0.04034423828125, "learning_rate": 3.0333577314816883e-06, "loss": 0.0004028528928756714, "memory(GiB)": 39.09, "reward": 0.3917567729949951, "reward_std": 0.050479911267757416, "rewards/VisualizationJSONCombinedORM/mean": 0.3917567729949951, "rewards/VisualizationJSONCombinedORM/std": 0.05439026653766632, "step": 805, "train_speed(iter/s)": 0.026705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 302.125, "completions/min_length": 249.0, "epoch": 0.6666666666666666, "grad_norm": 0.16691578924655914, "kl": 0.04058837890625, "learning_rate": 3.0200921705372555e-06, "loss": 0.00040534883737564087, "memory(GiB)": 39.09, "reward": 0.499957799911499, "reward_std": 0.06360076367855072, "rewards/VisualizationJSONCombinedORM/mean": 0.499957799911499, "rewards/VisualizationJSONCombinedORM/std": 0.07224386930465698, "step": 806, "train_speed(iter/s)": 0.026711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 286.25, "completions/min_length": 240.0, "epoch": 0.6674937965260546, "grad_norm": 0.18433877825737, "kl": 0.03289794921875, "learning_rate": 3.0068431172950387e-06, "loss": 0.000328749418258667, "memory(GiB)": 39.09, "reward": 0.4803966283798218, "reward_std": 0.04868503287434578, "rewards/VisualizationJSONCombinedORM/mean": 0.4803966283798218, "rewards/VisualizationJSONCombinedORM/std": 0.07380826026201248, "step": 807, "train_speed(iter/s)": 0.026722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 284.4375, "completions/min_length": 231.0, "epoch": 0.6683209263854425, "grad_norm": 0.17844682931900024, "kl": 0.0379638671875, "learning_rate": 2.9936106822204937e-06, "loss": 0.0003801807761192322, "memory(GiB)": 39.09, "reward": 0.47502875328063965, "reward_std": 0.07380212098360062, "rewards/VisualizationJSONCombinedORM/mean": 0.47502875328063965, "rewards/VisualizationJSONCombinedORM/std": 0.29450711607933044, "step": 808, "train_speed(iter/s)": 0.026728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 324.9375, "completions/min_length": 225.0, "epoch": 0.6691480562448304, "grad_norm": 0.1471523940563202, "kl": 0.08404541015625, "learning_rate": 2.980394975640526e-06, "loss": 0.0008396622724831104, "memory(GiB)": 39.09, "reward": 0.36481180787086487, "reward_std": 0.09252294152975082, "rewards/VisualizationJSONCombinedORM/mean": 0.36481180787086487, "rewards/VisualizationJSONCombinedORM/std": 0.18508316576480865, "step": 809, "train_speed(iter/s)": 0.026731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 304.6875, "completions/min_length": 249.0, "epoch": 0.6699751861042184, "grad_norm": 0.1872325986623764, "kl": 0.0357666015625, "learning_rate": 2.9671961077425583e-06, "loss": 0.00035790540277957916, "memory(GiB)": 39.09, "reward": 0.43615055084228516, "reward_std": 0.045410312712192535, "rewards/VisualizationJSONCombinedORM/mean": 0.43615055084228516, "rewards/VisualizationJSONCombinedORM/std": 0.04524223878979683, "step": 810, "train_speed(iter/s)": 0.02674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 271.0, "completions/min_length": 208.0, "epoch": 0.6708023159636063, "grad_norm": 0.14213094115257263, "kl": 0.024688720703125, "learning_rate": 2.954014188573626e-06, "loss": 0.0002465546131134033, "memory(GiB)": 39.09, "reward": 0.37612637877464294, "reward_std": 0.04074844717979431, "rewards/VisualizationJSONCombinedORM/mean": 0.37612637877464294, "rewards/VisualizationJSONCombinedORM/std": 0.04099246859550476, "step": 811, "train_speed(iter/s)": 0.026748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 285.8125, "completions/min_length": 231.0, "epoch": 0.6716294458229942, "grad_norm": 0.1844828724861145, "kl": 0.047607421875, "learning_rate": 2.940849328039447e-06, "loss": 0.0004763007164001465, "memory(GiB)": 39.09, "reward": 0.5773547887802124, "reward_std": 0.07738108932971954, "rewards/VisualizationJSONCombinedORM/mean": 0.5773547887802124, "rewards/VisualizationJSONCombinedORM/std": 0.11239714920520782, "step": 812, "train_speed(iter/s)": 0.026758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 271.9375, "completions/min_length": 209.0, "epoch": 0.6724565756823822, "grad_norm": 0.16169774532318115, "kl": 0.04071044921875, "learning_rate": 2.9277016359035165e-06, "loss": 0.0004063323140144348, "memory(GiB)": 39.09, "reward": 0.6908046007156372, "reward_std": 0.07748699188232422, "rewards/VisualizationJSONCombinedORM/mean": 0.6908046007156372, "rewards/VisualizationJSONCombinedORM/std": 0.07886719703674316, "step": 813, "train_speed(iter/s)": 0.026769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 309.375, "completions/min_length": 244.0, "epoch": 0.6732837055417701, "grad_norm": 0.15989595651626587, "kl": 0.0416259765625, "learning_rate": 2.914571221786179e-06, "loss": 0.00041713565587997437, "memory(GiB)": 39.09, "reward": 0.5017245411872864, "reward_std": 0.04165511950850487, "rewards/VisualizationJSONCombinedORM/mean": 0.5017245411872864, "rewards/VisualizationJSONCombinedORM/std": 0.244963601231575, "step": 814, "train_speed(iter/s)": 0.02678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 288.375, "completions/min_length": 225.0, "epoch": 0.674110835401158, "grad_norm": 0.17108099162578583, "kl": 0.034423828125, "learning_rate": 2.9014581951637295e-06, "loss": 0.0003441125154495239, "memory(GiB)": 39.09, "reward": 0.48850584030151367, "reward_std": 0.05952262878417969, "rewards/VisualizationJSONCombinedORM/mean": 0.48850584030151367, "rewards/VisualizationJSONCombinedORM/std": 0.19772574305534363, "step": 815, "train_speed(iter/s)": 0.026789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 285.875, "completions/min_length": 245.0, "epoch": 0.674937965260546, "grad_norm": 0.19379502534866333, "kl": 0.06671142578125, "learning_rate": 2.8883626653674867e-06, "loss": 0.000666491687297821, "memory(GiB)": 39.09, "reward": 0.47602352499961853, "reward_std": 0.08643584698438644, "rewards/VisualizationJSONCombinedORM/mean": 0.47602352499961853, "rewards/VisualizationJSONCombinedORM/std": 0.14042338728904724, "step": 816, "train_speed(iter/s)": 0.026795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 334.125, "completions/min_length": 272.0, "epoch": 0.6757650951199339, "grad_norm": 0.1595628708600998, "kl": 0.04254150390625, "learning_rate": 2.8752847415828923e-06, "loss": 0.0004253387451171875, "memory(GiB)": 39.09, "reward": 0.5567511320114136, "reward_std": 0.09727313369512558, "rewards/VisualizationJSONCombinedORM/mean": 0.5567511320114136, "rewards/VisualizationJSONCombinedORM/std": 0.20487672090530396, "step": 817, "train_speed(iter/s)": 0.026798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 275.875, "completions/min_length": 225.0, "epoch": 0.6765922249793217, "grad_norm": 0.17702537775039673, "kl": 0.0333251953125, "learning_rate": 2.862224532848591e-06, "loss": 0.00033336877822875977, "memory(GiB)": 39.09, "reward": 0.4096966087818146, "reward_std": 0.09504484385251999, "rewards/VisualizationJSONCombinedORM/mean": 0.4096966087818146, "rewards/VisualizationJSONCombinedORM/std": 0.11035774648189545, "step": 818, "train_speed(iter/s)": 0.026806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 305.375, "completions/min_length": 238.0, "epoch": 0.6774193548387096, "grad_norm": 0.1854686439037323, "kl": 0.06134033203125, "learning_rate": 2.8491821480555283e-06, "loss": 0.000613706186413765, "memory(GiB)": 39.09, "reward": 0.589709997177124, "reward_std": 0.12446898221969604, "rewards/VisualizationJSONCombinedORM/mean": 0.589709997177124, "rewards/VisualizationJSONCombinedORM/std": 0.14322373270988464, "step": 819, "train_speed(iter/s)": 0.026817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 351.75, "completions/min_length": 285.0, "epoch": 0.6782464846980976, "grad_norm": 0.1526159942150116, "kl": 0.0528564453125, "learning_rate": 2.836157695946047e-06, "loss": 0.0005293404683470726, "memory(GiB)": 39.09, "reward": 0.6110031008720398, "reward_std": 0.08261202275753021, "rewards/VisualizationJSONCombinedORM/mean": 0.6110031008720398, "rewards/VisualizationJSONCombinedORM/std": 0.10165252536535263, "step": 820, "train_speed(iter/s)": 0.026823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 298.9375, "completions/min_length": 248.0, "epoch": 0.6790736145574855, "grad_norm": 0.171527698636055, "kl": 0.032745361328125, "learning_rate": 2.8231512851129596e-06, "loss": 0.0003265365958213806, "memory(GiB)": 39.09, "reward": 0.3912530541419983, "reward_std": 0.0768657997250557, "rewards/VisualizationJSONCombinedORM/mean": 0.3912530541419983, "rewards/VisualizationJSONCombinedORM/std": 0.08975975215435028, "step": 821, "train_speed(iter/s)": 0.026831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 306.8125, "completions/min_length": 254.0, "epoch": 0.6799007444168734, "grad_norm": 0.17248284816741943, "kl": 0.05059814453125, "learning_rate": 2.810163023998673e-06, "loss": 0.0005058310925960541, "memory(GiB)": 39.09, "reward": 0.5239322781562805, "reward_std": 0.06761109083890915, "rewards/VisualizationJSONCombinedORM/mean": 0.5239322781562805, "rewards/VisualizationJSONCombinedORM/std": 0.19129543006420135, "step": 822, "train_speed(iter/s)": 0.02684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 302.375, "completions/min_length": 221.0, "epoch": 0.6807278742762614, "grad_norm": 0.1669289767742157, "kl": 0.110107421875, "learning_rate": 2.79719302089426e-06, "loss": 0.0011008558794856071, "memory(GiB)": 39.09, "reward": 0.5297099351882935, "reward_std": 0.08756959438323975, "rewards/VisualizationJSONCombinedORM/mean": 0.5297099351882935, "rewards/VisualizationJSONCombinedORM/std": 0.09956007450819016, "step": 823, "train_speed(iter/s)": 0.026848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 286.0, "completions/min_length": 243.0, "epoch": 0.6815550041356493, "grad_norm": 0.18302680552005768, "kl": 0.0465087890625, "learning_rate": 2.784241383938566e-06, "loss": 0.0004649534821510315, "memory(GiB)": 39.09, "reward": 0.6607856750488281, "reward_std": 0.14550486207008362, "rewards/VisualizationJSONCombinedORM/mean": 0.6607856750488281, "rewards/VisualizationJSONCombinedORM/std": 0.1413624882698059, "step": 824, "train_speed(iter/s)": 0.02686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 308.9375, "completions/min_length": 252.0, "epoch": 0.6823821339950372, "grad_norm": 0.16479817032814026, "kl": 0.07080078125, "learning_rate": 2.771308221117309e-06, "loss": 0.0007062200456857681, "memory(GiB)": 39.09, "reward": 0.5190185308456421, "reward_std": 0.1225801333785057, "rewards/VisualizationJSONCombinedORM/mean": 0.5190185308456421, "rewards/VisualizationJSONCombinedORM/std": 0.16549985110759735, "step": 825, "train_speed(iter/s)": 0.026869 }, { "epoch": 0.6823821339950372, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 362.25, "eval_completions/mean_length": 300.1302083333333, "eval_completions/min_length": 249.70833333333334, "eval_kl": 0.040685017903645836, "eval_loss": 0.0004086792469024658, "eval_reward": 0.4647994463642438, "eval_reward_std": 0.060097979148849845, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4647994463642438, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06009798147715628, "eval_runtime": 309.777, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 301.125, "completions/min_length": 234.0, "epoch": 0.6832092638544252, "grad_norm": 0.17006486654281616, "kl": 0.041748046875, "learning_rate": 2.7583936402621753e-06, "loss": 0.0004169270396232605, "memory(GiB)": 39.09, "reward": 0.34307458996772766, "reward_std": 0.0611780509352684, "rewards/VisualizationJSONCombinedORM/mean": 0.34307458996772766, "rewards/VisualizationJSONCombinedORM/std": 0.061465997248888016, "step": 826, "train_speed(iter/s)": 0.02661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 323.125, "completions/min_length": 238.0, "epoch": 0.6840363937138131, "grad_norm": 0.17688967287540436, "kl": 0.04034423828125, "learning_rate": 2.745497749049922e-06, "loss": 0.000403575599193573, "memory(GiB)": 39.09, "reward": 0.5353891849517822, "reward_std": 0.09337752312421799, "rewards/VisualizationJSONCombinedORM/mean": 0.5353891849517822, "rewards/VisualizationJSONCombinedORM/std": 0.16346290707588196, "step": 827, "train_speed(iter/s)": 0.026618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 313.0, "completions/min_length": 256.0, "epoch": 0.684863523573201, "grad_norm": 0.19562838971614838, "kl": 0.04925537109375, "learning_rate": 2.7326206550014793e-06, "loss": 0.0004926882684230804, "memory(GiB)": 39.09, "reward": 0.39071333408355713, "reward_std": 0.055730484426021576, "rewards/VisualizationJSONCombinedORM/mean": 0.39071333408355713, "rewards/VisualizationJSONCombinedORM/std": 0.15831968188285828, "step": 828, "train_speed(iter/s)": 0.026624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 319.3125, "completions/min_length": 262.0, "epoch": 0.685690653432589, "grad_norm": 0.17352846264839172, "kl": 0.068359375, "learning_rate": 2.719762465481055e-06, "loss": 0.0006853044033050537, "memory(GiB)": 39.09, "reward": 0.6655445694923401, "reward_std": 0.10737547278404236, "rewards/VisualizationJSONCombinedORM/mean": 0.6655445694923401, "rewards/VisualizationJSONCombinedORM/std": 0.11917655169963837, "step": 829, "train_speed(iter/s)": 0.026635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 309.625, "completions/min_length": 261.0, "epoch": 0.6865177832919769, "grad_norm": 0.16528306901454926, "kl": 0.031768798828125, "learning_rate": 2.7069232876952368e-06, "loss": 0.0003171749413013458, "memory(GiB)": 39.09, "reward": 0.7392429113388062, "reward_std": 0.13282619416713715, "rewards/VisualizationJSONCombinedORM/mean": 0.7392429113388062, "rewards/VisualizationJSONCombinedORM/std": 0.13010627031326294, "step": 830, "train_speed(iter/s)": 0.026641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 300.5625, "completions/min_length": 231.0, "epoch": 0.6873449131513648, "grad_norm": 0.1724211722612381, "kl": 0.0501708984375, "learning_rate": 2.694103228692099e-06, "loss": 0.0005017444491386414, "memory(GiB)": 39.09, "reward": 0.47648701071739197, "reward_std": 0.07169988751411438, "rewards/VisualizationJSONCombinedORM/mean": 0.47648701071739197, "rewards/VisualizationJSONCombinedORM/std": 0.2576058506965637, "step": 831, "train_speed(iter/s)": 0.026651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 325.25, "completions/min_length": 235.0, "epoch": 0.6881720430107527, "grad_norm": 0.16479314863681793, "kl": 0.0545654296875, "learning_rate": 2.6813023953603168e-06, "loss": 0.0005456879734992981, "memory(GiB)": 39.09, "reward": 0.5960202217102051, "reward_std": 0.06571266055107117, "rewards/VisualizationJSONCombinedORM/mean": 0.5960202217102051, "rewards/VisualizationJSONCombinedORM/std": 0.1911078691482544, "step": 832, "train_speed(iter/s)": 0.026663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 323.25, "completions/min_length": 265.0, "epoch": 0.6889991728701406, "grad_norm": 0.16008564829826355, "kl": 0.03106689453125, "learning_rate": 2.668520894428259e-06, "loss": 0.0003103688359260559, "memory(GiB)": 39.09, "reward": 0.5824466347694397, "reward_std": 0.05330132693052292, "rewards/VisualizationJSONCombinedORM/mean": 0.5824466347694397, "rewards/VisualizationJSONCombinedORM/std": 0.1433940976858139, "step": 833, "train_speed(iter/s)": 0.026673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 276.25, "completions/min_length": 210.0, "epoch": 0.6898263027295285, "grad_norm": 0.18358354270458221, "kl": 0.03192138671875, "learning_rate": 2.6557588324631223e-06, "loss": 0.00031977519392967224, "memory(GiB)": 39.09, "reward": 0.607029914855957, "reward_std": 0.10721920430660248, "rewards/VisualizationJSONCombinedORM/mean": 0.607029914855957, "rewards/VisualizationJSONCombinedORM/std": 0.10780501365661621, "step": 834, "train_speed(iter/s)": 0.026679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 318.6875, "completions/min_length": 267.0, "epoch": 0.6906534325889164, "grad_norm": 0.15617841482162476, "kl": 0.036163330078125, "learning_rate": 2.6430163158700116e-06, "loss": 0.0003618057817220688, "memory(GiB)": 39.09, "reward": 0.7394974231719971, "reward_std": 0.10070683062076569, "rewards/VisualizationJSONCombinedORM/mean": 0.7394974231719971, "rewards/VisualizationJSONCombinedORM/std": 0.11375640332698822, "step": 835, "train_speed(iter/s)": 0.026694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 333.1875, "completions/min_length": 251.0, "epoch": 0.6914805624483044, "grad_norm": 0.18450671434402466, "kl": 0.05792236328125, "learning_rate": 2.630293450891086e-06, "loss": 0.0005799904465675354, "memory(GiB)": 39.09, "reward": 0.6630468368530273, "reward_std": 0.10465850681066513, "rewards/VisualizationJSONCombinedORM/mean": 0.6630468368530273, "rewards/VisualizationJSONCombinedORM/std": 0.13847710192203522, "step": 836, "train_speed(iter/s)": 0.0267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 308.9375, "completions/min_length": 224.0, "epoch": 0.6923076923076923, "grad_norm": 0.18816308677196503, "kl": 0.04364013671875, "learning_rate": 2.617590343604648e-06, "loss": 0.00043676048517227173, "memory(GiB)": 39.09, "reward": 0.5560399293899536, "reward_std": 0.12571953237056732, "rewards/VisualizationJSONCombinedORM/mean": 0.5560399293899536, "rewards/VisualizationJSONCombinedORM/std": 0.13849708437919617, "step": 837, "train_speed(iter/s)": 0.026711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 330.5625, "completions/min_length": 240.0, "epoch": 0.6931348221670802, "grad_norm": 0.15149514377117157, "kl": 0.03216552734375, "learning_rate": 2.6049070999242708e-06, "loss": 0.0003210529685020447, "memory(GiB)": 39.09, "reward": 0.3549908995628357, "reward_std": 0.03418278694152832, "rewards/VisualizationJSONCombinedORM/mean": 0.3549908995628357, "rewards/VisualizationJSONCombinedORM/std": 0.15368278324604034, "step": 838, "train_speed(iter/s)": 0.026719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 293.0625, "completions/min_length": 244.0, "epoch": 0.6939619520264682, "grad_norm": 0.16727566719055176, "kl": 0.03802490234375, "learning_rate": 2.5922438255979125e-06, "loss": 0.0003805011510848999, "memory(GiB)": 39.09, "reward": 0.398110568523407, "reward_std": 0.056663013994693756, "rewards/VisualizationJSONCombinedORM/mean": 0.398110568523407, "rewards/VisualizationJSONCombinedORM/std": 0.13393987715244293, "step": 839, "train_speed(iter/s)": 0.026728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 314.0, "completions/min_length": 236.0, "epoch": 0.6947890818858561, "grad_norm": 0.2185550183057785, "kl": 0.029998779296875, "learning_rate": 2.5796006262070337e-06, "loss": 0.00029994547367095947, "memory(GiB)": 39.09, "reward": 0.6741945743560791, "reward_std": 0.11511798948049545, "rewards/VisualizationJSONCombinedORM/mean": 0.6741945743560791, "rewards/VisualizationJSONCombinedORM/std": 0.14638450741767883, "step": 840, "train_speed(iter/s)": 0.026735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 330.0625, "completions/min_length": 248.0, "epoch": 0.695616211745244, "grad_norm": 0.23018909990787506, "kl": 0.04449462890625, "learning_rate": 2.5669776071657194e-06, "loss": 0.0004439614713191986, "memory(GiB)": 39.09, "reward": 0.42546045780181885, "reward_std": 0.09212158620357513, "rewards/VisualizationJSONCombinedORM/mean": 0.42546045780181885, "rewards/VisualizationJSONCombinedORM/std": 0.1575392484664917, "step": 841, "train_speed(iter/s)": 0.026745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 295.3125, "completions/min_length": 232.0, "epoch": 0.696443341604632, "grad_norm": 0.16566407680511475, "kl": 0.0299072265625, "learning_rate": 2.5543748737197953e-06, "loss": 0.0002994164824485779, "memory(GiB)": 39.09, "reward": 0.5299397706985474, "reward_std": 0.07255853712558746, "rewards/VisualizationJSONCombinedORM/mean": 0.5299397706985474, "rewards/VisualizationJSONCombinedORM/std": 0.15181417763233185, "step": 842, "train_speed(iter/s)": 0.026753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 279.375, "completions/min_length": 252.0, "epoch": 0.6972704714640199, "grad_norm": 0.1494678258895874, "kl": 0.039215087890625, "learning_rate": 2.5417925309459623e-06, "loss": 0.00039183348417282104, "memory(GiB)": 39.09, "reward": 0.5012941956520081, "reward_std": 0.05660352110862732, "rewards/VisualizationJSONCombinedORM/mean": 0.5012941956520081, "rewards/VisualizationJSONCombinedORM/std": 0.27526968717575073, "step": 843, "train_speed(iter/s)": 0.026765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 286.375, "completions/min_length": 250.0, "epoch": 0.6980976013234078, "grad_norm": 0.1557992547750473, "kl": 0.020782470703125, "learning_rate": 2.529230683750897e-06, "loss": 0.00020741671323776245, "memory(GiB)": 39.09, "reward": 0.6914926767349243, "reward_std": 0.06595221161842346, "rewards/VisualizationJSONCombinedORM/mean": 0.6914926767349243, "rewards/VisualizationJSONCombinedORM/std": 0.07264997065067291, "step": 844, "train_speed(iter/s)": 0.026774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 331.0625, "completions/min_length": 262.0, "epoch": 0.6989247311827957, "grad_norm": 0.18702486157417297, "kl": 0.03424072265625, "learning_rate": 2.51668943687041e-06, "loss": 0.0003430880606174469, "memory(GiB)": 39.09, "reward": 0.6590533256530762, "reward_std": 0.10104073584079742, "rewards/VisualizationJSONCombinedORM/mean": 0.6590533256530762, "rewards/VisualizationJSONCombinedORM/std": 0.1225583404302597, "step": 845, "train_speed(iter/s)": 0.02678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 320.0, "completions/min_length": 262.0, "epoch": 0.6997518610421837, "grad_norm": 0.1980973780155182, "kl": 0.034423828125, "learning_rate": 2.5041688948685367e-06, "loss": 0.0003439486026763916, "memory(GiB)": 39.09, "reward": 0.5147122740745544, "reward_std": 0.09433142840862274, "rewards/VisualizationJSONCombinedORM/mean": 0.5147122740745544, "rewards/VisualizationJSONCombinedORM/std": 0.19476918876171112, "step": 846, "train_speed(iter/s)": 0.026789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 289.375, "completions/min_length": 227.0, "epoch": 0.7005789909015715, "grad_norm": 0.18243776261806488, "kl": 0.04119873046875, "learning_rate": 2.4916691621366984e-06, "loss": 0.00041216611862182617, "memory(GiB)": 39.09, "reward": 0.6388009786605835, "reward_std": 0.09442837536334991, "rewards/VisualizationJSONCombinedORM/mean": 0.6388009786605835, "rewards/VisualizationJSONCombinedORM/std": 0.14556416869163513, "step": 847, "train_speed(iter/s)": 0.026798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 300.75, "completions/min_length": 247.0, "epoch": 0.7014061207609594, "grad_norm": 0.14210918545722961, "kl": 0.06268310546875, "learning_rate": 2.479190342892804e-06, "loss": 0.0006289295852184296, "memory(GiB)": 39.09, "reward": 0.5817718505859375, "reward_std": 0.08520098030567169, "rewards/VisualizationJSONCombinedORM/mean": 0.5817718505859375, "rewards/VisualizationJSONCombinedORM/std": 0.2120257169008255, "step": 848, "train_speed(iter/s)": 0.026809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 296.8125, "completions/min_length": 241.0, "epoch": 0.7022332506203474, "grad_norm": 0.18909335136413574, "kl": 0.02789306640625, "learning_rate": 2.466732541180404e-06, "loss": 0.00027990899980068207, "memory(GiB)": 39.09, "reward": 0.6034812927246094, "reward_std": 0.09723765403032303, "rewards/VisualizationJSONCombinedORM/mean": 0.6034812927246094, "rewards/VisualizationJSONCombinedORM/std": 0.15896476805210114, "step": 849, "train_speed(iter/s)": 0.026819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 291.5625, "completions/min_length": 236.0, "epoch": 0.7030603804797353, "grad_norm": 0.18712058663368225, "kl": 0.027618408203125, "learning_rate": 2.4542958608678075e-06, "loss": 0.0002759695053100586, "memory(GiB)": 39.09, "reward": 0.6063892841339111, "reward_std": 0.11121848970651627, "rewards/VisualizationJSONCombinedORM/mean": 0.6063892841339111, "rewards/VisualizationJSONCombinedORM/std": 0.12149753421545029, "step": 850, "train_speed(iter/s)": 0.026829 }, { "epoch": 0.7030603804797353, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 355.875, "eval_completions/mean_length": 302.78125, "eval_completions/min_length": 259.5, "eval_kl": 0.0421905517578125, "eval_loss": 0.00042456015944480896, "eval_reward": 0.44676116667687893, "eval_reward_std": 0.0701043939916417, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44676116667687893, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07010439422447234, "eval_runtime": 306.2144, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 310.3125, "completions/min_length": 239.0, "epoch": 0.7038875103391232, "grad_norm": 0.19036836922168732, "kl": 0.05084228515625, "learning_rate": 2.4418804056472228e-06, "loss": 0.0005090944468975067, "memory(GiB)": 39.09, "reward": 0.4472106695175171, "reward_std": 0.12966546416282654, "rewards/VisualizationJSONCombinedORM/mean": 0.4472106695175171, "rewards/VisualizationJSONCombinedORM/std": 0.13651403784751892, "step": 851, "train_speed(iter/s)": 0.026577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 295.0, "completions/min_length": 234.0, "epoch": 0.7047146401985112, "grad_norm": 0.2232498675584793, "kl": 0.0435791015625, "learning_rate": 2.429486279033892e-06, "loss": 0.00043671950697898865, "memory(GiB)": 39.09, "reward": 0.6767058372497559, "reward_std": 0.07444989681243896, "rewards/VisualizationJSONCombinedORM/mean": 0.6767058372497559, "rewards/VisualizationJSONCombinedORM/std": 0.07897759228944778, "step": 852, "train_speed(iter/s)": 0.02659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 279.75, "completions/min_length": 241.0, "epoch": 0.7055417700578991, "grad_norm": 0.16466739773750305, "kl": 0.03875732421875, "learning_rate": 2.4171135843652256e-06, "loss": 0.0003878287971019745, "memory(GiB)": 39.09, "reward": 0.525387704372406, "reward_std": 0.07618735730648041, "rewards/VisualizationJSONCombinedORM/mean": 0.525387704372406, "rewards/VisualizationJSONCombinedORM/std": 0.16558225452899933, "step": 853, "train_speed(iter/s)": 0.026602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 295.4375, "completions/min_length": 255.0, "epoch": 0.706368899917287, "grad_norm": 0.17945770919322968, "kl": 0.0423583984375, "learning_rate": 2.4047624247999484e-06, "loss": 0.00042374804615974426, "memory(GiB)": 39.09, "reward": 0.602842390537262, "reward_std": 0.07726205885410309, "rewards/VisualizationJSONCombinedORM/mean": 0.602842390537262, "rewards/VisualizationJSONCombinedORM/std": 0.13670478761196136, "step": 854, "train_speed(iter/s)": 0.026614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 289.6875, "completions/min_length": 223.0, "epoch": 0.707196029776675, "grad_norm": 0.19323720037937164, "kl": 0.0377197265625, "learning_rate": 2.3924329033172246e-06, "loss": 0.00037680938839912415, "memory(GiB)": 39.09, "reward": 0.7181365489959717, "reward_std": 0.07132399082183838, "rewards/VisualizationJSONCombinedORM/mean": 0.7181365489959717, "rewards/VisualizationJSONCombinedORM/std": 0.0695037841796875, "step": 855, "train_speed(iter/s)": 0.026622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 343.125, "completions/min_length": 283.0, "epoch": 0.7080231596360629, "grad_norm": 0.18664905428886414, "kl": 0.0418701171875, "learning_rate": 2.38012512271582e-06, "loss": 0.0004184618592262268, "memory(GiB)": 39.09, "reward": 0.4237529933452606, "reward_std": 0.05546919256448746, "rewards/VisualizationJSONCombinedORM/mean": 0.4237529933452606, "rewards/VisualizationJSONCombinedORM/std": 0.17676395177841187, "step": 856, "train_speed(iter/s)": 0.026627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 328.0, "completions/min_length": 262.0, "epoch": 0.7088502894954508, "grad_norm": 0.5629217624664307, "kl": 0.03790283203125, "learning_rate": 2.3678391856132203e-06, "loss": 0.0003792792558670044, "memory(GiB)": 39.09, "reward": 0.22597193717956543, "reward_std": 0.03193248063325882, "rewards/VisualizationJSONCombinedORM/mean": 0.22597193717956543, "rewards/VisualizationJSONCombinedORM/std": 0.05617675557732582, "step": 857, "train_speed(iter/s)": 0.026636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 344.5, "completions/min_length": 265.0, "epoch": 0.7096774193548387, "grad_norm": 0.21213839948177338, "kl": 0.037841796875, "learning_rate": 2.3555751944448036e-06, "loss": 0.00037778913974761963, "memory(GiB)": 39.09, "reward": 0.5390397310256958, "reward_std": 0.20845679938793182, "rewards/VisualizationJSONCombinedORM/mean": 0.5390397310256958, "rewards/VisualizationJSONCombinedORM/std": 0.20771212875843048, "step": 858, "train_speed(iter/s)": 0.026642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 299.9375, "completions/min_length": 250.0, "epoch": 0.7105045492142267, "grad_norm": 0.16783872246742249, "kl": 0.03753662109375, "learning_rate": 2.343333251462954e-06, "loss": 0.0003757178783416748, "memory(GiB)": 39.09, "reward": 0.6492241621017456, "reward_std": 0.09315364807844162, "rewards/VisualizationJSONCombinedORM/mean": 0.6492241621017456, "rewards/VisualizationJSONCombinedORM/std": 0.14778436720371246, "step": 859, "train_speed(iter/s)": 0.026656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 295.875, "completions/min_length": 227.0, "epoch": 0.7113316790736146, "grad_norm": 0.17109517753124237, "kl": 0.06243896484375, "learning_rate": 2.3311134587362426e-06, "loss": 0.0006223060190677643, "memory(GiB)": 39.09, "reward": 0.6405960321426392, "reward_std": 0.1199478879570961, "rewards/VisualizationJSONCombinedORM/mean": 0.6405960321426392, "rewards/VisualizationJSONCombinedORM/std": 0.13815397024154663, "step": 860, "train_speed(iter/s)": 0.02666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 275.5625, "completions/min_length": 219.0, "epoch": 0.7121588089330024, "grad_norm": 0.178823322057724, "kl": 0.047088623046875, "learning_rate": 2.3189159181485517e-06, "loss": 0.00047078728675842285, "memory(GiB)": 39.09, "reward": 0.5571128129959106, "reward_std": 0.13774819672107697, "rewards/VisualizationJSONCombinedORM/mean": 0.5571128129959106, "rewards/VisualizationJSONCombinedORM/std": 0.14218446612358093, "step": 861, "train_speed(iter/s)": 0.026673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 296.4375, "completions/min_length": 231.0, "epoch": 0.7129859387923904, "grad_norm": 0.2018757313489914, "kl": 0.0994873046875, "learning_rate": 2.306740731398234e-06, "loss": 0.000999394804239273, "memory(GiB)": 39.09, "reward": 0.4615720510482788, "reward_std": 0.06941930204629898, "rewards/VisualizationJSONCombinedORM/mean": 0.4615720510482788, "rewards/VisualizationJSONCombinedORM/std": 0.21937495470046997, "step": 862, "train_speed(iter/s)": 0.026682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 318.5, "completions/min_length": 254.0, "epoch": 0.7138130686517783, "grad_norm": 0.17775095999240875, "kl": 0.0516357421875, "learning_rate": 2.2945879999972676e-06, "loss": 0.000515766441822052, "memory(GiB)": 39.09, "reward": 0.5450314879417419, "reward_std": 0.07546170800924301, "rewards/VisualizationJSONCombinedORM/mean": 0.5450314879417419, "rewards/VisualizationJSONCombinedORM/std": 0.1397303193807602, "step": 863, "train_speed(iter/s)": 0.026692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 325.75, "completions/min_length": 262.0, "epoch": 0.7146401985111662, "grad_norm": 0.15436792373657227, "kl": 0.028961181640625, "learning_rate": 2.2824578252704042e-06, "loss": 0.00028949975967407227, "memory(GiB)": 39.09, "reward": 0.5887259840965271, "reward_std": 0.07255285233259201, "rewards/VisualizationJSONCombinedORM/mean": 0.5887259840965271, "rewards/VisualizationJSONCombinedORM/std": 0.1492253839969635, "step": 864, "train_speed(iter/s)": 0.0267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 335.625, "completions/min_length": 219.0, "epoch": 0.7154673283705542, "grad_norm": 0.1722412109375, "kl": 0.0322265625, "learning_rate": 2.2703503083543288e-06, "loss": 0.0003234855830669403, "memory(GiB)": 39.09, "reward": 0.33899006247520447, "reward_std": 0.03097270429134369, "rewards/VisualizationJSONCombinedORM/mean": 0.33899006247520447, "rewards/VisualizationJSONCombinedORM/std": 0.17548027634620667, "step": 865, "train_speed(iter/s)": 0.026708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 321.0625, "completions/min_length": 285.0, "epoch": 0.7162944582299421, "grad_norm": 0.18238800764083862, "kl": 0.05279541015625, "learning_rate": 2.258265550196812e-06, "loss": 0.0005271779373288155, "memory(GiB)": 39.09, "reward": 0.33765652775764465, "reward_std": 0.06276120990514755, "rewards/VisualizationJSONCombinedORM/mean": 0.33765652775764465, "rewards/VisualizationJSONCombinedORM/std": 0.12059395760297775, "step": 866, "train_speed(iter/s)": 0.026716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 317.9375, "completions/min_length": 242.0, "epoch": 0.71712158808933, "grad_norm": 0.18569128215312958, "kl": 0.0517578125, "learning_rate": 2.2462036515558726e-06, "loss": 0.0005176365375518799, "memory(GiB)": 39.09, "reward": 0.5501012802124023, "reward_std": 0.061469245702028275, "rewards/VisualizationJSONCombinedORM/mean": 0.5501012802124023, "rewards/VisualizationJSONCombinedORM/std": 0.2526741325855255, "step": 867, "train_speed(iter/s)": 0.026727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 308.0, "completions/min_length": 231.0, "epoch": 0.717948717948718, "grad_norm": 0.17073491215705872, "kl": 0.03662109375, "learning_rate": 2.234164712998935e-06, "loss": 0.0003653690218925476, "memory(GiB)": 39.09, "reward": 0.6162635087966919, "reward_std": 0.12018326669931412, "rewards/VisualizationJSONCombinedORM/mean": 0.6162635087966919, "rewards/VisualizationJSONCombinedORM/std": 0.12842229008674622, "step": 868, "train_speed(iter/s)": 0.026736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 298.3125, "completions/min_length": 242.0, "epoch": 0.7187758478081059, "grad_norm": 0.18350577354431152, "kl": 0.046600341796875, "learning_rate": 2.2221488349019903e-06, "loss": 0.0004649311304092407, "memory(GiB)": 39.09, "reward": 0.5166282057762146, "reward_std": 0.10026572644710541, "rewards/VisualizationJSONCombinedORM/mean": 0.5166282057762146, "rewards/VisualizationJSONCombinedORM/std": 0.10286014527082443, "step": 869, "train_speed(iter/s)": 0.026746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 313.0625, "completions/min_length": 241.0, "epoch": 0.7196029776674938, "grad_norm": 0.19232197105884552, "kl": 0.0465087890625, "learning_rate": 2.2101561174487606e-06, "loss": 0.00046493858098983765, "memory(GiB)": 39.09, "reward": 0.3613745868206024, "reward_std": 0.08545207977294922, "rewards/VisualizationJSONCombinedORM/mean": 0.3613745868206024, "rewards/VisualizationJSONCombinedORM/std": 0.1205364242196083, "step": 870, "train_speed(iter/s)": 0.026757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 330.9375, "completions/min_length": 258.0, "epoch": 0.7204301075268817, "grad_norm": 0.1765686273574829, "kl": 0.07562255859375, "learning_rate": 2.1981866606298684e-06, "loss": 0.0007554385811090469, "memory(GiB)": 39.09, "reward": 0.509441614151001, "reward_std": 0.11187462508678436, "rewards/VisualizationJSONCombinedORM/mean": 0.509441614151001, "rewards/VisualizationJSONCombinedORM/std": 0.13623154163360596, "step": 871, "train_speed(iter/s)": 0.026769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 320.8125, "completions/min_length": 262.0, "epoch": 0.7212572373862697, "grad_norm": 0.17478719353675842, "kl": 0.0584716796875, "learning_rate": 2.186240564241992e-06, "loss": 0.0005859024822711945, "memory(GiB)": 39.09, "reward": 0.5552232265472412, "reward_std": 0.07416349649429321, "rewards/VisualizationJSONCombinedORM/mean": 0.5552232265472412, "rewards/VisualizationJSONCombinedORM/std": 0.15092816948890686, "step": 872, "train_speed(iter/s)": 0.026777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 282.8125, "completions/min_length": 231.0, "epoch": 0.7220843672456576, "grad_norm": 0.2130991667509079, "kl": 0.03741455078125, "learning_rate": 2.174317927887041e-06, "loss": 0.0003738477826118469, "memory(GiB)": 39.09, "reward": 0.6541577577590942, "reward_std": 0.11544067412614822, "rewards/VisualizationJSONCombinedORM/mean": 0.6541577577590942, "rewards/VisualizationJSONCombinedORM/std": 0.1391042321920395, "step": 873, "train_speed(iter/s)": 0.02679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 294.5625, "completions/min_length": 255.0, "epoch": 0.7229114971050455, "grad_norm": 0.16726252436637878, "kl": 0.05010986328125, "learning_rate": 2.162418850971325e-06, "loss": 0.0005011409521102905, "memory(GiB)": 39.09, "reward": 0.6230510473251343, "reward_std": 0.1415964514017105, "rewards/VisualizationJSONCombinedORM/mean": 0.6230510473251343, "rewards/VisualizationJSONCombinedORM/std": 0.13883306086063385, "step": 874, "train_speed(iter/s)": 0.026799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 300.0, "completions/min_length": 218.0, "epoch": 0.7237386269644334, "grad_norm": 0.1927185356616974, "kl": 0.04193115234375, "learning_rate": 2.1505434327047246e-06, "loss": 0.0004198029637336731, "memory(GiB)": 39.09, "reward": 0.41254371404647827, "reward_std": 0.10822190344333649, "rewards/VisualizationJSONCombinedORM/mean": 0.41254371404647827, "rewards/VisualizationJSONCombinedORM/std": 0.29568031430244446, "step": 875, "train_speed(iter/s)": 0.026806 }, { "epoch": 0.7237386269644334, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 367.9583333333333, "eval_completions/mean_length": 311.4375, "eval_completions/min_length": 258.7916666666667, "eval_kl": 0.046529134114583336, "eval_loss": 0.0004652130010072142, "eval_reward": 0.48513424458603066, "eval_reward_std": 0.0770002151063333, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.48513424458603066, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07700022053904831, "eval_runtime": 313.0366, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 313.8125, "completions/min_length": 243.0, "epoch": 0.7245657568238213, "grad_norm": 0.2099900245666504, "kl": 0.031005859375, "learning_rate": 2.138691772099863e-06, "loss": 0.00031027011573314667, "memory(GiB)": 39.09, "reward": 0.5564221143722534, "reward_std": 0.10550105571746826, "rewards/VisualizationJSONCombinedORM/mean": 0.5564221143722534, "rewards/VisualizationJSONCombinedORM/std": 0.1138799786567688, "step": 876, "train_speed(iter/s)": 0.02656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 314.4375, "completions/min_length": 220.0, "epoch": 0.7253928866832092, "grad_norm": 0.23212002217769623, "kl": 0.0517578125, "learning_rate": 2.1268639679712814e-06, "loss": 0.0005182698369026184, "memory(GiB)": 39.09, "reward": 0.34522101283073425, "reward_std": 0.04822377488017082, "rewards/VisualizationJSONCombinedORM/mean": 0.34522101283073425, "rewards/VisualizationJSONCombinedORM/std": 0.13444958627223969, "step": 877, "train_speed(iter/s)": 0.02657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 283.6875, "completions/min_length": 227.0, "epoch": 0.7262200165425972, "grad_norm": 0.1796262264251709, "kl": 0.090576171875, "learning_rate": 2.115060118934616e-06, "loss": 0.0009064674377441406, "memory(GiB)": 39.09, "reward": 0.5329114198684692, "reward_std": 0.06288902461528778, "rewards/VisualizationJSONCombinedORM/mean": 0.5329114198684692, "rewards/VisualizationJSONCombinedORM/std": 0.18103304505348206, "step": 878, "train_speed(iter/s)": 0.026579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 306.625, "completions/min_length": 241.0, "epoch": 0.7270471464019851, "grad_norm": 0.16207191348075867, "kl": 0.06597900390625, "learning_rate": 2.1032803234057725e-06, "loss": 0.0006590783596038818, "memory(GiB)": 39.09, "reward": 0.4491407871246338, "reward_std": 0.09640015661716461, "rewards/VisualizationJSONCombinedORM/mean": 0.4491407871246338, "rewards/VisualizationJSONCombinedORM/std": 0.09518858045339584, "step": 879, "train_speed(iter/s)": 0.026588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 298.4375, "completions/min_length": 234.0, "epoch": 0.727874276261373, "grad_norm": 0.1560741364955902, "kl": 0.0335693359375, "learning_rate": 2.0915246796001077e-06, "loss": 0.00033597275614738464, "memory(GiB)": 39.09, "reward": 0.719169020652771, "reward_std": 0.11880229413509369, "rewards/VisualizationJSONCombinedORM/mean": 0.719169020652771, "rewards/VisualizationJSONCombinedORM/std": 0.11722152680158615, "step": 880, "train_speed(iter/s)": 0.026596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 298.5, "completions/min_length": 221.0, "epoch": 0.728701406120761, "grad_norm": 0.17216403782367706, "kl": 0.04010009765625, "learning_rate": 2.0797932855316183e-06, "loss": 0.00040124356746673584, "memory(GiB)": 39.09, "reward": 0.5288676023483276, "reward_std": 0.09285244345664978, "rewards/VisualizationJSONCombinedORM/mean": 0.5288676023483276, "rewards/VisualizationJSONCombinedORM/std": 0.20593847334384918, "step": 881, "train_speed(iter/s)": 0.026605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 290.9375, "completions/min_length": 235.0, "epoch": 0.7295285359801489, "grad_norm": 0.1717369705438614, "kl": 0.0411376953125, "learning_rate": 2.0680862390121015e-06, "loss": 0.0004106462001800537, "memory(GiB)": 39.09, "reward": 0.6912218928337097, "reward_std": 0.11187393218278885, "rewards/VisualizationJSONCombinedORM/mean": 0.6912218928337097, "rewards/VisualizationJSONCombinedORM/std": 0.12770648300647736, "step": 882, "train_speed(iter/s)": 0.026616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 331.75, "completions/min_length": 255.0, "epoch": 0.7303556658395368, "grad_norm": 0.16137467324733734, "kl": 0.04144287109375, "learning_rate": 2.056403637650371e-06, "loss": 0.00041428208351135254, "memory(GiB)": 39.09, "reward": 0.7100121974945068, "reward_std": 0.09168775379657745, "rewards/VisualizationJSONCombinedORM/mean": 0.7100121974945068, "rewards/VisualizationJSONCombinedORM/std": 0.10120991617441177, "step": 883, "train_speed(iter/s)": 0.026624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 314.125, "completions/min_length": 230.0, "epoch": 0.7311827956989247, "grad_norm": 0.17271988093852997, "kl": 0.0556640625, "learning_rate": 2.0447455788514105e-06, "loss": 0.0005563721060752869, "memory(GiB)": 39.09, "reward": 0.7137552499771118, "reward_std": 0.10787488520145416, "rewards/VisualizationJSONCombinedORM/mean": 0.7137552499771118, "rewards/VisualizationJSONCombinedORM/std": 0.11939222365617752, "step": 884, "train_speed(iter/s)": 0.026633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 311.0625, "completions/min_length": 229.0, "epoch": 0.7320099255583127, "grad_norm": 0.16978950798511505, "kl": 0.062744140625, "learning_rate": 2.0331121598155905e-06, "loss": 0.0006281137466430664, "memory(GiB)": 39.09, "reward": 0.5755322575569153, "reward_std": 0.08681157231330872, "rewards/VisualizationJSONCombinedORM/mean": 0.5755322575569153, "rewards/VisualizationJSONCombinedORM/std": 0.17957919836044312, "step": 885, "train_speed(iter/s)": 0.026643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 279.5, "completions/min_length": 230.0, "epoch": 0.7328370554177006, "grad_norm": 0.17366378009319305, "kl": 0.06256103515625, "learning_rate": 2.0215034775378336e-06, "loss": 0.0006255563348531723, "memory(GiB)": 39.09, "reward": 0.5469871759414673, "reward_std": 0.0933712050318718, "rewards/VisualizationJSONCombinedORM/mean": 0.5469871759414673, "rewards/VisualizationJSONCombinedORM/std": 0.10806877166032791, "step": 886, "train_speed(iter/s)": 0.026655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 326.0625, "completions/min_length": 274.0, "epoch": 0.7336641852770885, "grad_norm": 0.16424280405044556, "kl": 0.05853271484375, "learning_rate": 2.009919628806826e-06, "loss": 0.0005847401916980743, "memory(GiB)": 39.09, "reward": 0.5845077633857727, "reward_std": 0.08421964198350906, "rewards/VisualizationJSONCombinedORM/mean": 0.5845077633857727, "rewards/VisualizationJSONCombinedORM/std": 0.13657402992248535, "step": 887, "train_speed(iter/s)": 0.026665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 304.875, "completions/min_length": 227.0, "epoch": 0.7344913151364765, "grad_norm": 0.18443547189235687, "kl": 0.0509033203125, "learning_rate": 1.9983607102041974e-06, "loss": 0.0005079880356788635, "memory(GiB)": 39.09, "reward": 0.3622606694698334, "reward_std": 0.05311238765716553, "rewards/VisualizationJSONCombinedORM/mean": 0.3622606694698334, "rewards/VisualizationJSONCombinedORM/std": 0.14895573258399963, "step": 888, "train_speed(iter/s)": 0.026676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 322.6875, "completions/min_length": 262.0, "epoch": 0.7353184449958643, "grad_norm": 0.1884668916463852, "kl": 0.03521728515625, "learning_rate": 1.9868268181037186e-06, "loss": 0.0003523658961057663, "memory(GiB)": 39.09, "reward": 0.6016180515289307, "reward_std": 0.11266402900218964, "rewards/VisualizationJSONCombinedORM/mean": 0.6016180515289307, "rewards/VisualizationJSONCombinedORM/std": 0.12126846611499786, "step": 889, "train_speed(iter/s)": 0.026685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 281.1875, "completions/min_length": 235.0, "epoch": 0.7361455748552522, "grad_norm": 0.15662257373332977, "kl": 0.043212890625, "learning_rate": 1.9753180486705013e-06, "loss": 0.0004314593970775604, "memory(GiB)": 39.09, "reward": 0.45233622193336487, "reward_std": 0.07394913583993912, "rewards/VisualizationJSONCombinedORM/mean": 0.45233622193336487, "rewards/VisualizationJSONCombinedORM/std": 0.25281792879104614, "step": 890, "train_speed(iter/s)": 0.026695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 303.0625, "completions/min_length": 238.0, "epoch": 0.7369727047146402, "grad_norm": 0.16865397989749908, "kl": 0.05352783203125, "learning_rate": 1.963834497860192e-06, "loss": 0.0005353409796953201, "memory(GiB)": 39.09, "reward": 0.2831394076347351, "reward_std": 0.044488199055194855, "rewards/VisualizationJSONCombinedORM/mean": 0.2831394076347351, "rewards/VisualizationJSONCombinedORM/std": 0.045292362570762634, "step": 891, "train_speed(iter/s)": 0.026704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 267.5625, "completions/min_length": 211.0, "epoch": 0.7377998345740281, "grad_norm": 0.2070365995168686, "kl": 0.03326416015625, "learning_rate": 1.95237626141818e-06, "loss": 0.0003328882157802582, "memory(GiB)": 39.09, "reward": 0.4285798966884613, "reward_std": 0.1377953439950943, "rewards/VisualizationJSONCombinedORM/mean": 0.4285798966884613, "rewards/VisualizationJSONCombinedORM/std": 0.160593181848526, "step": 892, "train_speed(iter/s)": 0.026715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 308.875, "completions/min_length": 259.0, "epoch": 0.738626964433416, "grad_norm": 0.2220798134803772, "kl": 0.04376220703125, "learning_rate": 1.9409434348787824e-06, "loss": 0.0004378855228424072, "memory(GiB)": 39.09, "reward": 0.47580486536026, "reward_std": 0.12174156308174133, "rewards/VisualizationJSONCombinedORM/mean": 0.47580486536026, "rewards/VisualizationJSONCombinedORM/std": 0.11809342354536057, "step": 893, "train_speed(iter/s)": 0.026726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 312.375, "completions/min_length": 231.0, "epoch": 0.739454094292804, "grad_norm": 0.1802222579717636, "kl": 0.03515625, "learning_rate": 1.9295361135644724e-06, "loss": 0.00035131722688674927, "memory(GiB)": 39.09, "reward": 0.4331345558166504, "reward_std": 0.057648368179798126, "rewards/VisualizationJSONCombinedORM/mean": 0.4331345558166504, "rewards/VisualizationJSONCombinedORM/std": 0.06243215501308441, "step": 894, "train_speed(iter/s)": 0.026734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 311.625, "completions/min_length": 267.0, "epoch": 0.7402812241521919, "grad_norm": 0.17887131869792938, "kl": 0.0382080078125, "learning_rate": 1.9181543925850544e-06, "loss": 0.0003825873136520386, "memory(GiB)": 39.09, "reward": 0.5724925994873047, "reward_std": 0.10601595789194107, "rewards/VisualizationJSONCombinedORM/mean": 0.5724925994873047, "rewards/VisualizationJSONCombinedORM/std": 0.12340089678764343, "step": 895, "train_speed(iter/s)": 0.026742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 291.75, "completions/min_length": 246.0, "epoch": 0.7411083540115798, "grad_norm": 0.1903621256351471, "kl": 0.0540771484375, "learning_rate": 1.9067983668369038e-06, "loss": 0.0005414299666881561, "memory(GiB)": 39.09, "reward": 0.45546016097068787, "reward_std": 0.10344547033309937, "rewards/VisualizationJSONCombinedORM/mean": 0.45546016097068787, "rewards/VisualizationJSONCombinedORM/std": 0.15401926636695862, "step": 896, "train_speed(iter/s)": 0.026752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 297.6875, "completions/min_length": 241.0, "epoch": 0.7419354838709677, "grad_norm": 0.16869594156742096, "kl": 0.03106689453125, "learning_rate": 1.8954681310021434e-06, "loss": 0.0003106147050857544, "memory(GiB)": 39.09, "reward": 0.7114471197128296, "reward_std": 0.09218356013298035, "rewards/VisualizationJSONCombinedORM/mean": 0.7114471197128296, "rewards/VisualizationJSONCombinedORM/std": 0.09065741300582886, "step": 897, "train_speed(iter/s)": 0.026762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 297.625, "completions/min_length": 252.0, "epoch": 0.7427626137303557, "grad_norm": 0.16181322932243347, "kl": 0.0894775390625, "learning_rate": 1.8841637795478835e-06, "loss": 0.0008973665535449982, "memory(GiB)": 39.09, "reward": 0.4018268585205078, "reward_std": 0.057580988854169846, "rewards/VisualizationJSONCombinedORM/mean": 0.4018268585205078, "rewards/VisualizationJSONCombinedORM/std": 0.08612435311079025, "step": 898, "train_speed(iter/s)": 0.026775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 296.4375, "completions/min_length": 233.0, "epoch": 0.7435897435897436, "grad_norm": 0.21609921753406525, "kl": 0.0723876953125, "learning_rate": 1.872885406725412e-06, "loss": 0.0007213559001684189, "memory(GiB)": 39.09, "reward": 0.5753511190414429, "reward_std": 0.1245214194059372, "rewards/VisualizationJSONCombinedORM/mean": 0.5753511190414429, "rewards/VisualizationJSONCombinedORM/std": 0.12390739470720291, "step": 899, "train_speed(iter/s)": 0.026785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 293.4375, "completions/min_length": 234.0, "epoch": 0.7444168734491315, "grad_norm": 0.21191194653511047, "kl": 0.06964111328125, "learning_rate": 1.8616331065694193e-06, "loss": 0.0006953850388526917, "memory(GiB)": 39.09, "reward": 0.6497175097465515, "reward_std": 0.10826604068279266, "rewards/VisualizationJSONCombinedORM/mean": 0.6497175097465515, "rewards/VisualizationJSONCombinedORM/std": 0.10768633335828781, "step": 900, "train_speed(iter/s)": 0.026798 }, { "epoch": 0.7444168734491315, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 359.0833333333333, "eval_completions/mean_length": 303.4114583333333, "eval_completions/min_length": 250.79166666666666, "eval_kl": 0.048055013020833336, "eval_loss": 0.0004813149571418762, "eval_reward": 0.4684443057825168, "eval_reward_std": 0.08494511658015351, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4684443057825168, "eval_rewards/VisualizationJSONCombinedORM/std": 0.08494511999500294, "eval_runtime": 307.7019, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 304.0625, "completions/min_length": 232.0, "epoch": 0.7452440033085195, "grad_norm": 0.17381373047828674, "kl": 0.04180908203125, "learning_rate": 1.8504069728972124e-06, "loss": 0.0004181191325187683, "memory(GiB)": 39.09, "reward": 0.5595912337303162, "reward_std": 0.11958572268486023, "rewards/VisualizationJSONCombinedORM/mean": 0.5595912337303162, "rewards/VisualizationJSONCombinedORM/std": 0.1161227747797966, "step": 901, "train_speed(iter/s)": 0.026559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 301.3125, "completions/min_length": 240.0, "epoch": 0.7460711331679074, "grad_norm": 0.17534957826137543, "kl": 0.05596923828125, "learning_rate": 1.8392070993079326e-06, "loss": 0.0005603022873401642, "memory(GiB)": 39.09, "reward": 0.4763094186782837, "reward_std": 0.07776413857936859, "rewards/VisualizationJSONCombinedORM/mean": 0.4763094186782837, "rewards/VisualizationJSONCombinedORM/std": 0.1716732680797577, "step": 902, "train_speed(iter/s)": 0.026567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 307.5, "completions/min_length": 223.0, "epoch": 0.7468982630272953, "grad_norm": 0.19169485569000244, "kl": 0.03643798828125, "learning_rate": 1.8280335791817733e-06, "loss": 0.00036472827196121216, "memory(GiB)": 39.09, "reward": 0.5388740301132202, "reward_std": 0.05757272243499756, "rewards/VisualizationJSONCombinedORM/mean": 0.5388740301132202, "rewards/VisualizationJSONCombinedORM/std": 0.1980902999639511, "step": 903, "train_speed(iter/s)": 0.026579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 311.8125, "completions/min_length": 259.0, "epoch": 0.7477253928866832, "grad_norm": 0.18279379606246948, "kl": 0.05145263671875, "learning_rate": 1.8168865056792029e-06, "loss": 0.0005147811025381088, "memory(GiB)": 39.09, "reward": 0.5849775075912476, "reward_std": 0.09225393831729889, "rewards/VisualizationJSONCombinedORM/mean": 0.5849775075912476, "rewards/VisualizationJSONCombinedORM/std": 0.09476412832736969, "step": 904, "train_speed(iter/s)": 0.02659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 298.1875, "completions/min_length": 249.0, "epoch": 0.7485525227460711, "grad_norm": 0.1792723834514618, "kl": 0.079833984375, "learning_rate": 1.8057659717401948e-06, "loss": 0.0007986873388290405, "memory(GiB)": 39.09, "reward": 0.3715812563896179, "reward_std": 0.07927821576595306, "rewards/VisualizationJSONCombinedORM/mean": 0.3715812563896179, "rewards/VisualizationJSONCombinedORM/std": 0.09724052250385284, "step": 905, "train_speed(iter/s)": 0.026602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 286.3125, "completions/min_length": 245.0, "epoch": 0.749379652605459, "grad_norm": 0.175351083278656, "kl": 0.06011962890625, "learning_rate": 1.7946720700834324e-06, "loss": 0.0006015822291374207, "memory(GiB)": 39.09, "reward": 0.4372860789299011, "reward_std": 0.05851557105779648, "rewards/VisualizationJSONCombinedORM/mean": 0.4372860789299011, "rewards/VisualizationJSONCombinedORM/std": 0.2593769133090973, "step": 906, "train_speed(iter/s)": 0.026611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 301.875, "completions/min_length": 252.0, "epoch": 0.750206782464847, "grad_norm": 0.17410984635353088, "kl": 0.06719970703125, "learning_rate": 1.7836048932055643e-06, "loss": 0.0006719827651977539, "memory(GiB)": 39.09, "reward": 0.48253947496414185, "reward_std": 0.0902477353811264, "rewards/VisualizationJSONCombinedORM/mean": 0.48253947496414185, "rewards/VisualizationJSONCombinedORM/std": 0.13742123544216156, "step": 907, "train_speed(iter/s)": 0.02662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 297.3125, "completions/min_length": 251.0, "epoch": 0.7510339123242349, "grad_norm": 0.23500041663646698, "kl": 0.05908203125, "learning_rate": 1.7725645333804054e-06, "loss": 0.000590987503528595, "memory(GiB)": 39.09, "reward": 0.5056084394454956, "reward_std": 0.07715575397014618, "rewards/VisualizationJSONCombinedORM/mean": 0.5056084394454956, "rewards/VisualizationJSONCombinedORM/std": 0.08579806238412857, "step": 908, "train_speed(iter/s)": 0.026627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 315.875, "completions/min_length": 222.0, "epoch": 0.7518610421836228, "grad_norm": 0.2048249989748001, "kl": 0.0447998046875, "learning_rate": 1.7615510826581906e-06, "loss": 0.0004475824534893036, "memory(GiB)": 39.09, "reward": 0.5158319473266602, "reward_std": 0.10801900923252106, "rewards/VisualizationJSONCombinedORM/mean": 0.5158319473266602, "rewards/VisualizationJSONCombinedORM/std": 0.14184121787548065, "step": 909, "train_speed(iter/s)": 0.026634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 314.75, "completions/min_length": 259.0, "epoch": 0.7526881720430108, "grad_norm": 0.19190549850463867, "kl": 0.05029296875, "learning_rate": 1.7505646328647913e-06, "loss": 0.0005019046366214752, "memory(GiB)": 39.09, "reward": 0.49537888169288635, "reward_std": 0.08323145657777786, "rewards/VisualizationJSONCombinedORM/mean": 0.49537888169288635, "rewards/VisualizationJSONCombinedORM/std": 0.21422302722930908, "step": 910, "train_speed(iter/s)": 0.02664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 311.875, "completions/min_length": 246.0, "epoch": 0.7535153019023987, "grad_norm": 0.15828479826450348, "kl": 0.0321044921875, "learning_rate": 1.7396052756009574e-06, "loss": 0.0003210250288248062, "memory(GiB)": 39.09, "reward": 0.643614649772644, "reward_std": 0.06667543947696686, "rewards/VisualizationJSONCombinedORM/mean": 0.643614649772644, "rewards/VisualizationJSONCombinedORM/std": 0.19555668532848358, "step": 911, "train_speed(iter/s)": 0.026646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 302.5, "completions/min_length": 209.0, "epoch": 0.7543424317617866, "grad_norm": 0.17933471500873566, "kl": 0.0355224609375, "learning_rate": 1.7286731022415515e-06, "loss": 0.0003555417060852051, "memory(GiB)": 39.09, "reward": 0.6976432800292969, "reward_std": 0.09828512370586395, "rewards/VisualizationJSONCombinedORM/mean": 0.6976432800292969, "rewards/VisualizationJSONCombinedORM/std": 0.11255870759487152, "step": 912, "train_speed(iter/s)": 0.026655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 317.0625, "completions/min_length": 242.0, "epoch": 0.7551695616211745, "grad_norm": 0.17896537482738495, "kl": 0.04241943359375, "learning_rate": 1.7177682039347875e-06, "loss": 0.00042463839054107666, "memory(GiB)": 39.09, "reward": 0.5587661266326904, "reward_std": 0.08590134233236313, "rewards/VisualizationJSONCombinedORM/mean": 0.5587661266326904, "rewards/VisualizationJSONCombinedORM/std": 0.12476615607738495, "step": 913, "train_speed(iter/s)": 0.026664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 318.625, "completions/min_length": 268.0, "epoch": 0.7559966914805625, "grad_norm": 0.18485453724861145, "kl": 0.036865234375, "learning_rate": 1.706890671601471e-06, "loss": 0.00036976486444473267, "memory(GiB)": 39.09, "reward": 0.6236680746078491, "reward_std": 0.10712325572967529, "rewards/VisualizationJSONCombinedORM/mean": 0.6236680746078491, "rewards/VisualizationJSONCombinedORM/std": 0.13055886328220367, "step": 914, "train_speed(iter/s)": 0.026672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 312.0625, "completions/min_length": 240.0, "epoch": 0.7568238213399504, "grad_norm": 0.18237139284610748, "kl": 0.0478515625, "learning_rate": 1.6960405959342402e-06, "loss": 0.00047814100980758667, "memory(GiB)": 39.09, "reward": 0.5221695303916931, "reward_std": 0.07597877085208893, "rewards/VisualizationJSONCombinedORM/mean": 0.5221695303916931, "rewards/VisualizationJSONCombinedORM/std": 0.21472768485546112, "step": 915, "train_speed(iter/s)": 0.026681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 276.75, "completions/min_length": 210.0, "epoch": 0.7576509511993383, "grad_norm": 0.17907363176345825, "kl": 0.0404052734375, "learning_rate": 1.6852180673968093e-06, "loss": 0.00040420517325401306, "memory(GiB)": 39.09, "reward": 0.45676907896995544, "reward_std": 0.06110260635614395, "rewards/VisualizationJSONCombinedORM/mean": 0.45676907896995544, "rewards/VisualizationJSONCombinedORM/std": 0.08328559249639511, "step": 916, "train_speed(iter/s)": 0.026689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 274.875, "completions/min_length": 226.0, "epoch": 0.7584780810587263, "grad_norm": 0.19869950413703918, "kl": 0.0513916015625, "learning_rate": 1.6744231762232178e-06, "loss": 0.000514540821313858, "memory(GiB)": 39.09, "reward": 0.46139949560165405, "reward_std": 0.10165999084711075, "rewards/VisualizationJSONCombinedORM/mean": 0.46139949560165405, "rewards/VisualizationJSONCombinedORM/std": 0.14793500304222107, "step": 917, "train_speed(iter/s)": 0.0267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 309.4375, "completions/min_length": 244.0, "epoch": 0.7593052109181141, "grad_norm": 0.1540646106004715, "kl": 0.0836181640625, "learning_rate": 1.6636560124170713e-06, "loss": 0.0008366033434867859, "memory(GiB)": 39.09, "reward": 0.5141621828079224, "reward_std": 0.09604312479496002, "rewards/VisualizationJSONCombinedORM/mean": 0.5141621828079224, "rewards/VisualizationJSONCombinedORM/std": 0.16510051488876343, "step": 918, "train_speed(iter/s)": 0.026707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 293.125, "completions/min_length": 212.0, "epoch": 0.760132340777502, "grad_norm": 0.19132055342197418, "kl": 0.0626220703125, "learning_rate": 1.6529166657508033e-06, "loss": 0.0006265118718147278, "memory(GiB)": 39.09, "reward": 0.3355531096458435, "reward_std": 0.08069464564323425, "rewards/VisualizationJSONCombinedORM/mean": 0.3355531096458435, "rewards/VisualizationJSONCombinedORM/std": 0.14324812591075897, "step": 919, "train_speed(iter/s)": 0.026716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 326.625, "completions/min_length": 276.0, "epoch": 0.76095947063689, "grad_norm": 0.1697060614824295, "kl": 0.05755615234375, "learning_rate": 1.642205225764908e-06, "loss": 0.0005762409418821335, "memory(GiB)": 39.09, "reward": 0.5689199566841125, "reward_std": 0.17418937385082245, "rewards/VisualizationJSONCombinedORM/mean": 0.5689199566841125, "rewards/VisualizationJSONCombinedORM/std": 0.2049141377210617, "step": 920, "train_speed(iter/s)": 0.026723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 299.4375, "completions/min_length": 247.0, "epoch": 0.7617866004962779, "grad_norm": 0.1698896735906601, "kl": 0.04595947265625, "learning_rate": 1.6315217817672142e-06, "loss": 0.00045910850167274475, "memory(GiB)": 39.09, "reward": 0.48278218507766724, "reward_std": 0.06971963495016098, "rewards/VisualizationJSONCombinedORM/mean": 0.48278218507766724, "rewards/VisualizationJSONCombinedORM/std": 0.2616596519947052, "step": 921, "train_speed(iter/s)": 0.026729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 290.625, "completions/min_length": 243.0, "epoch": 0.7626137303556658, "grad_norm": 0.18780097365379333, "kl": 0.06304931640625, "learning_rate": 1.6208664228321254e-06, "loss": 0.0006307549774646759, "memory(GiB)": 39.09, "reward": 0.6402691602706909, "reward_std": 0.1468389630317688, "rewards/VisualizationJSONCombinedORM/mean": 0.6402691602706909, "rewards/VisualizationJSONCombinedORM/std": 0.1595073938369751, "step": 922, "train_speed(iter/s)": 0.026736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 327.0, "completions/min_length": 240.0, "epoch": 0.7634408602150538, "grad_norm": 0.1696920543909073, "kl": 0.04571533203125, "learning_rate": 1.610239237799885e-06, "loss": 0.00045687146484851837, "memory(GiB)": 39.09, "reward": 0.5142549872398376, "reward_std": 0.04952407628297806, "rewards/VisualizationJSONCombinedORM/mean": 0.5142549872398376, "rewards/VisualizationJSONCombinedORM/std": 0.2534283399581909, "step": 923, "train_speed(iter/s)": 0.026745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 315.125, "completions/min_length": 216.0, "epoch": 0.7642679900744417, "grad_norm": 0.1700543314218521, "kl": 0.048095703125, "learning_rate": 1.5996403152758315e-06, "loss": 0.0004810616374015808, "memory(GiB)": 39.09, "reward": 0.5656055212020874, "reward_std": 0.10048860311508179, "rewards/VisualizationJSONCombinedORM/mean": 0.5656055212020874, "rewards/VisualizationJSONCombinedORM/std": 0.10558974742889404, "step": 924, "train_speed(iter/s)": 0.026754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 296.4375, "completions/min_length": 261.0, "epoch": 0.7650951199338296, "grad_norm": 0.1821938157081604, "kl": 0.02508544921875, "learning_rate": 1.5890697436296648e-06, "loss": 0.00025035440921783447, "memory(GiB)": 39.09, "reward": 0.5566007494926453, "reward_std": 0.07906714081764221, "rewards/VisualizationJSONCombinedORM/mean": 0.5566007494926453, "rewards/VisualizationJSONCombinedORM/std": 0.24304448068141937, "step": 925, "train_speed(iter/s)": 0.026764 }, { "epoch": 0.7650951199338296, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 361.7916666666667, "eval_completions/mean_length": 306.8385416666667, "eval_completions/min_length": 252.16666666666666, "eval_kl": 0.044550577799479164, "eval_loss": 0.000447549537057057, "eval_reward": 0.47568818492194015, "eval_reward_std": 0.07558777284187575, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47568818492194015, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07558777493735154, "eval_runtime": 309.6638, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 307.0, "completions/min_length": 241.0, "epoch": 0.7659222497932175, "grad_norm": 0.19987329840660095, "kl": 0.02947998046875, "learning_rate": 1.5785276109947028e-06, "loss": 0.00029428303241729736, "memory(GiB)": 39.09, "reward": 0.36407947540283203, "reward_std": 0.046641647815704346, "rewards/VisualizationJSONCombinedORM/mean": 0.36407947540283203, "rewards/VisualizationJSONCombinedORM/std": 0.04929003119468689, "step": 926, "train_speed(iter/s)": 0.026535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 294.5625, "completions/min_length": 233.0, "epoch": 0.7667493796526055, "grad_norm": 0.17669925093650818, "kl": 0.0423583984375, "learning_rate": 1.5680140052671516e-06, "loss": 0.0004234910011291504, "memory(GiB)": 39.09, "reward": 0.5102521181106567, "reward_std": 0.08410578966140747, "rewards/VisualizationJSONCombinedORM/mean": 0.5102521181106567, "rewards/VisualizationJSONCombinedORM/std": 0.0833052545785904, "step": 927, "train_speed(iter/s)": 0.026545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 294.25, "completions/min_length": 234.0, "epoch": 0.7675765095119934, "grad_norm": 0.20727428793907166, "kl": 0.0684814453125, "learning_rate": 1.5575290141053712e-06, "loss": 0.0006851889193058014, "memory(GiB)": 39.09, "reward": 0.48196566104888916, "reward_std": 0.061299294233322144, "rewards/VisualizationJSONCombinedORM/mean": 0.48196566104888916, "rewards/VisualizationJSONCombinedORM/std": 0.15279361605644226, "step": 928, "train_speed(iter/s)": 0.026555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 304.75, "completions/min_length": 239.0, "epoch": 0.7684036393713813, "grad_norm": 0.18217052519321442, "kl": 0.0509033203125, "learning_rate": 1.5470727249291423e-06, "loss": 0.0005098432302474976, "memory(GiB)": 39.09, "reward": 0.29799360036849976, "reward_std": 0.05250922590494156, "rewards/VisualizationJSONCombinedORM/mean": 0.29799360036849976, "rewards/VisualizationJSONCombinedORM/std": 0.15910518169403076, "step": 929, "train_speed(iter/s)": 0.026564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 303.125, "completions/min_length": 218.0, "epoch": 0.7692307692307693, "grad_norm": 0.21633611619472504, "kl": 0.03717041015625, "learning_rate": 1.5366452249189462e-06, "loss": 0.0003718230873346329, "memory(GiB)": 39.09, "reward": 0.5608936548233032, "reward_std": 0.09697893261909485, "rewards/VisualizationJSONCombinedORM/mean": 0.5608936548233032, "rewards/VisualizationJSONCombinedORM/std": 0.15403145551681519, "step": 930, "train_speed(iter/s)": 0.026572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 304.0, "completions/min_length": 250.0, "epoch": 0.7700578990901572, "grad_norm": 0.16202129423618317, "kl": 0.034759521484375, "learning_rate": 1.52624660101522e-06, "loss": 0.00034772977232933044, "memory(GiB)": 39.09, "reward": 0.3792062997817993, "reward_std": 0.054036203771829605, "rewards/VisualizationJSONCombinedORM/mean": 0.3792062997817993, "rewards/VisualizationJSONCombinedORM/std": 0.05890047177672386, "step": 931, "train_speed(iter/s)": 0.026584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 314.5, "completions/min_length": 243.0, "epoch": 0.770885028949545, "grad_norm": 0.18666529655456543, "kl": 0.06085205078125, "learning_rate": 1.5158769399176559e-06, "loss": 0.0006089508533477783, "memory(GiB)": 39.09, "reward": 0.5907208919525146, "reward_std": 0.09690713882446289, "rewards/VisualizationJSONCombinedORM/mean": 0.5907208919525146, "rewards/VisualizationJSONCombinedORM/std": 0.14663760364055634, "step": 932, "train_speed(iter/s)": 0.026594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 312.625, "completions/min_length": 258.0, "epoch": 0.771712158808933, "grad_norm": 0.15503954887390137, "kl": 0.025787353515625, "learning_rate": 1.505536328084453e-06, "loss": 0.00025802478194236755, "memory(GiB)": 39.09, "reward": 0.723086953163147, "reward_std": 0.08128078281879425, "rewards/VisualizationJSONCombinedORM/mean": 0.723086953163147, "rewards/VisualizationJSONCombinedORM/std": 0.12692447006702423, "step": 933, "train_speed(iter/s)": 0.026603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 300.0, "completions/min_length": 231.0, "epoch": 0.7725392886683209, "grad_norm": 0.20092220604419708, "kl": 0.06591796875, "learning_rate": 1.4952248517316215e-06, "loss": 0.0006587030366063118, "memory(GiB)": 39.09, "reward": 0.6522202491760254, "reward_std": 0.12363910675048828, "rewards/VisualizationJSONCombinedORM/mean": 0.6522202491760254, "rewards/VisualizationJSONCombinedORM/std": 0.12224866449832916, "step": 934, "train_speed(iter/s)": 0.026613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 314.4375, "completions/min_length": 235.0, "epoch": 0.7733664185277088, "grad_norm": 0.1882186383008957, "kl": 0.0565185546875, "learning_rate": 1.4849425968322384e-06, "loss": 0.0005652578547596931, "memory(GiB)": 39.09, "reward": 0.6148444414138794, "reward_std": 0.09811494499444962, "rewards/VisualizationJSONCombinedORM/mean": 0.6148444414138794, "rewards/VisualizationJSONCombinedORM/std": 0.188655287027359, "step": 935, "train_speed(iter/s)": 0.026625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 316.5625, "completions/min_length": 266.0, "epoch": 0.7741935483870968, "grad_norm": 0.17184267938137054, "kl": 0.04962158203125, "learning_rate": 1.4746896491157541e-06, "loss": 0.0004967711865901947, "memory(GiB)": 39.09, "reward": 0.4587188959121704, "reward_std": 0.07016000896692276, "rewards/VisualizationJSONCombinedORM/mean": 0.4587188959121704, "rewards/VisualizationJSONCombinedORM/std": 0.2398512214422226, "step": 936, "train_speed(iter/s)": 0.026633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 323.375, "completions/min_length": 286.0, "epoch": 0.7750206782464847, "grad_norm": 0.1991884559392929, "kl": 0.05401611328125, "learning_rate": 1.4644660940672628e-06, "loss": 0.000539824366569519, "memory(GiB)": 39.09, "reward": 0.4755786955356598, "reward_std": 0.09160035848617554, "rewards/VisualizationJSONCombinedORM/mean": 0.4755786955356598, "rewards/VisualizationJSONCombinedORM/std": 0.14050327241420746, "step": 937, "train_speed(iter/s)": 0.026642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 303.125, "completions/min_length": 262.0, "epoch": 0.7758478081058726, "grad_norm": 0.2097424864768982, "kl": 0.05950927734375, "learning_rate": 1.4542720169267933e-06, "loss": 0.000594213604927063, "memory(GiB)": 39.09, "reward": 0.5277783870697021, "reward_std": 0.11160619556903839, "rewards/VisualizationJSONCombinedORM/mean": 0.5277783870697021, "rewards/VisualizationJSONCombinedORM/std": 0.17100079357624054, "step": 938, "train_speed(iter/s)": 0.026651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 326.4375, "completions/min_length": 255.0, "epoch": 0.7766749379652605, "grad_norm": 0.23339961469173431, "kl": 0.0653076171875, "learning_rate": 1.4441075026885999e-06, "loss": 0.0006521232426166534, "memory(GiB)": 39.09, "reward": 0.2560023069381714, "reward_std": 0.04550236836075783, "rewards/VisualizationJSONCombinedORM/mean": 0.2560023069381714, "rewards/VisualizationJSONCombinedORM/std": 0.06255263835191727, "step": 939, "train_speed(iter/s)": 0.026664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 335.4375, "completions/min_length": 260.0, "epoch": 0.7775020678246485, "grad_norm": 0.1888681948184967, "kl": 0.0408935546875, "learning_rate": 1.433972636100452e-06, "loss": 0.00040875375270843506, "memory(GiB)": 39.09, "reward": 0.6148555874824524, "reward_std": 0.09331878274679184, "rewards/VisualizationJSONCombinedORM/mean": 0.6148555874824524, "rewards/VisualizationJSONCombinedORM/std": 0.11623988300561905, "step": 940, "train_speed(iter/s)": 0.026672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 309.4375, "completions/min_length": 242.0, "epoch": 0.7783291976840364, "grad_norm": 0.16717080771923065, "kl": 0.041961669921875, "learning_rate": 1.423867501662934e-06, "loss": 0.00041904300451278687, "memory(GiB)": 39.09, "reward": 0.6325931549072266, "reward_std": 0.10581882297992706, "rewards/VisualizationJSONCombinedORM/mean": 0.6325931549072266, "rewards/VisualizationJSONCombinedORM/std": 0.12187089771032333, "step": 941, "train_speed(iter/s)": 0.026682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 279.5, "completions/min_length": 221.0, "epoch": 0.7791563275434243, "grad_norm": 0.18281468749046326, "kl": 0.039581298828125, "learning_rate": 1.4137921836287238e-06, "loss": 0.0003961920738220215, "memory(GiB)": 39.09, "reward": 0.6603606939315796, "reward_std": 0.09938378632068634, "rewards/VisualizationJSONCombinedORM/mean": 0.6603606939315796, "rewards/VisualizationJSONCombinedORM/std": 0.15954913198947906, "step": 942, "train_speed(iter/s)": 0.026694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 288.125, "completions/min_length": 225.0, "epoch": 0.7799834574028123, "grad_norm": 0.15531764924526215, "kl": 0.039306640625, "learning_rate": 1.4037467660019156e-06, "loss": 0.0003934726119041443, "memory(GiB)": 39.09, "reward": 0.6107636094093323, "reward_std": 0.08395546674728394, "rewards/VisualizationJSONCombinedORM/mean": 0.6107636094093323, "rewards/VisualizationJSONCombinedORM/std": 0.09175485372543335, "step": 943, "train_speed(iter/s)": 0.026702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 282.8125, "completions/min_length": 235.0, "epoch": 0.7808105872622002, "grad_norm": 0.1683541238307953, "kl": 0.075927734375, "learning_rate": 1.3937313325372919e-06, "loss": 0.0007594674825668335, "memory(GiB)": 39.09, "reward": 0.516433835029602, "reward_std": 0.09382247924804688, "rewards/VisualizationJSONCombinedORM/mean": 0.516433835029602, "rewards/VisualizationJSONCombinedORM/std": 0.10922622680664062, "step": 944, "train_speed(iter/s)": 0.026713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 310.125, "completions/min_length": 264.0, "epoch": 0.7816377171215881, "grad_norm": 0.17211803793907166, "kl": 0.079833984375, "learning_rate": 1.383745966739652e-06, "loss": 0.000797939021140337, "memory(GiB)": 39.09, "reward": 0.45664721727371216, "reward_std": 0.052321188151836395, "rewards/VisualizationJSONCombinedORM/mean": 0.45664721727371216, "rewards/VisualizationJSONCombinedORM/std": 0.21810147166252136, "step": 945, "train_speed(iter/s)": 0.026721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 300.5625, "completions/min_length": 239.0, "epoch": 0.782464846980976, "grad_norm": 0.17969079315662384, "kl": 0.03729248046875, "learning_rate": 1.37379075186309e-06, "loss": 0.00037360191345214844, "memory(GiB)": 39.09, "reward": 0.4005689024925232, "reward_std": 0.06056099012494087, "rewards/VisualizationJSONCombinedORM/mean": 0.4005689024925232, "rewards/VisualizationJSONCombinedORM/std": 0.09028639644384384, "step": 946, "train_speed(iter/s)": 0.02673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 311.625, "completions/min_length": 235.0, "epoch": 0.7832919768403639, "grad_norm": 0.16694603860378265, "kl": 0.0723876953125, "learning_rate": 1.3638657709103238e-06, "loss": 0.0007227212190628052, "memory(GiB)": 39.09, "reward": 0.4896264970302582, "reward_std": 0.05942791700363159, "rewards/VisualizationJSONCombinedORM/mean": 0.4896264970302582, "rewards/VisualizationJSONCombinedORM/std": 0.2488601952791214, "step": 947, "train_speed(iter/s)": 0.026737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 296.5625, "completions/min_length": 216.0, "epoch": 0.7841191066997518, "grad_norm": 0.1966501623392105, "kl": 0.04888916015625, "learning_rate": 1.3539711066319873e-06, "loss": 0.0004902295768260956, "memory(GiB)": 39.09, "reward": 0.6443449854850769, "reward_std": 0.07379058003425598, "rewards/VisualizationJSONCombinedORM/mean": 0.6443449854850769, "rewards/VisualizationJSONCombinedORM/std": 0.07443422824144363, "step": 948, "train_speed(iter/s)": 0.026749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 314.8125, "completions/min_length": 251.0, "epoch": 0.7849462365591398, "grad_norm": 0.14441372454166412, "kl": 0.04132080078125, "learning_rate": 1.3441068415259462e-06, "loss": 0.00041367486119270325, "memory(GiB)": 39.09, "reward": 0.44014525413513184, "reward_std": 0.09593071043491364, "rewards/VisualizationJSONCombinedORM/mean": 0.44014525413513184, "rewards/VisualizationJSONCombinedORM/std": 0.13118739426136017, "step": 949, "train_speed(iter/s)": 0.026755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 289.9375, "completions/min_length": 250.0, "epoch": 0.7857733664185277, "grad_norm": 0.1596180945634842, "kl": 0.05853271484375, "learning_rate": 1.334273057836611e-06, "loss": 0.0005857124924659729, "memory(GiB)": 39.09, "reward": 0.6119577884674072, "reward_std": 0.0885331928730011, "rewards/VisualizationJSONCombinedORM/mean": 0.6119577884674072, "rewards/VisualizationJSONCombinedORM/std": 0.08816297352313995, "step": 950, "train_speed(iter/s)": 0.026764 }, { "epoch": 0.7857733664185277, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 371.125, "eval_completions/mean_length": 306.4375, "eval_completions/min_length": 253.75, "eval_kl": 0.0519256591796875, "eval_loss": 0.0005231878603808582, "eval_reward": 0.4802237693220377, "eval_reward_std": 0.07407595527668794, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4802237693220377, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07407595690650244, "eval_runtime": 314.8087, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 329.875, "completions/min_length": 267.0, "epoch": 0.7866004962779156, "grad_norm": 0.16388611495494843, "kl": 0.037841796875, "learning_rate": 1.3244698375542492e-06, "loss": 0.00037894025444984436, "memory(GiB)": 39.09, "reward": 0.6715024709701538, "reward_std": 0.09084511548280716, "rewards/VisualizationJSONCombinedORM/mean": 0.6715024709701538, "rewards/VisualizationJSONCombinedORM/std": 0.08878372609615326, "step": 951, "train_speed(iter/s)": 0.026536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 291.1875, "completions/min_length": 237.0, "epoch": 0.7874276261373035, "grad_norm": 0.24147553741931915, "kl": 0.049560546875, "learning_rate": 1.3146972624143024e-06, "loss": 0.0004952484741806984, "memory(GiB)": 39.09, "reward": 0.48047327995300293, "reward_std": 0.11867618560791016, "rewards/VisualizationJSONCombinedORM/mean": 0.48047327995300293, "rewards/VisualizationJSONCombinedORM/std": 0.16038931906223297, "step": 952, "train_speed(iter/s)": 0.026547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 282.4375, "completions/min_length": 230.0, "epoch": 0.7882547559966915, "grad_norm": 0.1725580096244812, "kl": 0.07427978515625, "learning_rate": 1.3049554138967052e-06, "loss": 0.0007417015731334686, "memory(GiB)": 39.09, "reward": 0.5901263356208801, "reward_std": 0.08156348764896393, "rewards/VisualizationJSONCombinedORM/mean": 0.5901263356208801, "rewards/VisualizationJSONCombinedORM/std": 0.1599806696176529, "step": 953, "train_speed(iter/s)": 0.026553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 335.0625, "completions/min_length": 242.0, "epoch": 0.7890818858560794, "grad_norm": 0.18264953792095184, "kl": 0.05059814453125, "learning_rate": 1.2952443732252058e-06, "loss": 0.0005054175853729248, "memory(GiB)": 39.09, "reward": 0.32619673013687134, "reward_std": 0.049537692219018936, "rewards/VisualizationJSONCombinedORM/mean": 0.32619673013687134, "rewards/VisualizationJSONCombinedORM/std": 0.16211195290088654, "step": 954, "train_speed(iter/s)": 0.026562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 329.8125, "completions/min_length": 257.0, "epoch": 0.7899090157154673, "grad_norm": 0.17407651245594025, "kl": 0.0386962890625, "learning_rate": 1.2855642213666858e-06, "loss": 0.0003863498568534851, "memory(GiB)": 39.09, "reward": 0.4015618860721588, "reward_std": 0.056100912392139435, "rewards/VisualizationJSONCombinedORM/mean": 0.4015618860721588, "rewards/VisualizationJSONCombinedORM/std": 0.15747220814228058, "step": 955, "train_speed(iter/s)": 0.02657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 316.1875, "completions/min_length": 237.0, "epoch": 0.7907361455748553, "grad_norm": 0.21592716872692108, "kl": 0.04412841796875, "learning_rate": 1.2759150390304953e-06, "loss": 0.0004417337477207184, "memory(GiB)": 39.09, "reward": 0.5753893852233887, "reward_std": 0.06055102124810219, "rewards/VisualizationJSONCombinedORM/mean": 0.5753893852233887, "rewards/VisualizationJSONCombinedORM/std": 0.17271848022937775, "step": 956, "train_speed(iter/s)": 0.026577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 303.0, "completions/min_length": 251.0, "epoch": 0.7915632754342432, "grad_norm": 0.15612152218818665, "kl": 0.0623779296875, "learning_rate": 1.266296906667762e-06, "loss": 0.0006231889128684998, "memory(GiB)": 39.09, "reward": 0.502646803855896, "reward_std": 0.061235781759023666, "rewards/VisualizationJSONCombinedORM/mean": 0.502646803855896, "rewards/VisualizationJSONCombinedORM/std": 0.19085870683193207, "step": 957, "train_speed(iter/s)": 0.026586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 299.1875, "completions/min_length": 243.0, "epoch": 0.7923904052936311, "grad_norm": 0.1722518652677536, "kl": 0.06695556640625, "learning_rate": 1.256709904470741e-06, "loss": 0.0006684381514787674, "memory(GiB)": 39.09, "reward": 0.48291271924972534, "reward_std": 0.09620439261198044, "rewards/VisualizationJSONCombinedORM/mean": 0.48291271924972534, "rewards/VisualizationJSONCombinedORM/std": 0.20711937546730042, "step": 958, "train_speed(iter/s)": 0.026596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 321.8125, "completions/min_length": 255.0, "epoch": 0.7932175351530191, "grad_norm": 0.16717499494552612, "kl": 0.04168701171875, "learning_rate": 1.2471541123721292e-06, "loss": 0.000416390597820282, "memory(GiB)": 39.09, "reward": 0.4602806866168976, "reward_std": 0.05723627656698227, "rewards/VisualizationJSONCombinedORM/mean": 0.4602806866168976, "rewards/VisualizationJSONCombinedORM/std": 0.1907578557729721, "step": 959, "train_speed(iter/s)": 0.026604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 333.4375, "completions/min_length": 265.0, "epoch": 0.794044665012407, "grad_norm": 0.1847013235092163, "kl": 0.06036376953125, "learning_rate": 1.2376296100444092e-06, "loss": 0.0006027966737747192, "memory(GiB)": 39.09, "reward": 0.4812857508659363, "reward_std": 0.07791218161582947, "rewards/VisualizationJSONCombinedORM/mean": 0.4812857508659363, "rewards/VisualizationJSONCombinedORM/std": 0.25034835934638977, "step": 960, "train_speed(iter/s)": 0.026612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 307.125, "completions/min_length": 247.0, "epoch": 0.7948717948717948, "grad_norm": 0.16924485564231873, "kl": 0.0662841796875, "learning_rate": 1.2281364768991804e-06, "loss": 0.000661022961139679, "memory(GiB)": 39.09, "reward": 0.40536919236183167, "reward_std": 0.06979101896286011, "rewards/VisualizationJSONCombinedORM/mean": 0.40536919236183167, "rewards/VisualizationJSONCombinedORM/std": 0.187258780002594, "step": 961, "train_speed(iter/s)": 0.026621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 299.0625, "completions/min_length": 227.0, "epoch": 0.7956989247311828, "grad_norm": 0.1635429859161377, "kl": 0.030059814453125, "learning_rate": 1.2186747920864993e-06, "loss": 0.00029971450567245483, "memory(GiB)": 39.09, "reward": 0.5247483849525452, "reward_std": 0.04310149326920509, "rewards/VisualizationJSONCombinedORM/mean": 0.5247483849525452, "rewards/VisualizationJSONCombinedORM/std": 0.1993320882320404, "step": 962, "train_speed(iter/s)": 0.026634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 316.25, "completions/min_length": 240.0, "epoch": 0.7965260545905707, "grad_norm": 0.15917757153511047, "kl": 0.04229736328125, "learning_rate": 1.2092446344942165e-06, "loss": 0.00042232125997543335, "memory(GiB)": 39.09, "reward": 0.4124253988265991, "reward_std": 0.07607731968164444, "rewards/VisualizationJSONCombinedORM/mean": 0.4124253988265991, "rewards/VisualizationJSONCombinedORM/std": 0.1520145684480667, "step": 963, "train_speed(iter/s)": 0.026642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 276.8125, "completions/min_length": 215.0, "epoch": 0.7973531844499586, "grad_norm": 0.17794443666934967, "kl": 0.0457763671875, "learning_rate": 1.199846082747323e-06, "loss": 0.0004585385322570801, "memory(GiB)": 39.09, "reward": 0.5167819857597351, "reward_std": 0.06367464363574982, "rewards/VisualizationJSONCombinedORM/mean": 0.5167819857597351, "rewards/VisualizationJSONCombinedORM/std": 0.16996119916439056, "step": 964, "train_speed(iter/s)": 0.026651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 301.6875, "completions/min_length": 252.0, "epoch": 0.7981803143093466, "grad_norm": 0.18616169691085815, "kl": 0.034423828125, "learning_rate": 1.1904792152072914e-06, "loss": 0.0003434717655181885, "memory(GiB)": 39.09, "reward": 0.6155998706817627, "reward_std": 0.16302332282066345, "rewards/VisualizationJSONCombinedORM/mean": 0.6155998706817627, "rewards/VisualizationJSONCombinedORM/std": 0.18785984814167023, "step": 965, "train_speed(iter/s)": 0.026662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 287.75, "completions/min_length": 225.0, "epoch": 0.7990074441687345, "grad_norm": 0.1687197983264923, "kl": 0.05328369140625, "learning_rate": 1.1811441099714232e-06, "loss": 0.0005327276885509491, "memory(GiB)": 39.09, "reward": 0.7096627950668335, "reward_std": 0.11989232897758484, "rewards/VisualizationJSONCombinedORM/mean": 0.7096627950668335, "rewards/VisualizationJSONCombinedORM/std": 0.11912550032138824, "step": 966, "train_speed(iter/s)": 0.02667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 299.625, "completions/min_length": 228.0, "epoch": 0.7998345740281224, "grad_norm": 0.1943158060312271, "kl": 0.04498291015625, "learning_rate": 1.171840844872198e-06, "loss": 0.00045041367411613464, "memory(GiB)": 39.09, "reward": 0.7309154868125916, "reward_std": 0.10465750098228455, "rewards/VisualizationJSONCombinedORM/mean": 0.7309154868125916, "rewards/VisualizationJSONCombinedORM/std": 0.10996538400650024, "step": 967, "train_speed(iter/s)": 0.026679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 302.3125, "completions/min_length": 238.0, "epoch": 0.8006617038875103, "grad_norm": 0.15935373306274414, "kl": 0.037078857421875, "learning_rate": 1.16256949747663e-06, "loss": 0.0003708302974700928, "memory(GiB)": 39.09, "reward": 0.6323527097702026, "reward_std": 0.10322894155979156, "rewards/VisualizationJSONCombinedORM/mean": 0.6323527097702026, "rewards/VisualizationJSONCombinedORM/std": 0.1292533576488495, "step": 968, "train_speed(iter/s)": 0.026687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 296.8125, "completions/min_length": 242.0, "epoch": 0.8014888337468983, "grad_norm": 0.17994654178619385, "kl": 0.04388427734375, "learning_rate": 1.1533301450856054e-06, "loss": 0.000437907874584198, "memory(GiB)": 39.09, "reward": 0.5832387804985046, "reward_std": 0.08041536808013916, "rewards/VisualizationJSONCombinedORM/mean": 0.5832387804985046, "rewards/VisualizationJSONCombinedORM/std": 0.1349005252122879, "step": 969, "train_speed(iter/s)": 0.026696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 290.5, "completions/min_length": 239.0, "epoch": 0.8023159636062862, "grad_norm": 0.19681815803050995, "kl": 0.05645751953125, "learning_rate": 1.1441228647332602e-06, "loss": 0.0005630478262901306, "memory(GiB)": 39.09, "reward": 0.5392558574676514, "reward_std": 0.10619382560253143, "rewards/VisualizationJSONCombinedORM/mean": 0.5392558574676514, "rewards/VisualizationJSONCombinedORM/std": 0.11599757522344589, "step": 970, "train_speed(iter/s)": 0.026707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 297.0, "completions/min_length": 258.0, "epoch": 0.8031430934656741, "grad_norm": 0.16299986839294434, "kl": 0.029571533203125, "learning_rate": 1.134947733186315e-06, "loss": 0.00029530376195907593, "memory(GiB)": 39.09, "reward": 0.3815761208534241, "reward_std": 0.04828953370451927, "rewards/VisualizationJSONCombinedORM/mean": 0.3815761208534241, "rewards/VisualizationJSONCombinedORM/std": 0.21993568539619446, "step": 971, "train_speed(iter/s)": 0.026714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 305.25, "completions/min_length": 252.0, "epoch": 0.8039702233250621, "grad_norm": 0.19126170873641968, "kl": 0.05816650390625, "learning_rate": 1.1258048269434569e-06, "loss": 0.0005813799798488617, "memory(GiB)": 39.09, "reward": 0.6248958110809326, "reward_std": 0.10422472655773163, "rewards/VisualizationJSONCombinedORM/mean": 0.6248958110809326, "rewards/VisualizationJSONCombinedORM/std": 0.1717258244752884, "step": 972, "train_speed(iter/s)": 0.026724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 289.1875, "completions/min_length": 249.0, "epoch": 0.80479735318445, "grad_norm": 0.18850763142108917, "kl": 0.0390625, "learning_rate": 1.1166942222346828e-06, "loss": 0.00039051473140716553, "memory(GiB)": 39.09, "reward": 0.42621031403541565, "reward_std": 0.07478342950344086, "rewards/VisualizationJSONCombinedORM/mean": 0.42621031403541565, "rewards/VisualizationJSONCombinedORM/std": 0.07266490161418915, "step": 973, "train_speed(iter/s)": 0.026732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 320.25, "completions/min_length": 251.0, "epoch": 0.8056244830438379, "grad_norm": 0.18111437559127808, "kl": 0.06756591796875, "learning_rate": 1.1076159950206762e-06, "loss": 0.0006745755672454834, "memory(GiB)": 39.09, "reward": 0.5472505688667297, "reward_std": 0.10680036246776581, "rewards/VisualizationJSONCombinedORM/mean": 0.5472505688667297, "rewards/VisualizationJSONCombinedORM/std": 0.12164057791233063, "step": 974, "train_speed(iter/s)": 0.02674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 307.5, "completions/min_length": 243.0, "epoch": 0.8064516129032258, "grad_norm": 0.21188434958457947, "kl": 0.191162109375, "learning_rate": 1.0985702209921677e-06, "loss": 0.001903928816318512, "memory(GiB)": 39.09, "reward": 0.5032490491867065, "reward_std": 0.11158548295497894, "rewards/VisualizationJSONCombinedORM/mean": 0.5032490491867065, "rewards/VisualizationJSONCombinedORM/std": 0.13736364245414734, "step": 975, "train_speed(iter/s)": 0.02675 }, { "epoch": 0.8064516129032258, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 363.5, "eval_completions/mean_length": 301.796875, "eval_completions/min_length": 245.58333333333334, "eval_kl": 0.050923665364583336, "eval_loss": 0.0005120362038724124, "eval_reward": 0.4865717329084873, "eval_reward_std": 0.0756392259305964, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4865717329084873, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07563922701713939, "eval_runtime": 310.7138, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 268.4375, "completions/min_length": 213.0, "epoch": 0.8072787427626137, "grad_norm": 0.1812172830104828, "kl": 0.04974365234375, "learning_rate": 1.0895569755693076e-06, "loss": 0.0004972070455551147, "memory(GiB)": 39.09, "reward": 0.689007043838501, "reward_std": 0.12348787486553192, "rewards/VisualizationJSONCombinedORM/mean": 0.689007043838501, "rewards/VisualizationJSONCombinedORM/std": 0.12416458129882812, "step": 976, "train_speed(iter/s)": 0.026533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 279.25, "completions/min_length": 232.0, "epoch": 0.8081058726220016, "grad_norm": 0.1852990984916687, "kl": 0.06402587890625, "learning_rate": 1.0805763339010329e-06, "loss": 0.0006399713456630707, "memory(GiB)": 39.09, "reward": 0.45622846484184265, "reward_std": 0.05528771132230759, "rewards/VisualizationJSONCombinedORM/mean": 0.45622846484184265, "rewards/VisualizationJSONCombinedORM/std": 0.22626186907291412, "step": 977, "train_speed(iter/s)": 0.026542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 294.625, "completions/min_length": 243.0, "epoch": 0.8089330024813896, "grad_norm": 0.1892116665840149, "kl": 0.06317138671875, "learning_rate": 1.0716283708644431e-06, "loss": 0.0006335489451885223, "memory(GiB)": 39.09, "reward": 0.6156029105186462, "reward_std": 0.10741639137268066, "rewards/VisualizationJSONCombinedORM/mean": 0.6156029105186462, "rewards/VisualizationJSONCombinedORM/std": 0.16632983088493347, "step": 978, "train_speed(iter/s)": 0.026553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 322.4375, "completions/min_length": 261.0, "epoch": 0.8097601323407775, "grad_norm": 0.18403981626033783, "kl": 0.086669921875, "learning_rate": 1.0627131610641829e-06, "loss": 0.0008665323257446289, "memory(GiB)": 39.09, "reward": 0.5647057890892029, "reward_std": 0.06986890733242035, "rewards/VisualizationJSONCombinedORM/mean": 0.5647057890892029, "rewards/VisualizationJSONCombinedORM/std": 0.19952858984470367, "step": 979, "train_speed(iter/s)": 0.02656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 299.375, "completions/min_length": 237.0, "epoch": 0.8105872622001654, "grad_norm": 0.2176441252231598, "kl": 0.04302978515625, "learning_rate": 1.0538307788318014e-06, "loss": 0.0004315376281738281, "memory(GiB)": 39.09, "reward": 0.49511808156967163, "reward_std": 0.08289165049791336, "rewards/VisualizationJSONCombinedORM/mean": 0.49511808156967163, "rewards/VisualizationJSONCombinedORM/std": 0.08615316450595856, "step": 980, "train_speed(iter/s)": 0.026568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 294.25, "completions/min_length": 231.0, "epoch": 0.8114143920595533, "grad_norm": 0.16655761003494263, "kl": 0.0621337890625, "learning_rate": 1.0449812982251556e-06, "loss": 0.0006209909915924072, "memory(GiB)": 39.09, "reward": 0.3017359972000122, "reward_std": 0.052520785480737686, "rewards/VisualizationJSONCombinedORM/mean": 0.3017359972000122, "rewards/VisualizationJSONCombinedORM/std": 0.051583025604486465, "step": 981, "train_speed(iter/s)": 0.026572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 294.625, "completions/min_length": 235.0, "epoch": 0.8122415219189413, "grad_norm": 0.20191536843776703, "kl": 0.04248046875, "learning_rate": 1.0361647930277719e-06, "loss": 0.00042505189776420593, "memory(GiB)": 39.09, "reward": 0.3747369647026062, "reward_std": 0.061640821397304535, "rewards/VisualizationJSONCombinedORM/mean": 0.3747369647026062, "rewards/VisualizationJSONCombinedORM/std": 0.07976623624563217, "step": 982, "train_speed(iter/s)": 0.02658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 280.375, "completions/min_length": 223.0, "epoch": 0.8130686517783292, "grad_norm": 0.17273254692554474, "kl": 0.05181884765625, "learning_rate": 1.02738133674825e-06, "loss": 0.0005189180374145508, "memory(GiB)": 39.09, "reward": 0.47403213381767273, "reward_std": 0.06835552304983139, "rewards/VisualizationJSONCombinedORM/mean": 0.47403213381767273, "rewards/VisualizationJSONCombinedORM/std": 0.07050327956676483, "step": 983, "train_speed(iter/s)": 0.026587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 318.25, "completions/min_length": 230.0, "epoch": 0.8138957816377171, "grad_norm": 0.17615364491939545, "kl": 0.0570068359375, "learning_rate": 1.01863100261963e-06, "loss": 0.0005704611539840698, "memory(GiB)": 39.09, "reward": 0.501478374004364, "reward_std": 0.1490393877029419, "rewards/VisualizationJSONCombinedORM/mean": 0.501478374004364, "rewards/VisualizationJSONCombinedORM/std": 0.14517663419246674, "step": 984, "train_speed(iter/s)": 0.026592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 316.4375, "completions/min_length": 237.0, "epoch": 0.8147229114971051, "grad_norm": 0.15309497714042664, "kl": 0.0572509765625, "learning_rate": 1.0099138635988026e-06, "loss": 0.0005719475448131561, "memory(GiB)": 39.09, "reward": 0.37784528732299805, "reward_std": 0.039186786860227585, "rewards/VisualizationJSONCombinedORM/mean": 0.37784528732299805, "rewards/VisualizationJSONCombinedORM/std": 0.04062031954526901, "step": 985, "train_speed(iter/s)": 0.026603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 287.5, "completions/min_length": 244.0, "epoch": 0.815550041356493, "grad_norm": 0.18816913664340973, "kl": 0.0657958984375, "learning_rate": 1.0012299923658848e-06, "loss": 0.000657595694065094, "memory(GiB)": 39.09, "reward": 0.42709365487098694, "reward_std": 0.0727391168475151, "rewards/VisualizationJSONCombinedORM/mean": 0.42709365487098694, "rewards/VisualizationJSONCombinedORM/std": 0.10751138627529144, "step": 986, "train_speed(iter/s)": 0.026612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 302.8125, "completions/min_length": 254.0, "epoch": 0.8163771712158809, "grad_norm": 0.19258183240890503, "kl": 0.08160400390625, "learning_rate": 9.925794613236201e-07, "loss": 0.0008163824677467346, "memory(GiB)": 39.09, "reward": 0.47146710753440857, "reward_std": 0.0777897834777832, "rewards/VisualizationJSONCombinedORM/mean": 0.47146710753440857, "rewards/VisualizationJSONCombinedORM/std": 0.090077705681324, "step": 987, "train_speed(iter/s)": 0.02662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 309.125, "completions/min_length": 270.0, "epoch": 0.8172043010752689, "grad_norm": 0.18772290647029877, "kl": 0.060546875, "learning_rate": 9.83962342596776e-07, "loss": 0.0006051436066627502, "memory(GiB)": 39.09, "reward": 0.5349188446998596, "reward_std": 0.09546561539173126, "rewards/VisualizationJSONCombinedORM/mean": 0.5349188446998596, "rewards/VisualizationJSONCombinedORM/std": 0.20556461811065674, "step": 988, "train_speed(iter/s)": 0.026631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 313.0, "completions/min_length": 218.0, "epoch": 0.8180314309346567, "grad_norm": 0.23821032047271729, "kl": 0.0477294921875, "learning_rate": 9.753787080315385e-07, "loss": 0.000476248562335968, "memory(GiB)": 39.09, "reward": 0.5298649072647095, "reward_std": 0.1349419802427292, "rewards/VisualizationJSONCombinedORM/mean": 0.5298649072647095, "rewards/VisualizationJSONCombinedORM/std": 0.14647126197814941, "step": 989, "train_speed(iter/s)": 0.026641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 306.4375, "completions/min_length": 225.0, "epoch": 0.8188585607940446, "grad_norm": 0.1721617579460144, "kl": 0.03692626953125, "learning_rate": 9.668286291949224e-07, "loss": 0.0003699101507663727, "memory(GiB)": 39.09, "reward": 0.38405001163482666, "reward_std": 0.0477546751499176, "rewards/VisualizationJSONCombinedORM/mean": 0.38405001163482666, "rewards/VisualizationJSONCombinedORM/std": 0.1745258867740631, "step": 990, "train_speed(iter/s)": 0.026649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 275.8125, "completions/min_length": 221.0, "epoch": 0.8196856906534326, "grad_norm": 0.15320487320423126, "kl": 0.067138671875, "learning_rate": 9.583121773741571e-07, "loss": 0.0006711743772029877, "memory(GiB)": 39.09, "reward": 0.6223404407501221, "reward_std": 0.08986510336399078, "rewards/VisualizationJSONCombinedORM/mean": 0.6223404407501221, "rewards/VisualizationJSONCombinedORM/std": 0.21279403567314148, "step": 991, "train_speed(iter/s)": 0.026657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 282.875, "completions/min_length": 236.0, "epoch": 0.8205128205128205, "grad_norm": 0.159048393368721, "kl": 0.03704833984375, "learning_rate": 9.498294235761141e-07, "loss": 0.00037115439772605896, "memory(GiB)": 39.09, "reward": 0.4639088213443756, "reward_std": 0.02589123509824276, "rewards/VisualizationJSONCombinedORM/mean": 0.4639088213443756, "rewards/VisualizationJSONCombinedORM/std": 0.2907179892063141, "step": 992, "train_speed(iter/s)": 0.026665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 305.0625, "completions/min_length": 239.0, "epoch": 0.8213399503722084, "grad_norm": 0.20348809659481049, "kl": 0.05767822265625, "learning_rate": 9.41380438526694e-07, "loss": 0.0005763135850429535, "memory(GiB)": 39.09, "reward": 0.5911498665809631, "reward_std": 0.1414870321750641, "rewards/VisualizationJSONCombinedORM/mean": 0.5911498665809631, "rewards/VisualizationJSONCombinedORM/std": 0.14188486337661743, "step": 993, "train_speed(iter/s)": 0.026673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 309.625, "completions/min_length": 252.0, "epoch": 0.8221670802315963, "grad_norm": 0.1971539705991745, "kl": 0.04302978515625, "learning_rate": 9.329652926702559e-07, "loss": 0.0004299059510231018, "memory(GiB)": 39.09, "reward": 0.40283626317977905, "reward_std": 0.062107451260089874, "rewards/VisualizationJSONCombinedORM/mean": 0.40283626317977905, "rewards/VisualizationJSONCombinedORM/std": 0.075241319835186, "step": 994, "train_speed(iter/s)": 0.026682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 311.375, "completions/min_length": 254.0, "epoch": 0.8229942100909843, "grad_norm": 0.14702101051807404, "kl": 0.02264404296875, "learning_rate": 9.245840561690117e-07, "loss": 0.00022674910724163055, "memory(GiB)": 39.09, "reward": 0.6074839234352112, "reward_std": 0.05449815094470978, "rewards/VisualizationJSONCombinedORM/mean": 0.6074839234352112, "rewards/VisualizationJSONCombinedORM/std": 0.14899888634681702, "step": 995, "train_speed(iter/s)": 0.02669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 284.5, "completions/min_length": 255.0, "epoch": 0.8238213399503722, "grad_norm": 0.17205384373664856, "kl": 0.021942138671875, "learning_rate": 9.162367989024584e-07, "loss": 0.000219687819480896, "memory(GiB)": 39.09, "reward": 0.5255715250968933, "reward_std": 0.06954158842563629, "rewards/VisualizationJSONCombinedORM/mean": 0.5255715250968933, "rewards/VisualizationJSONCombinedORM/std": 0.12823918461799622, "step": 996, "train_speed(iter/s)": 0.026699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 323.1875, "completions/min_length": 244.0, "epoch": 0.8246484698097601, "grad_norm": 0.16224981844425201, "kl": 0.046630859375, "learning_rate": 9.079235904667826e-07, "loss": 0.00046716630458831787, "memory(GiB)": 39.09, "reward": 0.6464412212371826, "reward_std": 0.10204155743122101, "rewards/VisualizationJSONCombinedORM/mean": 0.6464412212371826, "rewards/VisualizationJSONCombinedORM/std": 0.10330688953399658, "step": 997, "train_speed(iter/s)": 0.026704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 326.0, "completions/min_length": 266.0, "epoch": 0.8254755996691481, "grad_norm": 0.16746994853019714, "kl": 0.05810546875, "learning_rate": 8.996445001742871e-07, "loss": 0.0005809571593999863, "memory(GiB)": 39.09, "reward": 0.5614629983901978, "reward_std": 0.052223023027181625, "rewards/VisualizationJSONCombinedORM/mean": 0.5614629983901978, "rewards/VisualizationJSONCombinedORM/std": 0.07752644270658493, "step": 998, "train_speed(iter/s)": 0.02671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 286.0, "completions/min_length": 238.0, "epoch": 0.826302729528536, "grad_norm": 0.18534483015537262, "kl": 0.0516357421875, "learning_rate": 8.913995970528089e-07, "loss": 0.000516250729560852, "memory(GiB)": 39.09, "reward": 0.25105491280555725, "reward_std": 0.035226523876190186, "rewards/VisualizationJSONCombinedORM/mean": 0.25105491280555725, "rewards/VisualizationJSONCombinedORM/std": 0.04070214182138443, "step": 999, "train_speed(iter/s)": 0.02672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 301.5, "completions/min_length": 250.0, "epoch": 0.8271298593879239, "grad_norm": 0.18212254345417023, "kl": 0.064697265625, "learning_rate": 8.831889498451474e-07, "loss": 0.0006498955190181732, "memory(GiB)": 39.09, "reward": 0.5394583940505981, "reward_std": 0.10387606918811798, "rewards/VisualizationJSONCombinedORM/mean": 0.5394583940505981, "rewards/VisualizationJSONCombinedORM/std": 0.10879886150360107, "step": 1000, "train_speed(iter/s)": 0.026732 }, { "epoch": 0.8271298593879239, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 352.5416666666667, "eval_completions/mean_length": 302.7239583333333, "eval_completions/min_length": 252.375, "eval_kl": 0.044179280598958336, "eval_loss": 0.0004411078989505768, "eval_reward": 0.4612639310459296, "eval_reward_std": 0.07638041713895898, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4612639310459296, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07638041970009606, "eval_runtime": 304.1582, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 303.4375, "completions/min_length": 235.0, "epoch": 0.8279569892473119, "grad_norm": 0.20162898302078247, "kl": 0.0443115234375, "learning_rate": 8.750126270084891e-07, "loss": 0.00044307950884103775, "memory(GiB)": 39.09, "reward": 0.2610708475112915, "reward_std": 0.048765163868665695, "rewards/VisualizationJSONCombinedORM/mean": 0.2610708475112915, "rewards/VisualizationJSONCombinedORM/std": 0.06242978572845459, "step": 1001, "train_speed(iter/s)": 0.026527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 267.6875, "completions/min_length": 231.0, "epoch": 0.8287841191066998, "grad_norm": 0.19708728790283203, "kl": 0.0347900390625, "learning_rate": 8.668706967138363e-07, "loss": 0.000348113477230072, "memory(GiB)": 39.09, "reward": 0.5740796327590942, "reward_std": 0.08268114924430847, "rewards/VisualizationJSONCombinedORM/mean": 0.5740796327590942, "rewards/VisualizationJSONCombinedORM/std": 0.17156867682933807, "step": 1002, "train_speed(iter/s)": 0.026537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 322.3125, "completions/min_length": 256.0, "epoch": 0.8296112489660876, "grad_norm": 0.15548335015773773, "kl": 0.04022216796875, "learning_rate": 8.587632268454405e-07, "loss": 0.0004016868770122528, "memory(GiB)": 39.09, "reward": 0.4406937062740326, "reward_std": 0.06037100404500961, "rewards/VisualizationJSONCombinedORM/mean": 0.4406937062740326, "rewards/VisualizationJSONCombinedORM/std": 0.13739913702011108, "step": 1003, "train_speed(iter/s)": 0.026544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 320.4375, "completions/min_length": 236.0, "epoch": 0.8304383788254756, "grad_norm": 0.19570906460285187, "kl": 0.050048828125, "learning_rate": 8.506902850002358e-07, "loss": 0.0005005598068237305, "memory(GiB)": 39.09, "reward": 0.48582005500793457, "reward_std": 0.08833354711532593, "rewards/VisualizationJSONCombinedORM/mean": 0.48582005500793457, "rewards/VisualizationJSONCombinedORM/std": 0.11229635775089264, "step": 1004, "train_speed(iter/s)": 0.026553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 321.8125, "completions/min_length": 263.0, "epoch": 0.8312655086848635, "grad_norm": 0.2011270672082901, "kl": 0.06195068359375, "learning_rate": 8.426519384872733e-07, "loss": 0.0006194338202476501, "memory(GiB)": 39.09, "reward": 0.46276623010635376, "reward_std": 0.0678367093205452, "rewards/VisualizationJSONCombinedORM/mean": 0.46276623010635376, "rewards/VisualizationJSONCombinedORM/std": 0.22271454334259033, "step": 1005, "train_speed(iter/s)": 0.026559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 320.5, "completions/min_length": 232.0, "epoch": 0.8320926385442514, "grad_norm": 0.19815799593925476, "kl": 0.041748046875, "learning_rate": 8.346482543271656e-07, "loss": 0.00041710957884788513, "memory(GiB)": 39.09, "reward": 0.5889570713043213, "reward_std": 0.10894021391868591, "rewards/VisualizationJSONCombinedORM/mean": 0.5889570713043213, "rewards/VisualizationJSONCombinedORM/std": 0.19165197014808655, "step": 1006, "train_speed(iter/s)": 0.026566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 293.25, "completions/min_length": 236.0, "epoch": 0.8329197684036393, "grad_norm": 0.19497177004814148, "kl": 0.05242919921875, "learning_rate": 8.266792992515199e-07, "loss": 0.0005256086587905884, "memory(GiB)": 39.09, "reward": 0.4835922420024872, "reward_std": 0.07799416780471802, "rewards/VisualizationJSONCombinedORM/mean": 0.4835922420024872, "rewards/VisualizationJSONCombinedORM/std": 0.164476677775383, "step": 1007, "train_speed(iter/s)": 0.026572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 286.8125, "completions/min_length": 201.0, "epoch": 0.8337468982630273, "grad_norm": 0.1569284349679947, "kl": 0.027862548828125, "learning_rate": 8.187451397023877e-07, "loss": 0.00027856044471263885, "memory(GiB)": 39.09, "reward": 0.5808077454566956, "reward_std": 0.06876342743635178, "rewards/VisualizationJSONCombinedORM/mean": 0.5808077454566956, "rewards/VisualizationJSONCombinedORM/std": 0.12774215638637543, "step": 1008, "train_speed(iter/s)": 0.02658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 308.5625, "completions/min_length": 251.0, "epoch": 0.8345740281224152, "grad_norm": 0.19084803760051727, "kl": 0.05340576171875, "learning_rate": 8.108458418317089e-07, "loss": 0.0005351752042770386, "memory(GiB)": 39.09, "reward": 0.42269453406333923, "reward_std": 0.0792817547917366, "rewards/VisualizationJSONCombinedORM/mean": 0.42269453406333923, "rewards/VisualizationJSONCombinedORM/std": 0.1257520616054535, "step": 1009, "train_speed(iter/s)": 0.02659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 280.875, "completions/min_length": 228.0, "epoch": 0.8354011579818031, "grad_norm": 0.2468474805355072, "kl": 0.0333251953125, "learning_rate": 8.029814715007589e-07, "loss": 0.0003322707489132881, "memory(GiB)": 39.09, "reward": 0.36764416098594666, "reward_std": 0.09294769167900085, "rewards/VisualizationJSONCombinedORM/mean": 0.36764416098594666, "rewards/VisualizationJSONCombinedORM/std": 0.10649053752422333, "step": 1010, "train_speed(iter/s)": 0.026599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 316.875, "completions/min_length": 234.0, "epoch": 0.8362282878411911, "grad_norm": 0.2011031210422516, "kl": 0.02777099609375, "learning_rate": 7.951520942796026e-07, "loss": 0.0002776309847831726, "memory(GiB)": 39.09, "reward": 0.6024690270423889, "reward_std": 0.12348000705242157, "rewards/VisualizationJSONCombinedORM/mean": 0.6024690270423889, "rewards/VisualizationJSONCombinedORM/std": 0.12392807751893997, "step": 1011, "train_speed(iter/s)": 0.026605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 344.4375, "completions/min_length": 282.0, "epoch": 0.837055417700579, "grad_norm": 0.22017142176628113, "kl": 0.0560302734375, "learning_rate": 7.873577754465456e-07, "loss": 0.0005594752728939056, "memory(GiB)": 39.09, "reward": 0.5637142658233643, "reward_std": 0.11187498271465302, "rewards/VisualizationJSONCombinedORM/mean": 0.5637142658233643, "rewards/VisualizationJSONCombinedORM/std": 0.11902409791946411, "step": 1012, "train_speed(iter/s)": 0.026614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 278.8125, "completions/min_length": 215.0, "epoch": 0.8378825475599669, "grad_norm": 0.17277397215366364, "kl": 0.035308837890625, "learning_rate": 7.7959857998759e-07, "loss": 0.0003531612455844879, "memory(GiB)": 39.09, "reward": 0.4835584759712219, "reward_std": 0.05878007411956787, "rewards/VisualizationJSONCombinedORM/mean": 0.4835584759712219, "rewards/VisualizationJSONCombinedORM/std": 0.07063458114862442, "step": 1013, "train_speed(iter/s)": 0.026622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 320.125, "completions/min_length": 255.0, "epoch": 0.8387096774193549, "grad_norm": 0.17708337306976318, "kl": 0.022979736328125, "learning_rate": 7.718745725958914e-07, "loss": 0.00022943317890167236, "memory(GiB)": 39.09, "reward": 0.35386455059051514, "reward_std": 0.07329145073890686, "rewards/VisualizationJSONCombinedORM/mean": 0.35386455059051514, "rewards/VisualizationJSONCombinedORM/std": 0.19436509907245636, "step": 1014, "train_speed(iter/s)": 0.02663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 310.625, "completions/min_length": 237.0, "epoch": 0.8395368072787428, "grad_norm": 0.16368485987186432, "kl": 0.043975830078125, "learning_rate": 7.641858176712241e-07, "loss": 0.00043971091508865356, "memory(GiB)": 39.09, "reward": 0.4032381772994995, "reward_std": 0.04986265301704407, "rewards/VisualizationJSONCombinedORM/mean": 0.4032381772994995, "rewards/VisualizationJSONCombinedORM/std": 0.14683309197425842, "step": 1015, "train_speed(iter/s)": 0.026638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 302.75, "completions/min_length": 245.0, "epoch": 0.8403639371381307, "grad_norm": 0.1693601757287979, "kl": 0.04132080078125, "learning_rate": 7.565323793194373e-07, "loss": 0.000413370318710804, "memory(GiB)": 39.09, "reward": 0.3908531367778778, "reward_std": 0.05785466730594635, "rewards/VisualizationJSONCombinedORM/mean": 0.3908531367778778, "rewards/VisualizationJSONCombinedORM/std": 0.061078354716300964, "step": 1016, "train_speed(iter/s)": 0.026647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 298.75, "completions/min_length": 256.0, "epoch": 0.8411910669975186, "grad_norm": 0.20859992504119873, "kl": 0.04339599609375, "learning_rate": 7.489143213519301e-07, "loss": 0.00043314695358276367, "memory(GiB)": 39.09, "reward": 0.38386720418930054, "reward_std": 0.07372366636991501, "rewards/VisualizationJSONCombinedORM/mean": 0.38386720418930054, "rewards/VisualizationJSONCombinedORM/std": 0.11683420836925507, "step": 1017, "train_speed(iter/s)": 0.026656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 330.8125, "completions/min_length": 257.0, "epoch": 0.8420181968569065, "grad_norm": 0.17589181661605835, "kl": 0.080078125, "learning_rate": 7.413317072851051e-07, "loss": 0.0007992908358573914, "memory(GiB)": 39.09, "reward": 0.6659958362579346, "reward_std": 0.1136479377746582, "rewards/VisualizationJSONCombinedORM/mean": 0.6659958362579346, "rewards/VisualizationJSONCombinedORM/std": 0.11320392042398453, "step": 1018, "train_speed(iter/s)": 0.026661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 320.5625, "completions/min_length": 243.0, "epoch": 0.8428453267162944, "grad_norm": 0.17171935737133026, "kl": 0.06072998046875, "learning_rate": 7.337846003398568e-07, "loss": 0.0006081201136112213, "memory(GiB)": 39.09, "reward": 0.5612122416496277, "reward_std": 0.08174585551023483, "rewards/VisualizationJSONCombinedORM/mean": 0.5612122416496277, "rewards/VisualizationJSONCombinedORM/std": 0.18477556109428406, "step": 1019, "train_speed(iter/s)": 0.02667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 282.3125, "completions/min_length": 242.0, "epoch": 0.8436724565756824, "grad_norm": 0.17187350988388062, "kl": 0.02313232421875, "learning_rate": 7.262730634410259e-07, "loss": 0.0002309754490852356, "memory(GiB)": 39.09, "reward": 0.45917820930480957, "reward_std": 0.09150642156600952, "rewards/VisualizationJSONCombinedORM/mean": 0.45917820930480957, "rewards/VisualizationJSONCombinedORM/std": 0.19318623840808868, "step": 1020, "train_speed(iter/s)": 0.026682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 281.4375, "completions/min_length": 226.0, "epoch": 0.8444995864350703, "grad_norm": 0.15075768530368805, "kl": 0.0323486328125, "learning_rate": 7.187971592168936e-07, "loss": 0.00032334402203559875, "memory(GiB)": 39.09, "reward": 0.7396902441978455, "reward_std": 0.07099458575248718, "rewards/VisualizationJSONCombinedORM/mean": 0.7396902441978455, "rewards/VisualizationJSONCombinedORM/std": 0.10809776932001114, "step": 1021, "train_speed(iter/s)": 0.02669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 290.5, "completions/min_length": 235.0, "epoch": 0.8453267162944582, "grad_norm": 0.13750579953193665, "kl": 0.05621337890625, "learning_rate": 7.113569499986401e-07, "loss": 0.0005629695951938629, "memory(GiB)": 39.09, "reward": 0.28896281123161316, "reward_std": 0.040016256272792816, "rewards/VisualizationJSONCombinedORM/mean": 0.28896281123161316, "rewards/VisualizationJSONCombinedORM/std": 0.10423850268125534, "step": 1022, "train_speed(iter/s)": 0.026699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 318.1875, "completions/min_length": 238.0, "epoch": 0.8461538461538461, "grad_norm": 0.16015776991844177, "kl": 0.0506591796875, "learning_rate": 7.039524978198414e-07, "loss": 0.0005058683454990387, "memory(GiB)": 39.09, "reward": 0.5598673820495605, "reward_std": 0.07406429946422577, "rewards/VisualizationJSONCombinedORM/mean": 0.5598673820495605, "rewards/VisualizationJSONCombinedORM/std": 0.27237793803215027, "step": 1023, "train_speed(iter/s)": 0.026707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 276.1875, "completions/min_length": 222.0, "epoch": 0.8469809760132341, "grad_norm": 0.15579591691493988, "kl": 0.0379638671875, "learning_rate": 6.965838644159434e-07, "loss": 0.0003790808841586113, "memory(GiB)": 39.09, "reward": 0.6231426000595093, "reward_std": 0.06963001191616058, "rewards/VisualizationJSONCombinedORM/mean": 0.6231426000595093, "rewards/VisualizationJSONCombinedORM/std": 0.09750393778085709, "step": 1024, "train_speed(iter/s)": 0.026715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 288.25, "completions/min_length": 230.0, "epoch": 0.847808105872622, "grad_norm": 0.1672288328409195, "kl": 0.03558349609375, "learning_rate": 6.892511112237472e-07, "loss": 0.000356040894985199, "memory(GiB)": 39.09, "reward": 0.5752550363540649, "reward_std": 0.08285659551620483, "rewards/VisualizationJSONCombinedORM/mean": 0.5752550363540649, "rewards/VisualizationJSONCombinedORM/std": 0.08339647203683853, "step": 1025, "train_speed(iter/s)": 0.026724 }, { "epoch": 0.847808105872622, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 373.625, "eval_completions/mean_length": 308.8489583333333, "eval_completions/min_length": 256.0416666666667, "eval_kl": 0.0437164306640625, "eval_loss": 0.00044011822319589555, "eval_reward": 0.4555728081613779, "eval_reward_std": 0.07245119591243565, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4555728081613779, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07245119924967487, "eval_runtime": 316.5968, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 310.375, "completions/min_length": 216.0, "epoch": 0.8486352357320099, "grad_norm": 0.18533270061016083, "kl": 0.041748046875, "learning_rate": 6.819542993809003e-07, "loss": 0.0004178732633590698, "memory(GiB)": 39.09, "reward": 0.44267964363098145, "reward_std": 0.08122897148132324, "rewards/VisualizationJSONCombinedORM/mean": 0.44267964363098145, "rewards/VisualizationJSONCombinedORM/std": 0.15429383516311646, "step": 1026, "train_speed(iter/s)": 0.026514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 296.9375, "completions/min_length": 248.0, "epoch": 0.8494623655913979, "grad_norm": 0.1382913589477539, "kl": 0.044586181640625, "learning_rate": 6.746934897253832e-07, "loss": 0.0004456751048564911, "memory(GiB)": 39.09, "reward": 0.5686290264129639, "reward_std": 0.10240467637777328, "rewards/VisualizationJSONCombinedORM/mean": 0.5686290264129639, "rewards/VisualizationJSONCombinedORM/std": 0.1520315408706665, "step": 1027, "train_speed(iter/s)": 0.026524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 301.75, "completions/min_length": 230.0, "epoch": 0.8502894954507858, "grad_norm": 0.17169100046157837, "kl": 0.0416259765625, "learning_rate": 6.6746874279501e-07, "loss": 0.00041630491614341736, "memory(GiB)": 39.09, "reward": 0.6067339777946472, "reward_std": 0.12576444447040558, "rewards/VisualizationJSONCombinedORM/mean": 0.6067339777946472, "rewards/VisualizationJSONCombinedORM/std": 0.1482764631509781, "step": 1028, "train_speed(iter/s)": 0.026534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 316.0, "completions/min_length": 246.0, "epoch": 0.8511166253101737, "grad_norm": 0.18590018153190613, "kl": 0.0460205078125, "learning_rate": 6.602801188269081e-07, "loss": 0.0004599429666996002, "memory(GiB)": 39.09, "reward": 0.6378060579299927, "reward_std": 0.08465894311666489, "rewards/VisualizationJSONCombinedORM/mean": 0.6378060579299927, "rewards/VisualizationJSONCombinedORM/std": 0.22180934250354767, "step": 1029, "train_speed(iter/s)": 0.026541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 291.625, "completions/min_length": 238.0, "epoch": 0.8519437551695617, "grad_norm": 0.16424155235290527, "kl": 0.039459228515625, "learning_rate": 6.531276777570361e-07, "loss": 0.000394970178604126, "memory(GiB)": 39.09, "reward": 0.6326309442520142, "reward_std": 0.09402001649141312, "rewards/VisualizationJSONCombinedORM/mean": 0.6326309442520142, "rewards/VisualizationJSONCombinedORM/std": 0.2562156021595001, "step": 1030, "train_speed(iter/s)": 0.026549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 315.375, "completions/min_length": 250.0, "epoch": 0.8527708850289496, "grad_norm": 0.18194113671779633, "kl": 0.044677734375, "learning_rate": 6.460114792196642e-07, "loss": 0.00044653937220573425, "memory(GiB)": 39.09, "reward": 0.6742037534713745, "reward_std": 0.08882513642311096, "rewards/VisualizationJSONCombinedORM/mean": 0.6742037534713745, "rewards/VisualizationJSONCombinedORM/std": 0.09368375688791275, "step": 1031, "train_speed(iter/s)": 0.026555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 314.0, "completions/min_length": 246.0, "epoch": 0.8535980148883374, "grad_norm": 0.17931734025478363, "kl": 0.033355712890625, "learning_rate": 6.38931582546895e-07, "loss": 0.0003327019512653351, "memory(GiB)": 39.09, "reward": 0.3304554224014282, "reward_std": 0.038299836218357086, "rewards/VisualizationJSONCombinedORM/mean": 0.3304554224014282, "rewards/VisualizationJSONCombinedORM/std": 0.04217635840177536, "step": 1032, "train_speed(iter/s)": 0.026564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 318.8125, "completions/min_length": 222.0, "epoch": 0.8544251447477254, "grad_norm": 0.15642718970775604, "kl": 0.038818359375, "learning_rate": 6.318880467681527e-07, "loss": 0.0003875661641359329, "memory(GiB)": 39.09, "reward": 0.6196478605270386, "reward_std": 0.08353380858898163, "rewards/VisualizationJSONCombinedORM/mean": 0.6196478605270386, "rewards/VisualizationJSONCombinedORM/std": 0.0834316685795784, "step": 1033, "train_speed(iter/s)": 0.026574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 294.125, "completions/min_length": 232.0, "epoch": 0.8552522746071133, "grad_norm": 0.1727413833141327, "kl": 0.02166748046875, "learning_rate": 6.248809306097036e-07, "loss": 0.00021661818027496338, "memory(GiB)": 39.09, "reward": 0.45487383008003235, "reward_std": 0.0821424275636673, "rewards/VisualizationJSONCombinedORM/mean": 0.45487383008003235, "rewards/VisualizationJSONCombinedORM/std": 0.22546596825122833, "step": 1034, "train_speed(iter/s)": 0.026583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 323.5625, "completions/min_length": 273.0, "epoch": 0.8560794044665012, "grad_norm": 0.1761455088853836, "kl": 0.029998779296875, "learning_rate": 6.179102924941599e-07, "loss": 0.0002992674708366394, "memory(GiB)": 39.09, "reward": 0.4062085449695587, "reward_std": 0.08389472961425781, "rewards/VisualizationJSONCombinedORM/mean": 0.4062085449695587, "rewards/VisualizationJSONCombinedORM/std": 0.11130858212709427, "step": 1035, "train_speed(iter/s)": 0.026591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 299.3125, "completions/min_length": 230.0, "epoch": 0.8569065343258891, "grad_norm": 0.2210129052400589, "kl": 0.04766845703125, "learning_rate": 6.10976190539993e-07, "loss": 0.00047704577445983887, "memory(GiB)": 39.09, "reward": 0.4527001976966858, "reward_std": 0.10451510548591614, "rewards/VisualizationJSONCombinedORM/mean": 0.4527001976966858, "rewards/VisualizationJSONCombinedORM/std": 0.10679007321596146, "step": 1036, "train_speed(iter/s)": 0.026601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 289.875, "completions/min_length": 224.0, "epoch": 0.8577336641852771, "grad_norm": 0.1856192648410797, "kl": 0.036865234375, "learning_rate": 6.040786825610518e-07, "loss": 0.0003693923354148865, "memory(GiB)": 39.09, "reward": 0.6847418546676636, "reward_std": 0.0713631808757782, "rewards/VisualizationJSONCombinedORM/mean": 0.6847418546676636, "rewards/VisualizationJSONCombinedORM/std": 0.07933342456817627, "step": 1037, "train_speed(iter/s)": 0.026611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 276.625, "completions/min_length": 212.0, "epoch": 0.858560794044665, "grad_norm": 0.1709088832139969, "kl": 0.07818603515625, "learning_rate": 5.972178260660771e-07, "loss": 0.0007817521691322327, "memory(GiB)": 39.09, "reward": 0.4547143578529358, "reward_std": 0.06604770570993423, "rewards/VisualizationJSONCombinedORM/mean": 0.4547143578529358, "rewards/VisualizationJSONCombinedORM/std": 0.16866418719291687, "step": 1038, "train_speed(iter/s)": 0.026618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 298.625, "completions/min_length": 254.0, "epoch": 0.8593879239040529, "grad_norm": 0.1638859659433365, "kl": 0.03955078125, "learning_rate": 5.903936782582253e-07, "loss": 0.0003955999854952097, "memory(GiB)": 39.09, "reward": 0.7202200889587402, "reward_std": 0.08077925443649292, "rewards/VisualizationJSONCombinedORM/mean": 0.7202200889587402, "rewards/VisualizationJSONCombinedORM/std": 0.08119533956050873, "step": 1039, "train_speed(iter/s)": 0.026629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 297.9375, "completions/min_length": 252.0, "epoch": 0.8602150537634409, "grad_norm": 0.1649300456047058, "kl": 0.04217529296875, "learning_rate": 5.836062960345878e-07, "loss": 0.0004230812191963196, "memory(GiB)": 39.09, "reward": 0.39255082607269287, "reward_std": 0.085117869079113, "rewards/VisualizationJSONCombinedORM/mean": 0.39255082607269287, "rewards/VisualizationJSONCombinedORM/std": 0.14689622819423676, "step": 1040, "train_speed(iter/s)": 0.026634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 284.5625, "completions/min_length": 236.0, "epoch": 0.8610421836228288, "grad_norm": 0.1555236130952835, "kl": 0.035888671875, "learning_rate": 5.768557359857241e-07, "loss": 0.00035896897315979004, "memory(GiB)": 39.09, "reward": 0.6264029741287231, "reward_std": 0.09125954657793045, "rewards/VisualizationJSONCombinedORM/mean": 0.6264029741287231, "rewards/VisualizationJSONCombinedORM/std": 0.14792996644973755, "step": 1041, "train_speed(iter/s)": 0.026642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 293.1875, "completions/min_length": 200.0, "epoch": 0.8618693134822167, "grad_norm": 0.2581319510936737, "kl": 0.05474853515625, "learning_rate": 5.701420543951757e-07, "loss": 0.0005476083606481552, "memory(GiB)": 39.09, "reward": 0.4198698401451111, "reward_std": 0.046357277780771255, "rewards/VisualizationJSONCombinedORM/mean": 0.4198698401451111, "rewards/VisualizationJSONCombinedORM/std": 0.14357775449752808, "step": 1042, "train_speed(iter/s)": 0.02665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 307.0625, "completions/min_length": 236.0, "epoch": 0.8626964433416047, "grad_norm": 0.18123747408390045, "kl": 0.0379638671875, "learning_rate": 5.634653072390167e-07, "loss": 0.00037994980812072754, "memory(GiB)": 39.09, "reward": 0.5167089104652405, "reward_std": 0.08925658464431763, "rewards/VisualizationJSONCombinedORM/mean": 0.5167089104652405, "rewards/VisualizationJSONCombinedORM/std": 0.12479742616415024, "step": 1043, "train_speed(iter/s)": 0.026658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 304.1875, "completions/min_length": 217.0, "epoch": 0.8635235732009926, "grad_norm": 0.17940373718738556, "kl": 0.033935546875, "learning_rate": 5.568255501853664e-07, "loss": 0.00033930689096450806, "memory(GiB)": 39.09, "reward": 0.6152209639549255, "reward_std": 0.08236373960971832, "rewards/VisualizationJSONCombinedORM/mean": 0.6152209639549255, "rewards/VisualizationJSONCombinedORM/std": 0.1593509167432785, "step": 1044, "train_speed(iter/s)": 0.026667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 312.75, "completions/min_length": 242.0, "epoch": 0.8643507030603805, "grad_norm": 0.2091139853000641, "kl": 0.041748046875, "learning_rate": 5.502228385939418e-07, "loss": 0.0004180893301963806, "memory(GiB)": 39.09, "reward": 0.6759440898895264, "reward_std": 0.11564630270004272, "rewards/VisualizationJSONCombinedORM/mean": 0.6759440898895264, "rewards/VisualizationJSONCombinedORM/std": 0.1231483519077301, "step": 1045, "train_speed(iter/s)": 0.026678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 288.1875, "completions/min_length": 234.0, "epoch": 0.8651778329197684, "grad_norm": 0.19812647998332977, "kl": 0.052978515625, "learning_rate": 5.43657227515586e-07, "loss": 0.0005290880799293518, "memory(GiB)": 39.09, "reward": 0.5403398871421814, "reward_std": 0.08726910501718521, "rewards/VisualizationJSONCombinedORM/mean": 0.5403398871421814, "rewards/VisualizationJSONCombinedORM/std": 0.09161459654569626, "step": 1046, "train_speed(iter/s)": 0.026689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 296.8125, "completions/min_length": 235.0, "epoch": 0.8660049627791563, "grad_norm": 0.15011478960514069, "kl": 0.0443115234375, "learning_rate": 5.371287716918128e-07, "loss": 0.0004435572773218155, "memory(GiB)": 39.09, "reward": 0.6436033844947815, "reward_std": 0.11919227242469788, "rewards/VisualizationJSONCombinedORM/mean": 0.6436033844947815, "rewards/VisualizationJSONCombinedORM/std": 0.16684669256210327, "step": 1047, "train_speed(iter/s)": 0.026698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 258.125, "completions/min_length": 221.0, "epoch": 0.8668320926385442, "grad_norm": 0.20973850786685944, "kl": 0.0631103515625, "learning_rate": 5.306375255543511e-07, "loss": 0.0006317421793937683, "memory(GiB)": 39.09, "reward": 0.4175848066806793, "reward_std": 0.07400806248188019, "rewards/VisualizationJSONCombinedORM/mean": 0.4175848066806793, "rewards/VisualizationJSONCombinedORM/std": 0.1968460977077484, "step": 1048, "train_speed(iter/s)": 0.026707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 285.8125, "completions/min_length": 226.0, "epoch": 0.8676592224979321, "grad_norm": 0.17534704506397247, "kl": 0.03900146484375, "learning_rate": 5.241835432246888e-07, "loss": 0.0003895629197359085, "memory(GiB)": 39.09, "reward": 0.34387752413749695, "reward_std": 0.06175510585308075, "rewards/VisualizationJSONCombinedORM/mean": 0.34387752413749695, "rewards/VisualizationJSONCombinedORM/std": 0.11189165711402893, "step": 1049, "train_speed(iter/s)": 0.026715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 302.375, "completions/min_length": 227.0, "epoch": 0.8684863523573201, "grad_norm": 0.19093671441078186, "kl": 0.04345703125, "learning_rate": 5.177668785136225e-07, "loss": 0.0004348158836364746, "memory(GiB)": 39.09, "reward": 0.6177617907524109, "reward_std": 0.1193464919924736, "rewards/VisualizationJSONCombinedORM/mean": 0.6177617907524109, "rewards/VisualizationJSONCombinedORM/std": 0.11935925483703613, "step": 1050, "train_speed(iter/s)": 0.026723 }, { "epoch": 0.8684863523573201, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 364.8333333333333, "eval_completions/mean_length": 305.2760416666667, "eval_completions/min_length": 256.625, "eval_kl": 0.0458984375, "eval_loss": 0.0004591892065946013, "eval_reward": 0.4575418389091889, "eval_reward_std": 0.08641276066191494, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4575418389091889, "eval_rewards/VisualizationJSONCombinedORM/std": 0.08641276345588267, "eval_runtime": 311.5669, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 295.0, "completions/min_length": 240.0, "epoch": 0.869313482216708, "grad_norm": 0.16908065974712372, "kl": 0.052459716796875, "learning_rate": 5.1138758492081e-07, "loss": 0.0005256161093711853, "memory(GiB)": 39.09, "reward": 0.33074504137039185, "reward_std": 0.04758540913462639, "rewards/VisualizationJSONCombinedORM/mean": 0.33074504137039185, "rewards/VisualizationJSONCombinedORM/std": 0.049012191593647, "step": 1051, "train_speed(iter/s)": 0.026518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 283.1875, "completions/min_length": 236.0, "epoch": 0.8701406120760959, "grad_norm": 0.14336444437503815, "kl": 0.033447265625, "learning_rate": 5.050457156343225e-07, "loss": 0.0003336407244205475, "memory(GiB)": 39.09, "reward": 0.5054800510406494, "reward_std": 0.06249300017952919, "rewards/VisualizationJSONCombinedORM/mean": 0.5054800510406494, "rewards/VisualizationJSONCombinedORM/std": 0.1722608506679535, "step": 1052, "train_speed(iter/s)": 0.026531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 298.75, "completions/min_length": 233.0, "epoch": 0.8709677419354839, "grad_norm": 0.16373994946479797, "kl": 0.0745849609375, "learning_rate": 4.987413235302025e-07, "loss": 0.000745922327041626, "memory(GiB)": 39.09, "reward": 0.372549831867218, "reward_std": 0.07808254659175873, "rewards/VisualizationJSONCombinedORM/mean": 0.372549831867218, "rewards/VisualizationJSONCombinedORM/std": 0.08031246066093445, "step": 1053, "train_speed(iter/s)": 0.026537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 301.75, "completions/min_length": 241.0, "epoch": 0.8717948717948718, "grad_norm": 0.17075127363204956, "kl": 0.031707763671875, "learning_rate": 4.924744611720201e-07, "loss": 0.00031712278723716736, "memory(GiB)": 39.09, "reward": 0.402246356010437, "reward_std": 0.05057626962661743, "rewards/VisualizationJSONCombinedORM/mean": 0.402246356010437, "rewards/VisualizationJSONCombinedORM/std": 0.07209806144237518, "step": 1054, "train_speed(iter/s)": 0.026547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 290.0, "completions/min_length": 214.0, "epoch": 0.8726220016542597, "grad_norm": 0.1609923243522644, "kl": 0.06280517578125, "learning_rate": 4.862451808104419e-07, "loss": 0.0006278157234191895, "memory(GiB)": 39.09, "reward": 0.5959782004356384, "reward_std": 0.10343754291534424, "rewards/VisualizationJSONCombinedORM/mean": 0.5959782004356384, "rewards/VisualizationJSONCombinedORM/std": 0.12458617240190506, "step": 1055, "train_speed(iter/s)": 0.026553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 297.0, "completions/min_length": 228.0, "epoch": 0.8734491315136477, "grad_norm": 0.17990437150001526, "kl": 0.03582763671875, "learning_rate": 4.800535343827834e-07, "loss": 0.0003580581396818161, "memory(GiB)": 39.09, "reward": 0.5779972076416016, "reward_std": 0.10255438834428787, "rewards/VisualizationJSONCombinedORM/mean": 0.5779972076416016, "rewards/VisualizationJSONCombinedORM/std": 0.15718044340610504, "step": 1056, "train_speed(iter/s)": 0.026561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 299.0625, "completions/min_length": 224.0, "epoch": 0.8742762613730356, "grad_norm": 0.19387196004390717, "kl": 0.0333251953125, "learning_rate": 4.738995735125895e-07, "loss": 0.0003324970602989197, "memory(GiB)": 39.09, "reward": 0.46306493878364563, "reward_std": 0.09779994189739227, "rewards/VisualizationJSONCombinedORM/mean": 0.46306493878364563, "rewards/VisualizationJSONCombinedORM/std": 0.10666446387767792, "step": 1057, "train_speed(iter/s)": 0.026569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 316.125, "completions/min_length": 248.0, "epoch": 0.8751033912324235, "grad_norm": 0.1758423149585724, "kl": 0.060302734375, "learning_rate": 4.677833495091949e-07, "loss": 0.0006025917828083038, "memory(GiB)": 39.09, "reward": 0.4737606644630432, "reward_std": 0.06766726821660995, "rewards/VisualizationJSONCombinedORM/mean": 0.4737606644630432, "rewards/VisualizationJSONCombinedORM/std": 0.11374392360448837, "step": 1058, "train_speed(iter/s)": 0.026575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 276.25, "completions/min_length": 235.0, "epoch": 0.8759305210918115, "grad_norm": 0.17244423925876617, "kl": 0.023834228515625, "learning_rate": 4.6170491336729794e-07, "loss": 0.00023880600929260254, "memory(GiB)": 39.09, "reward": 0.5774863958358765, "reward_std": 0.12040305882692337, "rewards/VisualizationJSONCombinedORM/mean": 0.5774863958358765, "rewards/VisualizationJSONCombinedORM/std": 0.17226438224315643, "step": 1059, "train_speed(iter/s)": 0.026586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 299.9375, "completions/min_length": 259.0, "epoch": 0.8767576509511993, "grad_norm": 0.1647498905658722, "kl": 0.0281982421875, "learning_rate": 4.55664315766538e-07, "loss": 0.00028173625469207764, "memory(GiB)": 39.09, "reward": 0.6617639064788818, "reward_std": 0.08956889808177948, "rewards/VisualizationJSONCombinedORM/mean": 0.6617639064788818, "rewards/VisualizationJSONCombinedORM/std": 0.09486503154039383, "step": 1060, "train_speed(iter/s)": 0.026594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 319.8125, "completions/min_length": 269.0, "epoch": 0.8775847808105872, "grad_norm": 0.17825010418891907, "kl": 0.0343017578125, "learning_rate": 4.4966160707107075e-07, "loss": 0.00034267082810401917, "memory(GiB)": 39.09, "reward": 0.7616198062896729, "reward_std": 0.1128615289926529, "rewards/VisualizationJSONCombinedORM/mean": 0.7616198062896729, "rewards/VisualizationJSONCombinedORM/std": 0.1115870475769043, "step": 1061, "train_speed(iter/s)": 0.026603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 328.0625, "completions/min_length": 243.0, "epoch": 0.8784119106699751, "grad_norm": 0.16874808073043823, "kl": 0.03375244140625, "learning_rate": 4.436968373291489e-07, "loss": 0.00033773481845855713, "memory(GiB)": 39.09, "reward": 0.2317860722541809, "reward_std": 0.02863755263388157, "rewards/VisualizationJSONCombinedORM/mean": 0.2317860722541809, "rewards/VisualizationJSONCombinedORM/std": 0.08633949607610703, "step": 1062, "train_speed(iter/s)": 0.02661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 301.5, "completions/min_length": 255.0, "epoch": 0.8792390405293631, "grad_norm": 0.18374797701835632, "kl": 0.030120849609375, "learning_rate": 4.377700562727055e-07, "loss": 0.00030117854475975037, "memory(GiB)": 39.09, "reward": 0.48461368680000305, "reward_std": 0.08769053965806961, "rewards/VisualizationJSONCombinedORM/mean": 0.48461368680000305, "rewards/VisualizationJSONCombinedORM/std": 0.1766144335269928, "step": 1063, "train_speed(iter/s)": 0.026618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 336.25, "completions/min_length": 265.0, "epoch": 0.880066170388751, "grad_norm": 0.19308938086032867, "kl": 0.05047607421875, "learning_rate": 4.318813133169375e-07, "loss": 0.0005032941699028015, "memory(GiB)": 39.09, "reward": 0.2670571506023407, "reward_std": 0.03879503160715103, "rewards/VisualizationJSONCombinedORM/mean": 0.2670571506023407, "rewards/VisualizationJSONCombinedORM/std": 0.09230834245681763, "step": 1064, "train_speed(iter/s)": 0.026625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 317.625, "completions/min_length": 240.0, "epoch": 0.8808933002481389, "grad_norm": 0.16426704823970795, "kl": 0.036102294921875, "learning_rate": 4.2603065755989493e-07, "loss": 0.0003609657287597656, "memory(GiB)": 39.09, "reward": 0.6535018682479858, "reward_std": 0.12943080067634583, "rewards/VisualizationJSONCombinedORM/mean": 0.6535018682479858, "rewards/VisualizationJSONCombinedORM/std": 0.13762885332107544, "step": 1065, "train_speed(iter/s)": 0.026629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 307.0, "completions/min_length": 265.0, "epoch": 0.8817204301075269, "grad_norm": 0.1801074594259262, "kl": 0.04486083984375, "learning_rate": 4.202181377820752e-07, "loss": 0.0004478245973587036, "memory(GiB)": 39.09, "reward": 0.2568650543689728, "reward_std": 0.04340825602412224, "rewards/VisualizationJSONCombinedORM/mean": 0.2568650543689728, "rewards/VisualizationJSONCombinedORM/std": 0.05424895137548447, "step": 1066, "train_speed(iter/s)": 0.026637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 320.5625, "completions/min_length": 237.0, "epoch": 0.8825475599669148, "grad_norm": 0.19585192203521729, "kl": 0.037200927734375, "learning_rate": 4.1444380244600623e-07, "loss": 0.00037150830030441284, "memory(GiB)": 39.09, "reward": 0.47109439969062805, "reward_std": 0.09893324971199036, "rewards/VisualizationJSONCombinedORM/mean": 0.47109439969062805, "rewards/VisualizationJSONCombinedORM/std": 0.19312629103660583, "step": 1067, "train_speed(iter/s)": 0.026644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 307.125, "completions/min_length": 248.0, "epoch": 0.8833746898263027, "grad_norm": 0.1648993343114853, "kl": 0.030303955078125, "learning_rate": 4.087076996958561e-07, "loss": 0.00030344724655151367, "memory(GiB)": 39.09, "reward": 0.7261194586753845, "reward_std": 0.08327057957649231, "rewards/VisualizationJSONCombinedORM/mean": 0.7261194586753845, "rewards/VisualizationJSONCombinedORM/std": 0.08520433306694031, "step": 1068, "train_speed(iter/s)": 0.02665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 283.25, "completions/min_length": 248.0, "epoch": 0.8842018196856907, "grad_norm": 0.15568068623542786, "kl": 0.02703857421875, "learning_rate": 4.0300987735701733e-07, "loss": 0.00027051568031311035, "memory(GiB)": 39.09, "reward": 0.35912370681762695, "reward_std": 0.04963862895965576, "rewards/VisualizationJSONCombinedORM/mean": 0.35912370681762695, "rewards/VisualizationJSONCombinedORM/std": 0.13385741412639618, "step": 1069, "train_speed(iter/s)": 0.026658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 278.0, "completions/min_length": 231.0, "epoch": 0.8850289495450786, "grad_norm": 0.14972683787345886, "kl": 0.032470703125, "learning_rate": 3.973503829357223e-07, "loss": 0.00032469816505908966, "memory(GiB)": 39.09, "reward": 0.5471013188362122, "reward_std": 0.08146543055772781, "rewards/VisualizationJSONCombinedORM/mean": 0.5471013188362122, "rewards/VisualizationJSONCombinedORM/std": 0.1399802267551422, "step": 1070, "train_speed(iter/s)": 0.026666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 299.375, "completions/min_length": 227.0, "epoch": 0.8858560794044665, "grad_norm": 0.1614263653755188, "kl": 0.05828857421875, "learning_rate": 3.9172926361863316e-07, "loss": 0.0005811266601085663, "memory(GiB)": 39.09, "reward": 0.5497574210166931, "reward_std": 0.10461975634098053, "rewards/VisualizationJSONCombinedORM/mean": 0.5497574210166931, "rewards/VisualizationJSONCombinedORM/std": 0.12594358623027802, "step": 1071, "train_speed(iter/s)": 0.026675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 313.75, "completions/min_length": 266.0, "epoch": 0.8866832092638545, "grad_norm": 0.16283579170703888, "kl": 0.0701904296875, "learning_rate": 3.8614656627246115e-07, "loss": 0.0007022172212600708, "memory(GiB)": 39.09, "reward": 0.45018547773361206, "reward_std": 0.05769038945436478, "rewards/VisualizationJSONCombinedORM/mean": 0.45018547773361206, "rewards/VisualizationJSONCombinedORM/std": 0.24925729632377625, "step": 1072, "train_speed(iter/s)": 0.026683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 294.625, "completions/min_length": 238.0, "epoch": 0.8875103391232424, "grad_norm": 0.16026929020881653, "kl": 0.0291748046875, "learning_rate": 3.8060233744356634e-07, "loss": 0.00029186904430389404, "memory(GiB)": 39.09, "reward": 0.4618404507637024, "reward_std": 0.07827329635620117, "rewards/VisualizationJSONCombinedORM/mean": 0.4618404507637024, "rewards/VisualizationJSONCombinedORM/std": 0.11204471439123154, "step": 1073, "train_speed(iter/s)": 0.026691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 303.3125, "completions/min_length": 232.0, "epoch": 0.8883374689826302, "grad_norm": 0.17326706647872925, "kl": 0.05352783203125, "learning_rate": 3.750966233575753e-07, "loss": 0.0005347989499568939, "memory(GiB)": 39.09, "reward": 0.6170778274536133, "reward_std": 0.11315380036830902, "rewards/VisualizationJSONCombinedORM/mean": 0.6170778274536133, "rewards/VisualizationJSONCombinedORM/std": 0.11321822553873062, "step": 1074, "train_speed(iter/s)": 0.026699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 315.6875, "completions/min_length": 235.0, "epoch": 0.8891645988420182, "grad_norm": 0.1686524599790573, "kl": 0.093505859375, "learning_rate": 3.696294699189934e-07, "loss": 0.000936310738325119, "memory(GiB)": 39.09, "reward": 0.3721650540828705, "reward_std": 0.08990518748760223, "rewards/VisualizationJSONCombinedORM/mean": 0.3721650540828705, "rewards/VisualizationJSONCombinedORM/std": 0.1689193993806839, "step": 1075, "train_speed(iter/s)": 0.026707 }, { "epoch": 0.8891645988420182, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 350.2083333333333, "eval_completions/mean_length": 298.125, "eval_completions/min_length": 248.91666666666666, "eval_kl": 0.042882283528645836, "eval_loss": 0.00042952349758706987, "eval_reward": 0.47018465399742126, "eval_reward_std": 0.07739507445755105, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47018465399742126, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07739507756195962, "eval_runtime": 302.4336, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 322.25, "completions/min_length": 266.0, "epoch": 0.8899917287014061, "grad_norm": 0.2098405957221985, "kl": 0.0433349609375, "learning_rate": 3.642009227108195e-07, "loss": 0.00043331459164619446, "memory(GiB)": 39.09, "reward": 0.41530641913414, "reward_std": 0.06914155185222626, "rewards/VisualizationJSONCombinedORM/mean": 0.41530641913414, "rewards/VisualizationJSONCombinedORM/std": 0.1003955751657486, "step": 1076, "train_speed(iter/s)": 0.026516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 328.3125, "completions/min_length": 248.0, "epoch": 0.890818858560794, "grad_norm": 0.1794760376214981, "kl": 0.05047607421875, "learning_rate": 3.588110269941747e-07, "loss": 0.0005047749727964401, "memory(GiB)": 39.09, "reward": 0.4412914514541626, "reward_std": 0.10154610872268677, "rewards/VisualizationJSONCombinedORM/mean": 0.4412914514541626, "rewards/VisualizationJSONCombinedORM/std": 0.10712484270334244, "step": 1077, "train_speed(iter/s)": 0.026524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 309.5, "completions/min_length": 254.0, "epoch": 0.891645988420182, "grad_norm": 0.19692979753017426, "kl": 0.0638427734375, "learning_rate": 3.5345982770791096e-07, "loss": 0.0006385110318660736, "memory(GiB)": 39.09, "reward": 0.3033790588378906, "reward_std": 0.07983754575252533, "rewards/VisualizationJSONCombinedORM/mean": 0.3033790588378906, "rewards/VisualizationJSONCombinedORM/std": 0.08347871899604797, "step": 1078, "train_speed(iter/s)": 0.02653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 294.3125, "completions/min_length": 252.0, "epoch": 0.8924731182795699, "grad_norm": 0.2082144021987915, "kl": 0.0654296875, "learning_rate": 3.4814736946825357e-07, "loss": 0.0006542578339576721, "memory(GiB)": 39.09, "reward": 0.37255528569221497, "reward_std": 0.09094773977994919, "rewards/VisualizationJSONCombinedORM/mean": 0.37255528569221497, "rewards/VisualizationJSONCombinedORM/std": 0.15868380665779114, "step": 1079, "train_speed(iter/s)": 0.02654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 266.75, "completions/min_length": 226.0, "epoch": 0.8933002481389578, "grad_norm": 0.16826502978801727, "kl": 0.04583740234375, "learning_rate": 3.4287369656841095e-07, "loss": 0.0004579499363899231, "memory(GiB)": 39.09, "reward": 0.5009593367576599, "reward_std": 0.07389605045318604, "rewards/VisualizationJSONCombinedORM/mean": 0.5009593367576599, "rewards/VisualizationJSONCombinedORM/std": 0.20436064898967743, "step": 1080, "train_speed(iter/s)": 0.026549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 309.5, "completions/min_length": 249.0, "epoch": 0.8941273779983457, "grad_norm": 0.19168758392333984, "kl": 0.0413818359375, "learning_rate": 3.3763885297822153e-07, "loss": 0.0004144906997680664, "memory(GiB)": 39.09, "reward": 0.4687643349170685, "reward_std": 0.13128378987312317, "rewards/VisualizationJSONCombinedORM/mean": 0.4687643349170685, "rewards/VisualizationJSONCombinedORM/std": 0.13292990624904633, "step": 1081, "train_speed(iter/s)": 0.026557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 293.125, "completions/min_length": 232.0, "epoch": 0.8949545078577337, "grad_norm": 0.1757350116968155, "kl": 0.0361328125, "learning_rate": 3.324428823437753e-07, "loss": 0.00036134570837020874, "memory(GiB)": 39.09, "reward": 0.4692828059196472, "reward_std": 0.10954277962446213, "rewards/VisualizationJSONCombinedORM/mean": 0.4692828059196472, "rewards/VisualizationJSONCombinedORM/std": 0.11663203686475754, "step": 1082, "train_speed(iter/s)": 0.026565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 309.625, "completions/min_length": 247.0, "epoch": 0.8957816377171216, "grad_norm": 0.17546233534812927, "kl": 0.0325927734375, "learning_rate": 3.272858279870583e-07, "loss": 0.0003265403211116791, "memory(GiB)": 39.09, "reward": 0.7446534633636475, "reward_std": 0.09520002454519272, "rewards/VisualizationJSONCombinedORM/mean": 0.7446534633636475, "rewards/VisualizationJSONCombinedORM/std": 0.11259544640779495, "step": 1083, "train_speed(iter/s)": 0.026573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 306.875, "completions/min_length": 238.0, "epoch": 0.8966087675765095, "grad_norm": 0.19120877981185913, "kl": 0.043701171875, "learning_rate": 3.22167732905585e-07, "loss": 0.0004359893500804901, "memory(GiB)": 39.09, "reward": 0.4644339680671692, "reward_std": 0.08423773944377899, "rewards/VisualizationJSONCombinedORM/mean": 0.4644339680671692, "rewards/VisualizationJSONCombinedORM/std": 0.1857883334159851, "step": 1084, "train_speed(iter/s)": 0.02658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 291.875, "completions/min_length": 213.0, "epoch": 0.8974358974358975, "grad_norm": 0.17451626062393188, "kl": 0.03759765625, "learning_rate": 3.170886397720435e-07, "loss": 0.0003761202096939087, "memory(GiB)": 39.09, "reward": 0.7008322477340698, "reward_std": 0.10377179831266403, "rewards/VisualizationJSONCombinedORM/mean": 0.7008322477340698, "rewards/VisualizationJSONCombinedORM/std": 0.14634372293949127, "step": 1085, "train_speed(iter/s)": 0.02659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 331.5, "completions/min_length": 217.0, "epoch": 0.8982630272952854, "grad_norm": 0.204909548163414, "kl": 0.0377197265625, "learning_rate": 3.120485909339399e-07, "loss": 0.0003765784204006195, "memory(GiB)": 39.09, "reward": 0.4457817077636719, "reward_std": 0.07259152084589005, "rewards/VisualizationJSONCombinedORM/mean": 0.4457817077636719, "rewards/VisualizationJSONCombinedORM/std": 0.07256996631622314, "step": 1086, "train_speed(iter/s)": 0.026596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 303.375, "completions/min_length": 255.0, "epoch": 0.8990901571546733, "grad_norm": 0.20005415380001068, "kl": 0.034912109375, "learning_rate": 3.070476284132429e-07, "loss": 0.0003497898578643799, "memory(GiB)": 39.09, "reward": 0.5367544889450073, "reward_std": 0.1038818508386612, "rewards/VisualizationJSONCombinedORM/mean": 0.5367544889450073, "rewards/VisualizationJSONCombinedORM/std": 0.16768264770507812, "step": 1087, "train_speed(iter/s)": 0.026605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 302.4375, "completions/min_length": 250.0, "epoch": 0.8999172870140613, "grad_norm": 0.19013676047325134, "kl": 0.0626220703125, "learning_rate": 3.02085793906034e-07, "loss": 0.0006274320185184479, "memory(GiB)": 39.09, "reward": 0.5508673191070557, "reward_std": 0.10258147865533829, "rewards/VisualizationJSONCombinedORM/mean": 0.5508673191070557, "rewards/VisualizationJSONCombinedORM/std": 0.13573876023292542, "step": 1088, "train_speed(iter/s)": 0.026613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 315.375, "completions/min_length": 255.0, "epoch": 0.9007444168734491, "grad_norm": 0.16140219569206238, "kl": 0.0416259765625, "learning_rate": 2.9716312878216194e-07, "loss": 0.0004169084131717682, "memory(GiB)": 39.09, "reward": 0.3564770519733429, "reward_std": 0.058557771146297455, "rewards/VisualizationJSONCombinedORM/mean": 0.3564770519733429, "rewards/VisualizationJSONCombinedORM/std": 0.0596698634326458, "step": 1089, "train_speed(iter/s)": 0.026621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 310.5625, "completions/min_length": 223.0, "epoch": 0.901571546732837, "grad_norm": 0.25843292474746704, "kl": 0.04541015625, "learning_rate": 2.9227967408489653e-07, "loss": 0.0004536770284175873, "memory(GiB)": 39.09, "reward": 0.3798951506614685, "reward_std": 0.07732784748077393, "rewards/VisualizationJSONCombinedORM/mean": 0.3798951506614685, "rewards/VisualizationJSONCombinedORM/std": 0.08500133454799652, "step": 1090, "train_speed(iter/s)": 0.026625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 270.8125, "completions/min_length": 202.0, "epoch": 0.902398676592225, "grad_norm": 0.15754860639572144, "kl": 0.0380859375, "learning_rate": 2.874354705305843e-07, "loss": 0.0003805235028266907, "memory(GiB)": 39.09, "reward": 0.5565105676651001, "reward_std": 0.05826813355088234, "rewards/VisualizationJSONCombinedORM/mean": 0.5565105676651001, "rewards/VisualizationJSONCombinedORM/std": 0.19606949388980865, "step": 1091, "train_speed(iter/s)": 0.026635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 328.9375, "completions/min_length": 253.0, "epoch": 0.9032258064516129, "grad_norm": 0.19016288220882416, "kl": 0.04095458984375, "learning_rate": 2.826305585083144e-07, "loss": 0.0004096105694770813, "memory(GiB)": 39.09, "reward": 0.6163627505302429, "reward_std": 0.10319951176643372, "rewards/VisualizationJSONCombinedORM/mean": 0.6163627505302429, "rewards/VisualizationJSONCombinedORM/std": 0.10245928168296814, "step": 1092, "train_speed(iter/s)": 0.026641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 317.8125, "completions/min_length": 263.0, "epoch": 0.9040529363110008, "grad_norm": 0.16089880466461182, "kl": 0.03564453125, "learning_rate": 2.778649780795739e-07, "loss": 0.00035649538040161133, "memory(GiB)": 39.09, "reward": 0.7113543748855591, "reward_std": 0.07255378365516663, "rewards/VisualizationJSONCombinedORM/mean": 0.7113543748855591, "rewards/VisualizationJSONCombinedORM/std": 0.08159070461988449, "step": 1093, "train_speed(iter/s)": 0.026648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 314.9375, "completions/min_length": 232.0, "epoch": 0.9048800661703887, "grad_norm": 0.15633611381053925, "kl": 0.0701904296875, "learning_rate": 2.7313876897792304e-07, "loss": 0.0007025822997093201, "memory(GiB)": 39.09, "reward": 0.38801220059394836, "reward_std": 0.058981262147426605, "rewards/VisualizationJSONCombinedORM/mean": 0.38801220059394836, "rewards/VisualizationJSONCombinedORM/std": 0.14293234050273895, "step": 1094, "train_speed(iter/s)": 0.026653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 319.5, "completions/min_length": 255.0, "epoch": 0.9057071960297767, "grad_norm": 0.1621459275484085, "kl": 0.045166015625, "learning_rate": 2.684519706086558e-07, "loss": 0.0004513673484325409, "memory(GiB)": 39.09, "reward": 0.6041992902755737, "reward_std": 0.07303065061569214, "rewards/VisualizationJSONCombinedORM/mean": 0.6041992902755737, "rewards/VisualizationJSONCombinedORM/std": 0.08665964007377625, "step": 1095, "train_speed(iter/s)": 0.026658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 334.1875, "completions/min_length": 261.0, "epoch": 0.9065343258891646, "grad_norm": 0.15008990466594696, "kl": 0.0325927734375, "learning_rate": 2.6380462204847633e-07, "loss": 0.00032600387930870056, "memory(GiB)": 39.09, "reward": 0.5815634727478027, "reward_std": 0.09193912148475647, "rewards/VisualizationJSONCombinedORM/mean": 0.5815634727478027, "rewards/VisualizationJSONCombinedORM/std": 0.19530627131462097, "step": 1096, "train_speed(iter/s)": 0.026664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 310.75, "completions/min_length": 238.0, "epoch": 0.9073614557485525, "grad_norm": 0.17062999308109283, "kl": 0.035675048828125, "learning_rate": 2.5919676204517073e-07, "loss": 0.0003561675548553467, "memory(GiB)": 39.09, "reward": 0.36158883571624756, "reward_std": 0.08142223954200745, "rewards/VisualizationJSONCombinedORM/mean": 0.36158883571624756, "rewards/VisualizationJSONCombinedORM/std": 0.11320240795612335, "step": 1097, "train_speed(iter/s)": 0.026671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 294.6875, "completions/min_length": 263.0, "epoch": 0.9081885856079405, "grad_norm": 0.21738851070404053, "kl": 0.041259765625, "learning_rate": 2.546284290172862e-07, "loss": 0.00041300058364868164, "memory(GiB)": 39.09, "reward": 0.4952968955039978, "reward_std": 0.08552952110767365, "rewards/VisualizationJSONCombinedORM/mean": 0.4952968955039978, "rewards/VisualizationJSONCombinedORM/std": 0.22137919068336487, "step": 1098, "train_speed(iter/s)": 0.026676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 322.8125, "completions/min_length": 234.0, "epoch": 0.9090157154673284, "grad_norm": 0.15483471751213074, "kl": 0.0382080078125, "learning_rate": 2.500996610538081e-07, "loss": 0.00038278475403785706, "memory(GiB)": 39.09, "reward": 0.6217978000640869, "reward_std": 0.14702211320400238, "rewards/VisualizationJSONCombinedORM/mean": 0.6217978000640869, "rewards/VisualizationJSONCombinedORM/std": 0.1742393970489502, "step": 1099, "train_speed(iter/s)": 0.026679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 278.0625, "completions/min_length": 222.0, "epoch": 0.9098428453267163, "grad_norm": 0.198515847325325, "kl": 0.03521728515625, "learning_rate": 2.4561049591384387e-07, "loss": 0.00035266485065221786, "memory(GiB)": 39.09, "reward": 0.5347734689712524, "reward_std": 0.06840994954109192, "rewards/VisualizationJSONCombinedORM/mean": 0.5347734689712524, "rewards/VisualizationJSONCombinedORM/std": 0.17413920164108276, "step": 1100, "train_speed(iter/s)": 0.026687 }, { "epoch": 0.9098428453267163, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 354.4583333333333, "eval_completions/mean_length": 296.6041666666667, "eval_completions/min_length": 245.79166666666666, "eval_kl": 0.0432891845703125, "eval_loss": 0.00043272972106933594, "eval_reward": 0.4709331536044677, "eval_reward_std": 0.07889871811494231, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4709331536044677, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07889871989997725, "eval_runtime": 305.2875, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 274.0625, "completions/min_length": 229.0, "epoch": 0.9106699751861043, "grad_norm": 0.1971379965543747, "kl": 0.04473876953125, "learning_rate": 2.411609710263091e-07, "loss": 0.00044820085167884827, "memory(GiB)": 39.09, "reward": 0.5542173385620117, "reward_std": 0.10927275568246841, "rewards/VisualizationJSONCombinedORM/mean": 0.5542173385620117, "rewards/VisualizationJSONCombinedORM/std": 0.12990814447402954, "step": 1101, "train_speed(iter/s)": 0.026498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 305.4375, "completions/min_length": 201.0, "epoch": 0.9114971050454922, "grad_norm": 0.18250222504138947, "kl": 0.0665283203125, "learning_rate": 2.367511234896125e-07, "loss": 0.0006651505827903748, "memory(GiB)": 39.09, "reward": 0.43128836154937744, "reward_std": 0.09686776995658875, "rewards/VisualizationJSONCombinedORM/mean": 0.43128836154937744, "rewards/VisualizationJSONCombinedORM/std": 0.10668158531188965, "step": 1102, "train_speed(iter/s)": 0.026505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 290.0625, "completions/min_length": 255.0, "epoch": 0.91232423490488, "grad_norm": 0.15801754593849182, "kl": 0.0294189453125, "learning_rate": 2.3238099007134973e-07, "loss": 0.00029404088854789734, "memory(GiB)": 39.09, "reward": 0.6178666353225708, "reward_std": 0.08512279391288757, "rewards/VisualizationJSONCombinedORM/mean": 0.6178666353225708, "rewards/VisualizationJSONCombinedORM/std": 0.20957611501216888, "step": 1103, "train_speed(iter/s)": 0.026514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 297.75, "completions/min_length": 261.0, "epoch": 0.913151364764268, "grad_norm": 0.16774369776248932, "kl": 0.03009033203125, "learning_rate": 2.280506072079963e-07, "loss": 0.00030102580785751343, "memory(GiB)": 39.09, "reward": 0.520586371421814, "reward_std": 0.056392259895801544, "rewards/VisualizationJSONCombinedORM/mean": 0.520586371421814, "rewards/VisualizationJSONCombinedORM/std": 0.267341673374176, "step": 1104, "train_speed(iter/s)": 0.026523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 297.75, "completions/min_length": 237.0, "epoch": 0.9139784946236559, "grad_norm": 0.14988434314727783, "kl": 0.041900634765625, "learning_rate": 2.237600110046001e-07, "loss": 0.0004197061061859131, "memory(GiB)": 39.09, "reward": 0.5928292870521545, "reward_std": 0.05906001478433609, "rewards/VisualizationJSONCombinedORM/mean": 0.5928292870521545, "rewards/VisualizationJSONCombinedORM/std": 0.24831180274486542, "step": 1105, "train_speed(iter/s)": 0.02653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 299.9375, "completions/min_length": 242.0, "epoch": 0.9148056244830438, "grad_norm": 0.14619579911231995, "kl": 0.02655029296875, "learning_rate": 2.1950923723448704e-07, "loss": 0.00026644766330718994, "memory(GiB)": 39.09, "reward": 0.5311951637268066, "reward_std": 0.06760140508413315, "rewards/VisualizationJSONCombinedORM/mean": 0.5311951637268066, "rewards/VisualizationJSONCombinedORM/std": 0.1186438575387001, "step": 1106, "train_speed(iter/s)": 0.026537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 310.1875, "completions/min_length": 238.0, "epoch": 0.9156327543424317, "grad_norm": 0.1838262975215912, "kl": 0.0511474609375, "learning_rate": 2.152983213389559e-07, "loss": 0.0005096755921840668, "memory(GiB)": 39.09, "reward": 0.42785364389419556, "reward_std": 0.06929303705692291, "rewards/VisualizationJSONCombinedORM/mean": 0.42785364389419556, "rewards/VisualizationJSONCombinedORM/std": 0.25316303968429565, "step": 1107, "train_speed(iter/s)": 0.026545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 314.0, "completions/min_length": 265.0, "epoch": 0.9164598842018197, "grad_norm": 0.1780070811510086, "kl": 0.038604736328125, "learning_rate": 2.11127298426988e-07, "loss": 0.0003864690661430359, "memory(GiB)": 39.09, "reward": 0.4123343229293823, "reward_std": 0.06732018291950226, "rewards/VisualizationJSONCombinedORM/mean": 0.4123343229293823, "rewards/VisualizationJSONCombinedORM/std": 0.12690415978431702, "step": 1108, "train_speed(iter/s)": 0.02655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 288.125, "completions/min_length": 234.0, "epoch": 0.9172870140612076, "grad_norm": 0.16745875775814056, "kl": 0.04168701171875, "learning_rate": 2.0699620327495174e-07, "loss": 0.00041773542761802673, "memory(GiB)": 39.09, "reward": 0.5922995209693909, "reward_std": 0.08429886400699615, "rewards/VisualizationJSONCombinedORM/mean": 0.5922995209693909, "rewards/VisualizationJSONCombinedORM/std": 0.16370466351509094, "step": 1109, "train_speed(iter/s)": 0.026557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 316.375, "completions/min_length": 257.0, "epoch": 0.9181141439205955, "grad_norm": 0.16580091416835785, "kl": 0.0504150390625, "learning_rate": 2.0290507032631356e-07, "loss": 0.0005034990608692169, "memory(GiB)": 39.09, "reward": 0.5568876266479492, "reward_std": 0.11799873411655426, "rewards/VisualizationJSONCombinedORM/mean": 0.5568876266479492, "rewards/VisualizationJSONCombinedORM/std": 0.13838493824005127, "step": 1110, "train_speed(iter/s)": 0.026562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 291.9375, "completions/min_length": 218.0, "epoch": 0.9189412737799835, "grad_norm": 0.1454712152481079, "kl": 0.030670166015625, "learning_rate": 1.9885393369134976e-07, "loss": 0.00030659139156341553, "memory(GiB)": 39.09, "reward": 0.7587473392486572, "reward_std": 0.060432858765125275, "rewards/VisualizationJSONCombinedORM/mean": 0.7587473392486572, "rewards/VisualizationJSONCombinedORM/std": 0.19954842329025269, "step": 1111, "train_speed(iter/s)": 0.026569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 313.5, "completions/min_length": 272.0, "epoch": 0.9197684036393714, "grad_norm": 0.17575232684612274, "kl": 0.0413818359375, "learning_rate": 1.9484282714686442e-07, "loss": 0.0004141777753829956, "memory(GiB)": 39.09, "reward": 0.400969922542572, "reward_std": 0.0445784330368042, "rewards/VisualizationJSONCombinedORM/mean": 0.400969922542572, "rewards/VisualizationJSONCombinedORM/std": 0.14022323489189148, "step": 1112, "train_speed(iter/s)": 0.026574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 320.5625, "completions/min_length": 261.0, "epoch": 0.9205955334987593, "grad_norm": 0.15690822899341583, "kl": 0.0238037109375, "learning_rate": 1.908717841359048e-07, "loss": 0.0002381652593612671, "memory(GiB)": 39.09, "reward": 0.5532070398330688, "reward_std": 0.06741541624069214, "rewards/VisualizationJSONCombinedORM/mean": 0.5532070398330688, "rewards/VisualizationJSONCombinedORM/std": 0.07688219100236893, "step": 1113, "train_speed(iter/s)": 0.026581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 289.8125, "completions/min_length": 218.0, "epoch": 0.9214226633581473, "grad_norm": 0.16176335513591766, "kl": 0.04107666015625, "learning_rate": 1.8694083776748472e-07, "loss": 0.0004101879894733429, "memory(GiB)": 39.09, "reward": 0.5397459268569946, "reward_std": 0.07540705800056458, "rewards/VisualizationJSONCombinedORM/mean": 0.5397459268569946, "rewards/VisualizationJSONCombinedORM/std": 0.11132632195949554, "step": 1114, "train_speed(iter/s)": 0.02659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 303.9375, "completions/min_length": 232.0, "epoch": 0.9222497932175352, "grad_norm": 0.17757190763950348, "kl": 0.0545654296875, "learning_rate": 1.8305002081630885e-07, "loss": 0.0005445443093776703, "memory(GiB)": 39.09, "reward": 0.6091506481170654, "reward_std": 0.07033385336399078, "rewards/VisualizationJSONCombinedORM/mean": 0.6091506481170654, "rewards/VisualizationJSONCombinedORM/std": 0.1737130731344223, "step": 1115, "train_speed(iter/s)": 0.026597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 319.3125, "completions/min_length": 249.0, "epoch": 0.9230769230769231, "grad_norm": 0.2143448442220688, "kl": 0.047027587890625, "learning_rate": 1.7919936572249442e-07, "loss": 0.0004702173173427582, "memory(GiB)": 39.09, "reward": 0.4038991630077362, "reward_std": 0.09512408077716827, "rewards/VisualizationJSONCombinedORM/mean": 0.4038991630077362, "rewards/VisualizationJSONCombinedORM/std": 0.09691356867551804, "step": 1116, "train_speed(iter/s)": 0.026604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 345.875, "completions/min_length": 257.0, "epoch": 0.923904052936311, "grad_norm": 0.1646738499403, "kl": 0.047607421875, "learning_rate": 1.7538890459131098e-07, "loss": 0.0004765857011079788, "memory(GiB)": 39.09, "reward": 0.5096011161804199, "reward_std": 0.05378838628530502, "rewards/VisualizationJSONCombinedORM/mean": 0.5096011161804199, "rewards/VisualizationJSONCombinedORM/std": 0.18898747861385345, "step": 1117, "train_speed(iter/s)": 0.026612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 292.25, "completions/min_length": 255.0, "epoch": 0.9247311827956989, "grad_norm": 0.19632570445537567, "kl": 0.0419921875, "learning_rate": 1.7161866919290004e-07, "loss": 0.0004197359085083008, "memory(GiB)": 39.09, "reward": 0.46196508407592773, "reward_std": 0.08815409988164902, "rewards/VisualizationJSONCombinedORM/mean": 0.46196508407592773, "rewards/VisualizationJSONCombinedORM/std": 0.1191045269370079, "step": 1118, "train_speed(iter/s)": 0.026616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 316.1875, "completions/min_length": 241.0, "epoch": 0.9255583126550868, "grad_norm": 0.1622951179742813, "kl": 0.05487060546875, "learning_rate": 1.6788869096202197e-07, "loss": 0.0005484521389007568, "memory(GiB)": 39.09, "reward": 0.6441358327865601, "reward_std": 0.09166394919157028, "rewards/VisualizationJSONCombinedORM/mean": 0.6441358327865601, "rewards/VisualizationJSONCombinedORM/std": 0.12009938061237335, "step": 1119, "train_speed(iter/s)": 0.026623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 310.375, "completions/min_length": 251.0, "epoch": 0.9263854425144747, "grad_norm": 0.2072513997554779, "kl": 0.04718017578125, "learning_rate": 1.641990009977834e-07, "loss": 0.00047142989933490753, "memory(GiB)": 39.09, "reward": 0.3416254222393036, "reward_std": 0.05982081592082977, "rewards/VisualizationJSONCombinedORM/mean": 0.3416254222393036, "rewards/VisualizationJSONCombinedORM/std": 0.11970878392457962, "step": 1120, "train_speed(iter/s)": 0.026629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 293.25, "completions/min_length": 231.0, "epoch": 0.9272125723738627, "grad_norm": 0.18706436455249786, "kl": 0.05487060546875, "learning_rate": 1.6054963006338742e-07, "loss": 0.00054951012134552, "memory(GiB)": 39.09, "reward": 0.2142494022846222, "reward_std": 0.031900376081466675, "rewards/VisualizationJSONCombinedORM/mean": 0.2142494022846222, "rewards/VisualizationJSONCombinedORM/std": 0.0343606099486351, "step": 1121, "train_speed(iter/s)": 0.026637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 290.9375, "completions/min_length": 252.0, "epoch": 0.9280397022332506, "grad_norm": 0.1861664056777954, "kl": 0.04669189453125, "learning_rate": 1.5694060858587046e-07, "loss": 0.000465909019112587, "memory(GiB)": 39.09, "reward": 0.5834947824478149, "reward_std": 0.07053089141845703, "rewards/VisualizationJSONCombinedORM/mean": 0.5834947824478149, "rewards/VisualizationJSONCombinedORM/std": 0.08387994766235352, "step": 1122, "train_speed(iter/s)": 0.026645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 278.8125, "completions/min_length": 209.0, "epoch": 0.9288668320926385, "grad_norm": 0.17533954977989197, "kl": 0.0308837890625, "learning_rate": 1.533719666558514e-07, "loss": 0.00030791014432907104, "memory(GiB)": 39.09, "reward": 0.66846764087677, "reward_std": 0.08836431801319122, "rewards/VisualizationJSONCombinedORM/mean": 0.66846764087677, "rewards/VisualizationJSONCombinedORM/std": 0.11151659488677979, "step": 1123, "train_speed(iter/s)": 0.026652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 316.375, "completions/min_length": 250.0, "epoch": 0.9296939619520265, "grad_norm": 0.16955824196338654, "kl": 0.028076171875, "learning_rate": 1.4984373402728014e-07, "loss": 0.0002804398536682129, "memory(GiB)": 39.09, "reward": 0.4972327947616577, "reward_std": 0.05081062391400337, "rewards/VisualizationJSONCombinedORM/mean": 0.4972327947616577, "rewards/VisualizationJSONCombinedORM/std": 0.2831774055957794, "step": 1124, "train_speed(iter/s)": 0.026659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 285.0, "completions/min_length": 222.0, "epoch": 0.9305210918114144, "grad_norm": 0.16722361743450165, "kl": 0.04296875, "learning_rate": 1.4635594011718935e-07, "loss": 0.0004298985004425049, "memory(GiB)": 39.09, "reward": 0.5351252555847168, "reward_std": 0.11035811901092529, "rewards/VisualizationJSONCombinedORM/mean": 0.5351252555847168, "rewards/VisualizationJSONCombinedORM/std": 0.10714098811149597, "step": 1125, "train_speed(iter/s)": 0.026666 }, { "epoch": 0.9305210918114144, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 358.125, "eval_completions/mean_length": 301.703125, "eval_completions/min_length": 252.875, "eval_kl": 0.0477447509765625, "eval_loss": 0.00047506639384664595, "eval_reward": 0.4634612252314885, "eval_reward_std": 0.07208021505114932, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4634612252314885, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07208021768989663, "eval_runtime": 307.6952, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 284.25, "completions/min_length": 240.0, "epoch": 0.9313482216708023, "grad_norm": 0.1659306138753891, "kl": 0.06475830078125, "learning_rate": 1.4290861400545031e-07, "loss": 0.0006487704813480377, "memory(GiB)": 39.09, "reward": 0.485272079706192, "reward_std": 0.07376283407211304, "rewards/VisualizationJSONCombinedORM/mean": 0.485272079706192, "rewards/VisualizationJSONCombinedORM/std": 0.15559467673301697, "step": 1126, "train_speed(iter/s)": 0.026482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 281.25, "completions/min_length": 229.0, "epoch": 0.9321753515301903, "grad_norm": 0.15427538752555847, "kl": 0.0263671875, "learning_rate": 1.39501784434527e-07, "loss": 0.0002642907202243805, "memory(GiB)": 39.09, "reward": 0.5554714202880859, "reward_std": 0.07531147450208664, "rewards/VisualizationJSONCombinedORM/mean": 0.5554714202880859, "rewards/VisualizationJSONCombinedORM/std": 0.11629614979028702, "step": 1127, "train_speed(iter/s)": 0.026489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 330.6875, "completions/min_length": 255.0, "epoch": 0.9330024813895782, "grad_norm": 0.19099324941635132, "kl": 0.05255126953125, "learning_rate": 1.361354798092429e-07, "loss": 0.000525306910276413, "memory(GiB)": 39.09, "reward": 0.40298330783843994, "reward_std": 0.06338968873023987, "rewards/VisualizationJSONCombinedORM/mean": 0.40298330783843994, "rewards/VisualizationJSONCombinedORM/std": 0.1974867284297943, "step": 1128, "train_speed(iter/s)": 0.026496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 299.8125, "completions/min_length": 243.0, "epoch": 0.9338296112489661, "grad_norm": 0.1684941053390503, "kl": 0.066162109375, "learning_rate": 1.328097281965357e-07, "loss": 0.0006607882678508759, "memory(GiB)": 39.09, "reward": 0.6584578156471252, "reward_std": 0.14414437115192413, "rewards/VisualizationJSONCombinedORM/mean": 0.6584578156471252, "rewards/VisualizationJSONCombinedORM/std": 0.18467985093593597, "step": 1129, "train_speed(iter/s)": 0.026501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 298.125, "completions/min_length": 232.0, "epoch": 0.9346567411083541, "grad_norm": 0.15211325883865356, "kl": 0.03875732421875, "learning_rate": 1.2952455732523238e-07, "loss": 0.00038705766201019287, "memory(GiB)": 39.09, "reward": 0.5214530229568481, "reward_std": 0.08136653900146484, "rewards/VisualizationJSONCombinedORM/mean": 0.5214530229568481, "rewards/VisualizationJSONCombinedORM/std": 0.1123085543513298, "step": 1130, "train_speed(iter/s)": 0.026509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 258.125, "completions/min_length": 235.0, "epoch": 0.9354838709677419, "grad_norm": 0.18190337717533112, "kl": 0.033233642578125, "learning_rate": 1.2627999458580952e-07, "loss": 0.0003332793712615967, "memory(GiB)": 39.09, "reward": 0.5336886644363403, "reward_std": 0.09878890216350555, "rewards/VisualizationJSONCombinedORM/mean": 0.5336886644363403, "rewards/VisualizationJSONCombinedORM/std": 0.21061740815639496, "step": 1131, "train_speed(iter/s)": 0.026521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 273.5, "completions/min_length": 213.0, "epoch": 0.9363110008271298, "grad_norm": 0.17134210467338562, "kl": 0.04180908203125, "learning_rate": 1.2307606703017173e-07, "loss": 0.00041862577199935913, "memory(GiB)": 39.09, "reward": 0.5867749452590942, "reward_std": 0.07491488754749298, "rewards/VisualizationJSONCombinedORM/mean": 0.5867749452590942, "rewards/VisualizationJSONCombinedORM/std": 0.1818724423646927, "step": 1132, "train_speed(iter/s)": 0.026528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 316.625, "completions/min_length": 220.0, "epoch": 0.9371381306865177, "grad_norm": 0.15210923552513123, "kl": 0.0211181640625, "learning_rate": 1.199128013714218e-07, "loss": 0.00021094083786010742, "memory(GiB)": 39.09, "reward": 0.6068366765975952, "reward_std": 0.03777528554201126, "rewards/VisualizationJSONCombinedORM/mean": 0.6068366765975952, "rewards/VisualizationJSONCombinedORM/std": 0.18534064292907715, "step": 1133, "train_speed(iter/s)": 0.026534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 286.3125, "completions/min_length": 239.0, "epoch": 0.9379652605459057, "grad_norm": 0.16790999472141266, "kl": 0.044677734375, "learning_rate": 1.1679022398363937e-07, "loss": 0.0004452168941497803, "memory(GiB)": 39.09, "reward": 0.5075588226318359, "reward_std": 0.06112976372241974, "rewards/VisualizationJSONCombinedORM/mean": 0.5075588226318359, "rewards/VisualizationJSONCombinedORM/std": 0.2109854817390442, "step": 1134, "train_speed(iter/s)": 0.026539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 300.25, "completions/min_length": 257.0, "epoch": 0.9387923904052936, "grad_norm": 0.16977187991142273, "kl": 0.043212890625, "learning_rate": 1.1370836090166204e-07, "loss": 0.00043261051177978516, "memory(GiB)": 39.09, "reward": 0.7122008800506592, "reward_std": 0.0709308609366417, "rewards/VisualizationJSONCombinedORM/mean": 0.7122008800506592, "rewards/VisualizationJSONCombinedORM/std": 0.08146408945322037, "step": 1135, "train_speed(iter/s)": 0.026544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 318.5625, "completions/min_length": 264.0, "epoch": 0.9396195202646815, "grad_norm": 0.21743468940258026, "kl": 0.03399658203125, "learning_rate": 1.1066723782086619e-07, "loss": 0.00033984333276748657, "memory(GiB)": 39.09, "reward": 0.3201659917831421, "reward_std": 0.04161693900823593, "rewards/VisualizationJSONCombinedORM/mean": 0.3201659917831421, "rewards/VisualizationJSONCombinedORM/std": 0.16839911043643951, "step": 1136, "train_speed(iter/s)": 0.026551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 318.875, "completions/min_length": 252.0, "epoch": 0.9404466501240695, "grad_norm": 0.1559406965970993, "kl": 0.02978515625, "learning_rate": 1.0766688009695548e-07, "loss": 0.00029806792736053467, "memory(GiB)": 39.09, "reward": 0.6769573092460632, "reward_std": 0.09131407737731934, "rewards/VisualizationJSONCombinedORM/mean": 0.6769573092460632, "rewards/VisualizationJSONCombinedORM/std": 0.09063854068517685, "step": 1137, "train_speed(iter/s)": 0.026558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 296.5625, "completions/min_length": 250.0, "epoch": 0.9412737799834574, "grad_norm": 0.20522445440292358, "kl": 0.03546142578125, "learning_rate": 1.0470731274574542e-07, "loss": 0.00035565346479415894, "memory(GiB)": 39.09, "reward": 0.2532683312892914, "reward_std": 0.040612656623125076, "rewards/VisualizationJSONCombinedORM/mean": 0.2532683312892914, "rewards/VisualizationJSONCombinedORM/std": 0.11124890297651291, "step": 1138, "train_speed(iter/s)": 0.026563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 293.0, "completions/min_length": 221.0, "epoch": 0.9421009098428453, "grad_norm": 0.18041673302650452, "kl": 0.04974365234375, "learning_rate": 1.0178856044295971e-07, "loss": 0.0004974603652954102, "memory(GiB)": 39.09, "reward": 0.5206825733184814, "reward_std": 0.05131429433822632, "rewards/VisualizationJSONCombinedORM/mean": 0.5206825733184814, "rewards/VisualizationJSONCombinedORM/std": 0.1321907788515091, "step": 1139, "train_speed(iter/s)": 0.026572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 330.0, "completions/min_length": 256.0, "epoch": 0.9429280397022333, "grad_norm": 0.1686943620443344, "kl": 0.0396728515625, "learning_rate": 9.891064752402091e-08, "loss": 0.0003972463309764862, "memory(GiB)": 39.09, "reward": 0.6121200323104858, "reward_std": 0.06511475890874863, "rewards/VisualizationJSONCombinedORM/mean": 0.6121200323104858, "rewards/VisualizationJSONCombinedORM/std": 0.12583614885807037, "step": 1140, "train_speed(iter/s)": 0.026579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 283.5, "completions/min_length": 214.0, "epoch": 0.9437551695616212, "grad_norm": 0.18701551854610443, "kl": 0.04833984375, "learning_rate": 9.607359798384785e-08, "loss": 0.00048371939919888973, "memory(GiB)": 39.09, "reward": 0.5535567998886108, "reward_std": 0.1409934014081955, "rewards/VisualizationJSONCombinedORM/mean": 0.5535567998886108, "rewards/VisualizationJSONCombinedORM/std": 0.1556701809167862, "step": 1141, "train_speed(iter/s)": 0.026585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 326.125, "completions/min_length": 244.0, "epoch": 0.9445822994210091, "grad_norm": 0.1660514771938324, "kl": 0.0535888671875, "learning_rate": 9.327743547665858e-08, "loss": 0.0005359426140785217, "memory(GiB)": 39.09, "reward": 0.39296430349349976, "reward_std": 0.05271688103675842, "rewards/VisualizationJSONCombinedORM/mean": 0.39296430349349976, "rewards/VisualizationJSONCombinedORM/std": 0.09610361605882645, "step": 1142, "train_speed(iter/s)": 0.026592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 281.125, "completions/min_length": 221.0, "epoch": 0.9454094292803971, "grad_norm": 0.16937096416950226, "kl": 0.0423583984375, "learning_rate": 9.052218331576878e-08, "loss": 0.00042294710874557495, "memory(GiB)": 39.09, "reward": 0.49465417861938477, "reward_std": 0.06107139587402344, "rewards/VisualizationJSONCombinedORM/mean": 0.49465417861938477, "rewards/VisualizationJSONCombinedORM/std": 0.06105971708893776, "step": 1143, "train_speed(iter/s)": 0.026601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 362.4375, "completions/min_length": 268.0, "epoch": 0.946236559139785, "grad_norm": 0.20899978280067444, "kl": 0.05169677734375, "learning_rate": 8.780786447340095e-08, "loss": 0.0005170628428459167, "memory(GiB)": 39.09, "reward": 0.47621506452560425, "reward_std": 0.09504693746566772, "rewards/VisualizationJSONCombinedORM/mean": 0.47621506452560425, "rewards/VisualizationJSONCombinedORM/std": 0.19472815096378326, "step": 1144, "train_speed(iter/s)": 0.026604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 305.125, "completions/min_length": 261.0, "epoch": 0.9470636889991728, "grad_norm": 0.18455609679222107, "kl": 0.0477294921875, "learning_rate": 8.513450158049109e-08, "loss": 0.00047681480646133423, "memory(GiB)": 39.09, "reward": 0.47581422328948975, "reward_std": 0.09192469716072083, "rewards/VisualizationJSONCombinedORM/mean": 0.47581422328948975, "rewards/VisualizationJSONCombinedORM/std": 0.18602220714092255, "step": 1145, "train_speed(iter/s)": 0.026612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 300.9375, "completions/min_length": 221.0, "epoch": 0.9478908188585607, "grad_norm": 0.1804235279560089, "kl": 0.03997802734375, "learning_rate": 8.250211692650001e-08, "loss": 0.00039904937148094177, "memory(GiB)": 39.09, "reward": 0.509920060634613, "reward_std": 0.07547754049301147, "rewards/VisualizationJSONCombinedORM/mean": 0.509920060634613, "rewards/VisualizationJSONCombinedORM/std": 0.1965588927268982, "step": 1146, "train_speed(iter/s)": 0.02662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 327.375, "completions/min_length": 261.0, "epoch": 0.9487179487179487, "grad_norm": 0.17240357398986816, "kl": 0.05780029296875, "learning_rate": 7.991073245922798e-08, "loss": 0.0005772411823272705, "memory(GiB)": 39.09, "reward": 0.5686795711517334, "reward_std": 0.11411263048648834, "rewards/VisualizationJSONCombinedORM/mean": 0.5686795711517334, "rewards/VisualizationJSONCombinedORM/std": 0.1431906670331955, "step": 1147, "train_speed(iter/s)": 0.026627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 289.875, "completions/min_length": 234.0, "epoch": 0.9495450785773366, "grad_norm": 0.19172200560569763, "kl": 0.030120849609375, "learning_rate": 7.736036978463202e-08, "loss": 0.0003012232482433319, "memory(GiB)": 39.09, "reward": 0.5958524942398071, "reward_std": 0.06709443777799606, "rewards/VisualizationJSONCombinedORM/mean": 0.5958524942398071, "rewards/VisualizationJSONCombinedORM/std": 0.1923597902059555, "step": 1148, "train_speed(iter/s)": 0.026635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 289.25, "completions/min_length": 251.0, "epoch": 0.9503722084367245, "grad_norm": 0.17188741266727448, "kl": 0.01983642578125, "learning_rate": 7.485105016664551e-08, "loss": 0.0001983698457479477, "memory(GiB)": 39.09, "reward": 0.5890264511108398, "reward_std": 0.06378138065338135, "rewards/VisualizationJSONCombinedORM/mean": 0.5890264511108398, "rewards/VisualizationJSONCombinedORM/std": 0.1677379459142685, "step": 1149, "train_speed(iter/s)": 0.026644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 306.125, "completions/min_length": 237.0, "epoch": 0.9511993382961125, "grad_norm": 0.18604737520217896, "kl": 0.0540771484375, "learning_rate": 7.238279452700004e-08, "loss": 0.0005412213504314423, "memory(GiB)": 39.09, "reward": 0.49150776863098145, "reward_std": 0.08732588589191437, "rewards/VisualizationJSONCombinedORM/mean": 0.49150776863098145, "rewards/VisualizationJSONCombinedORM/std": 0.09858547151088715, "step": 1150, "train_speed(iter/s)": 0.026652 }, { "epoch": 0.9511993382961125, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 368.4583333333333, "eval_completions/mean_length": 301.4375, "eval_completions/min_length": 251.875, "eval_kl": 0.04803466796875, "eval_loss": 0.000483095645904541, "eval_reward": 0.47862050868570805, "eval_reward_std": 0.0761975454709803, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47862050868570805, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07619754539337009, "eval_runtime": 313.3441, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 302.125, "completions/min_length": 236.0, "epoch": 0.9520264681555004, "grad_norm": 0.17055870592594147, "kl": 0.043121337890625, "learning_rate": 6.995562344505213e-08, "loss": 0.0004322752356529236, "memory(GiB)": 39.09, "reward": 0.5267083644866943, "reward_std": 0.12177172303199768, "rewards/VisualizationJSONCombinedORM/mean": 0.5267083644866943, "rewards/VisualizationJSONCombinedORM/std": 0.14535000920295715, "step": 1151, "train_speed(iter/s)": 0.026468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 278.6875, "completions/min_length": 230.0, "epoch": 0.9528535980148883, "grad_norm": 0.1920817643404007, "kl": 0.03912353515625, "learning_rate": 6.756955715761127e-08, "loss": 0.0003912821412086487, "memory(GiB)": 39.09, "reward": 0.6939691305160522, "reward_std": 0.09114862978458405, "rewards/VisualizationJSONCombinedORM/mean": 0.6939691305160522, "rewards/VisualizationJSONCombinedORM/std": 0.10039181262254715, "step": 1152, "train_speed(iter/s)": 0.026476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 289.25, "completions/min_length": 244.0, "epoch": 0.9536807278742763, "grad_norm": 0.17045147716999054, "kl": 0.05413818359375, "learning_rate": 6.522461555877213e-08, "loss": 0.00054159015417099, "memory(GiB)": 39.09, "reward": 0.5183011293411255, "reward_std": 0.06133204698562622, "rewards/VisualizationJSONCombinedORM/mean": 0.5183011293411255, "rewards/VisualizationJSONCombinedORM/std": 0.24539034068584442, "step": 1153, "train_speed(iter/s)": 0.026487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 276.625, "completions/min_length": 243.0, "epoch": 0.9545078577336642, "grad_norm": 0.1413472294807434, "kl": 0.05633544921875, "learning_rate": 6.292081819974427e-08, "loss": 0.0005633607506752014, "memory(GiB)": 39.09, "reward": 0.5260233879089355, "reward_std": 0.06962771713733673, "rewards/VisualizationJSONCombinedORM/mean": 0.5260233879089355, "rewards/VisualizationJSONCombinedORM/std": 0.249403715133667, "step": 1154, "train_speed(iter/s)": 0.026492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 290.3125, "completions/min_length": 229.0, "epoch": 0.9553349875930521, "grad_norm": 0.1943916529417038, "kl": 0.05755615234375, "learning_rate": 6.065818428869774e-08, "loss": 0.0005759075284004211, "memory(GiB)": 39.09, "reward": 0.5590736269950867, "reward_std": 0.0841013640165329, "rewards/VisualizationJSONCombinedORM/mean": 0.5590736269950867, "rewards/VisualizationJSONCombinedORM/std": 0.1488780975341797, "step": 1155, "train_speed(iter/s)": 0.026502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 352.4375, "completions/min_length": 274.0, "epoch": 0.9561621174524401, "grad_norm": 0.15331488847732544, "kl": 0.05255126953125, "learning_rate": 5.843673269059269e-08, "loss": 0.0005246847867965698, "memory(GiB)": 39.09, "reward": 0.7144826650619507, "reward_std": 0.08299380540847778, "rewards/VisualizationJSONCombinedORM/mean": 0.7144826650619507, "rewards/VisualizationJSONCombinedORM/std": 0.0881037637591362, "step": 1156, "train_speed(iter/s)": 0.02651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 292.5625, "completions/min_length": 239.0, "epoch": 0.956989247311828, "grad_norm": 0.17211604118347168, "kl": 0.07879638671875, "learning_rate": 5.625648192703115e-08, "loss": 0.000786997377872467, "memory(GiB)": 39.09, "reward": 0.5387853384017944, "reward_std": 0.09392016381025314, "rewards/VisualizationJSONCombinedORM/mean": 0.5387853384017944, "rewards/VisualizationJSONCombinedORM/std": 0.18482479453086853, "step": 1157, "train_speed(iter/s)": 0.026516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 295.125, "completions/min_length": 244.0, "epoch": 0.9578163771712159, "grad_norm": 0.20825259387493134, "kl": 0.06256103515625, "learning_rate": 5.411745017609493e-08, "loss": 0.0006265826523303986, "memory(GiB)": 39.09, "reward": 0.4983997941017151, "reward_std": 0.05996386706829071, "rewards/VisualizationJSONCombinedORM/mean": 0.4983997941017151, "rewards/VisualizationJSONCombinedORM/std": 0.21608182787895203, "step": 1158, "train_speed(iter/s)": 0.026523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 283.8125, "completions/min_length": 231.0, "epoch": 0.9586435070306039, "grad_norm": 0.16377131640911102, "kl": 0.033477783203125, "learning_rate": 5.201965527220188e-08, "loss": 0.0003341585397720337, "memory(GiB)": 39.09, "reward": 0.5732654929161072, "reward_std": 0.0940580666065216, "rewards/VisualizationJSONCombinedORM/mean": 0.5732654929161072, "rewards/VisualizationJSONCombinedORM/std": 0.12875224649906158, "step": 1159, "train_speed(iter/s)": 0.026531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 308.3125, "completions/min_length": 228.0, "epoch": 0.9594706368899917, "grad_norm": 0.21171730756759644, "kl": 0.0986328125, "learning_rate": 4.996311470594928e-08, "loss": 0.0009859539568424225, "memory(GiB)": 39.09, "reward": 0.21330028772354126, "reward_std": 0.04089107736945152, "rewards/VisualizationJSONCombinedORM/mean": 0.21330028772354126, "rewards/VisualizationJSONCombinedORM/std": 0.05375593528151512, "step": 1160, "train_speed(iter/s)": 0.026538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 327.5625, "completions/min_length": 266.0, "epoch": 0.9602977667493796, "grad_norm": 0.1808701455593109, "kl": 0.064697265625, "learning_rate": 4.794784562397459e-08, "loss": 0.0006472915410995483, "memory(GiB)": 39.09, "reward": 0.5978226661682129, "reward_std": 0.11364492774009705, "rewards/VisualizationJSONCombinedORM/mean": 0.5978226661682129, "rewards/VisualizationJSONCombinedORM/std": 0.13916127383708954, "step": 1161, "train_speed(iter/s)": 0.026541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 301.25, "completions/min_length": 233.0, "epoch": 0.9611248966087675, "grad_norm": 0.21434436738491058, "kl": 0.02886962890625, "learning_rate": 4.597386482880717e-08, "loss": 0.00028843432664871216, "memory(GiB)": 39.09, "reward": 0.4839518666267395, "reward_std": 0.09713372588157654, "rewards/VisualizationJSONCombinedORM/mean": 0.4839518666267395, "rewards/VisualizationJSONCombinedORM/std": 0.1265886127948761, "step": 1162, "train_speed(iter/s)": 0.026551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 316.0625, "completions/min_length": 243.0, "epoch": 0.9619520264681555, "grad_norm": 0.1515328586101532, "kl": 0.04718017578125, "learning_rate": 4.404118877873176e-08, "loss": 0.0004716217517852783, "memory(GiB)": 39.09, "reward": 0.508454442024231, "reward_std": 0.08159689605236053, "rewards/VisualizationJSONCombinedORM/mean": 0.508454442024231, "rewards/VisualizationJSONCombinedORM/std": 0.17596015334129333, "step": 1163, "train_speed(iter/s)": 0.026557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 303.1875, "completions/min_length": 255.0, "epoch": 0.9627791563275434, "grad_norm": 0.18929484486579895, "kl": 0.025787353515625, "learning_rate": 4.21498335876519e-08, "loss": 0.0002580750733613968, "memory(GiB)": 39.09, "reward": 0.4304672181606293, "reward_std": 0.06582576781511307, "rewards/VisualizationJSONCombinedORM/mean": 0.4304672181606293, "rewards/VisualizationJSONCombinedORM/std": 0.13401861488819122, "step": 1164, "train_speed(iter/s)": 0.026564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 310.9375, "completions/min_length": 240.0, "epoch": 0.9636062861869313, "grad_norm": 0.16690070927143097, "kl": 0.04840087890625, "learning_rate": 4.029981502495117e-08, "loss": 0.0004838239401578903, "memory(GiB)": 39.09, "reward": 0.6080911159515381, "reward_std": 0.0724191889166832, "rewards/VisualizationJSONCombinedORM/mean": 0.6080911159515381, "rewards/VisualizationJSONCombinedORM/std": 0.12282662838697433, "step": 1165, "train_speed(iter/s)": 0.026568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 333.5, "completions/min_length": 255.0, "epoch": 0.9644334160463193, "grad_norm": 0.1687495857477188, "kl": 0.0606689453125, "learning_rate": 3.8491148515366064e-08, "loss": 0.0006059631705284119, "memory(GiB)": 39.09, "reward": 0.6153518557548523, "reward_std": 0.06542611122131348, "rewards/VisualizationJSONCombinedORM/mean": 0.6153518557548523, "rewards/VisualizationJSONCombinedORM/std": 0.09444841742515564, "step": 1166, "train_speed(iter/s)": 0.026575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 286.0625, "completions/min_length": 223.0, "epoch": 0.9652605459057072, "grad_norm": 0.1681906133890152, "kl": 0.03778076171875, "learning_rate": 3.672384913885441e-08, "loss": 0.0003773123025894165, "memory(GiB)": 39.09, "reward": 0.4166259765625, "reward_std": 0.08598978817462921, "rewards/VisualizationJSONCombinedORM/mean": 0.4166259765625, "rewards/VisualizationJSONCombinedORM/std": 0.0902547836303711, "step": 1167, "train_speed(iter/s)": 0.026583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 293.0, "completions/min_length": 222.0, "epoch": 0.9660876757650951, "grad_norm": 0.1917201727628708, "kl": 0.08721923828125, "learning_rate": 3.499793163047327e-08, "loss": 0.00086955726146698, "memory(GiB)": 39.09, "reward": 0.4248964786529541, "reward_std": 0.0994323343038559, "rewards/VisualizationJSONCombinedORM/mean": 0.4248964786529541, "rewards/VisualizationJSONCombinedORM/std": 0.13365477323532104, "step": 1168, "train_speed(iter/s)": 0.026587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 292.25, "completions/min_length": 214.0, "epoch": 0.9669148056244831, "grad_norm": 0.17213056981563568, "kl": 0.05902099609375, "learning_rate": 3.3313410380250157e-08, "loss": 0.000591132789850235, "memory(GiB)": 39.09, "reward": 0.38520005345344543, "reward_std": 0.07219887524843216, "rewards/VisualizationJSONCombinedORM/mean": 0.38520005345344543, "rewards/VisualizationJSONCombinedORM/std": 0.10313050448894501, "step": 1169, "train_speed(iter/s)": 0.026591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 323.0, "completions/min_length": 244.0, "epoch": 0.967741935483871, "grad_norm": 0.18749351799488068, "kl": 0.039794921875, "learning_rate": 3.1670299433070315e-08, "loss": 0.0003986246883869171, "memory(GiB)": 39.09, "reward": 0.31046193838119507, "reward_std": 0.03828295320272446, "rewards/VisualizationJSONCombinedORM/mean": 0.31046193838119507, "rewards/VisualizationJSONCombinedORM/std": 0.13119645416736603, "step": 1170, "train_speed(iter/s)": 0.026598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 310.3125, "completions/min_length": 223.0, "epoch": 0.9685690653432589, "grad_norm": 0.17513172328472137, "kl": 0.024383544921875, "learning_rate": 3.0068612488554084e-08, "loss": 0.00024395063519477844, "memory(GiB)": 39.09, "reward": 0.676017701625824, "reward_std": 0.06298500299453735, "rewards/VisualizationJSONCombinedORM/mean": 0.676017701625824, "rewards/VisualizationJSONCombinedORM/std": 0.10988696664571762, "step": 1171, "train_speed(iter/s)": 0.026604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 277.875, "completions/min_length": 221.0, "epoch": 0.9693961952026469, "grad_norm": 0.17163047194480896, "kl": 0.033935546875, "learning_rate": 2.850836290094472e-08, "loss": 0.00033955276012420654, "memory(GiB)": 39.09, "reward": 0.5866557359695435, "reward_std": 0.08130967617034912, "rewards/VisualizationJSONCombinedORM/mean": 0.5866557359695435, "rewards/VisualizationJSONCombinedORM/std": 0.1221882700920105, "step": 1172, "train_speed(iter/s)": 0.026611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 302.0, "completions/min_length": 243.0, "epoch": 0.9702233250620348, "grad_norm": 0.19577248394489288, "kl": 0.0675048828125, "learning_rate": 2.6989563678996856e-08, "loss": 0.0006757676601409912, "memory(GiB)": 39.09, "reward": 0.4674667418003082, "reward_std": 0.07422579079866409, "rewards/VisualizationJSONCombinedORM/mean": 0.4674667418003082, "rewards/VisualizationJSONCombinedORM/std": 0.2037348747253418, "step": 1173, "train_speed(iter/s)": 0.026618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 295.25, "completions/min_length": 226.0, "epoch": 0.9710504549214226, "grad_norm": 0.1984405666589737, "kl": 0.04315185546875, "learning_rate": 2.551222748586879e-08, "loss": 0.0004316195845603943, "memory(GiB)": 39.09, "reward": 0.407488614320755, "reward_std": 0.0594601035118103, "rewards/VisualizationJSONCombinedORM/mean": 0.407488614320755, "rewards/VisualizationJSONCombinedORM/std": 0.06542882323265076, "step": 1174, "train_speed(iter/s)": 0.026627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 290.6875, "completions/min_length": 231.0, "epoch": 0.9718775847808105, "grad_norm": 0.15550868213176727, "kl": 0.04241943359375, "learning_rate": 2.4076366639015914e-08, "loss": 0.0004245936870574951, "memory(GiB)": 39.09, "reward": 0.5966784954071045, "reward_std": 0.0989069789648056, "rewards/VisualizationJSONCombinedORM/mean": 0.5966784954071045, "rewards/VisualizationJSONCombinedORM/std": 0.09945444017648697, "step": 1175, "train_speed(iter/s)": 0.026634 }, { "epoch": 0.9718775847808105, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 356.5833333333333, "eval_completions/mean_length": 307.0520833333333, "eval_completions/min_length": 258.5, "eval_kl": 0.045338948567708336, "eval_loss": 0.0004550851881504059, "eval_reward": 0.46046621600786847, "eval_reward_std": 0.0746413441374898, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46046621600786847, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07464134483598173, "eval_runtime": 306.9186, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 332.0625, "completions/min_length": 245.0, "epoch": 0.9727047146401985, "grad_norm": 0.17300456762313843, "kl": 0.03863525390625, "learning_rate": 2.26819931100869e-08, "loss": 0.00038648396730422974, "memory(GiB)": 39.09, "reward": 0.5068081021308899, "reward_std": 0.06282594799995422, "rewards/VisualizationJSONCombinedORM/mean": 0.5068081021308899, "rewards/VisualizationJSONCombinedORM/std": 0.2378080189228058, "step": 1176, "train_speed(iter/s)": 0.026456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 289.75, "completions/min_length": 222.0, "epoch": 0.9735318444995864, "grad_norm": 0.17088207602500916, "kl": 0.04864501953125, "learning_rate": 2.1329118524827662e-08, "loss": 0.00048614293336868286, "memory(GiB)": 39.09, "reward": 0.4669122099876404, "reward_std": 0.06785491108894348, "rewards/VisualizationJSONCombinedORM/mean": 0.4669122099876404, "rewards/VisualizationJSONCombinedORM/std": 0.09483011066913605, "step": 1177, "train_speed(iter/s)": 0.026463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 291.1875, "completions/min_length": 240.0, "epoch": 0.9743589743589743, "grad_norm": 0.17830099165439606, "kl": 0.04437255859375, "learning_rate": 2.0017754162979795e-08, "loss": 0.0004441607743501663, "memory(GiB)": 39.09, "reward": 0.4972958564758301, "reward_std": 0.09301650524139404, "rewards/VisualizationJSONCombinedORM/mean": 0.4972958564758301, "rewards/VisualizationJSONCombinedORM/std": 0.17078949511051178, "step": 1178, "train_speed(iter/s)": 0.02647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 336.4375, "completions/min_length": 246.0, "epoch": 0.9751861042183623, "grad_norm": 0.16441117227077484, "kl": 0.04229736328125, "learning_rate": 1.8747910958191173e-08, "loss": 0.0004227869212627411, "memory(GiB)": 39.09, "reward": 0.33584490418434143, "reward_std": 0.04384097084403038, "rewards/VisualizationJSONCombinedORM/mean": 0.33584490418434143, "rewards/VisualizationJSONCombinedORM/std": 0.0982423648238182, "step": 1179, "train_speed(iter/s)": 0.026478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 285.0625, "completions/min_length": 220.0, "epoch": 0.9760132340777502, "grad_norm": 0.16119354963302612, "kl": 0.02734375, "learning_rate": 1.7519599497919926e-08, "loss": 0.0002736300230026245, "memory(GiB)": 39.09, "reward": 0.5232000350952148, "reward_std": 0.058045394718647, "rewards/VisualizationJSONCombinedORM/mean": 0.5232000350952148, "rewards/VisualizationJSONCombinedORM/std": 0.23075462877750397, "step": 1180, "train_speed(iter/s)": 0.026484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 280.5, "completions/min_length": 227.0, "epoch": 0.9768403639371381, "grad_norm": 0.1441352814435959, "kl": 0.029754638671875, "learning_rate": 1.6332830023350065e-08, "loss": 0.00029686838388442993, "memory(GiB)": 39.09, "reward": 0.6781048774719238, "reward_std": 0.10309968888759613, "rewards/VisualizationJSONCombinedORM/mean": 0.6781048774719238, "rewards/VisualizationJSONCombinedORM/std": 0.10031301528215408, "step": 1181, "train_speed(iter/s)": 0.026494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 335.3125, "completions/min_length": 207.0, "epoch": 0.9776674937965261, "grad_norm": 0.18730178475379944, "kl": 0.0369873046875, "learning_rate": 1.5187612429304887e-08, "loss": 0.0003699474036693573, "memory(GiB)": 39.09, "reward": 0.6006748676300049, "reward_std": 0.11656764149665833, "rewards/VisualizationJSONCombinedORM/mean": 0.6006748676300049, "rewards/VisualizationJSONCombinedORM/std": 0.11679302901029587, "step": 1182, "train_speed(iter/s)": 0.026499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 318.125, "completions/min_length": 236.0, "epoch": 0.978494623655914, "grad_norm": 0.17617572844028473, "kl": 0.0556640625, "learning_rate": 1.408395626416259e-08, "loss": 0.0005572885274887085, "memory(GiB)": 39.09, "reward": 0.36818671226501465, "reward_std": 0.049347322434186935, "rewards/VisualizationJSONCombinedORM/mean": 0.36818671226501465, "rewards/VisualizationJSONCombinedORM/std": 0.13941460847854614, "step": 1183, "train_speed(iter/s)": 0.026504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 307.5625, "completions/min_length": 266.0, "epoch": 0.9793217535153019, "grad_norm": 0.1494099348783493, "kl": 0.0303955078125, "learning_rate": 1.3021870729780783e-08, "loss": 0.0003048628568649292, "memory(GiB)": 39.09, "reward": 0.6112414002418518, "reward_std": 0.05107954144477844, "rewards/VisualizationJSONCombinedORM/mean": 0.6112414002418518, "rewards/VisualizationJSONCombinedORM/std": 0.135055810213089, "step": 1184, "train_speed(iter/s)": 0.026512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 313.8125, "completions/min_length": 225.0, "epoch": 0.9801488833746899, "grad_norm": 0.17591172456741333, "kl": 0.04656982421875, "learning_rate": 1.200136468141544e-08, "loss": 0.0004655607044696808, "memory(GiB)": 39.09, "reward": 0.4490869641304016, "reward_std": 0.0747448280453682, "rewards/VisualizationJSONCombinedORM/mean": 0.4490869641304016, "rewards/VisualizationJSONCombinedORM/std": 0.2813378572463989, "step": 1185, "train_speed(iter/s)": 0.026516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 312.875, "completions/min_length": 246.0, "epoch": 0.9809760132340778, "grad_norm": 0.18016530573368073, "kl": 0.06109619140625, "learning_rate": 1.1022446627649286e-08, "loss": 0.0006093690171837807, "memory(GiB)": 39.09, "reward": 0.5579695105552673, "reward_std": 0.12244752049446106, "rewards/VisualizationJSONCombinedORM/mean": 0.5579695105552673, "rewards/VisualizationJSONCombinedORM/std": 0.141649067401886, "step": 1186, "train_speed(iter/s)": 0.026521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 307.75, "completions/min_length": 266.0, "epoch": 0.9818031430934657, "grad_norm": 0.15875346958637238, "kl": 0.053466796875, "learning_rate": 1.008512473032075e-08, "loss": 0.0005329754203557968, "memory(GiB)": 39.09, "reward": 0.5095296502113342, "reward_std": 0.05649182200431824, "rewards/VisualizationJSONCombinedORM/mean": 0.5095296502113342, "rewards/VisualizationJSONCombinedORM/std": 0.23700588941574097, "step": 1187, "train_speed(iter/s)": 0.02653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 305.5625, "completions/min_length": 230.0, "epoch": 0.9826302729528535, "grad_norm": 0.15611036121845245, "kl": 0.04962158203125, "learning_rate": 9.18940680445568e-09, "loss": 0.000497184693813324, "memory(GiB)": 39.09, "reward": 0.5268702507019043, "reward_std": 0.05428021773695946, "rewards/VisualizationJSONCombinedORM/mean": 0.5268702507019043, "rewards/VisualizationJSONCombinedORM/std": 0.17301474511623383, "step": 1188, "train_speed(iter/s)": 0.026539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 288.375, "completions/min_length": 244.0, "epoch": 0.9834574028122415, "grad_norm": 0.2170248031616211, "kl": 0.067138671875, "learning_rate": 8.335300318201844e-09, "loss": 0.0006716717034578323, "memory(GiB)": 39.09, "reward": 0.4673580229282379, "reward_std": 0.08157029747962952, "rewards/VisualizationJSONCombinedORM/mean": 0.4673580229282379, "rewards/VisualizationJSONCombinedORM/std": 0.1790326088666916, "step": 1189, "train_speed(iter/s)": 0.026546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 295.5625, "completions/min_length": 240.0, "epoch": 0.9842845326716294, "grad_norm": 0.15908139944076538, "kl": 0.02911376953125, "learning_rate": 7.52281239276842e-09, "loss": 0.00029119476675987244, "memory(GiB)": 39.09, "reward": 0.4860662817955017, "reward_std": 0.06602402031421661, "rewards/VisualizationJSONCombinedORM/mean": 0.4860662817955017, "rewards/VisualizationJSONCombinedORM/std": 0.08240801095962524, "step": 1190, "train_speed(iter/s)": 0.026551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 326.0625, "completions/min_length": 263.0, "epoch": 0.9851116625310173, "grad_norm": 0.18134982883930206, "kl": 0.05767822265625, "learning_rate": 6.751949802362712e-09, "loss": 0.0005759596824645996, "memory(GiB)": 39.09, "reward": 0.38231492042541504, "reward_std": 0.059397611767053604, "rewards/VisualizationJSONCombinedORM/mean": 0.38231492042541504, "rewards/VisualizationJSONCombinedORM/std": 0.14776405692100525, "step": 1191, "train_speed(iter/s)": 0.026556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 292.1875, "completions/min_length": 216.0, "epoch": 0.9859387923904053, "grad_norm": 0.211844339966774, "kl": 0.04522705078125, "learning_rate": 6.022718974137976e-09, "loss": 0.00045242905616760254, "memory(GiB)": 39.09, "reward": 0.4509422481060028, "reward_std": 0.08534989506006241, "rewards/VisualizationJSONCombinedORM/mean": 0.4509422481060028, "rewards/VisualizationJSONCombinedORM/std": 0.19878199696540833, "step": 1192, "train_speed(iter/s)": 0.026564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 300.4375, "completions/min_length": 250.0, "epoch": 0.9867659222497932, "grad_norm": 0.1631178855895996, "kl": 0.033203125, "learning_rate": 5.3351259881379016e-09, "loss": 0.0003323182463645935, "memory(GiB)": 39.09, "reward": 0.5569709539413452, "reward_std": 0.08437693864107132, "rewards/VisualizationJSONCombinedORM/mean": 0.5569709539413452, "rewards/VisualizationJSONCombinedORM/std": 0.2370266616344452, "step": 1193, "train_speed(iter/s)": 0.026571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 318.5, "completions/min_length": 259.0, "epoch": 0.9875930521091811, "grad_norm": 0.154366597533226, "kl": 0.045166015625, "learning_rate": 4.689176577244992e-09, "loss": 0.0004521384835243225, "memory(GiB)": 39.09, "reward": 0.6200728416442871, "reward_std": 0.06991395354270935, "rewards/VisualizationJSONCombinedORM/mean": 0.6200728416442871, "rewards/VisualizationJSONCombinedORM/std": 0.06913130730390549, "step": 1194, "train_speed(iter/s)": 0.026577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 319.875, "completions/min_length": 247.0, "epoch": 0.9884201819685691, "grad_norm": 0.1749085634946823, "kl": 0.0938720703125, "learning_rate": 4.0848761271350405e-09, "loss": 0.00093797966837883, "memory(GiB)": 39.09, "reward": 0.6524894833564758, "reward_std": 0.10739591717720032, "rewards/VisualizationJSONCombinedORM/mean": 0.6524894833564758, "rewards/VisualizationJSONCombinedORM/std": 0.12273546308279037, "step": 1195, "train_speed(iter/s)": 0.026582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 277.1875, "completions/min_length": 215.0, "epoch": 0.989247311827957, "grad_norm": 0.17269426584243774, "kl": 0.0433349609375, "learning_rate": 3.522229676229949e-09, "loss": 0.000432819128036499, "memory(GiB)": 39.09, "reward": 0.5394101142883301, "reward_std": 0.0745588093996048, "rewards/VisualizationJSONCombinedORM/mean": 0.5394101142883301, "rewards/VisualizationJSONCombinedORM/std": 0.09872419387102127, "step": 1196, "train_speed(iter/s)": 0.026587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 284.4375, "completions/min_length": 239.0, "epoch": 0.9900744416873449, "grad_norm": 0.1709703952074051, "kl": 0.0286865234375, "learning_rate": 3.0012419156572047e-09, "loss": 0.00028651952743530273, "memory(GiB)": 39.09, "reward": 0.5650023221969604, "reward_std": 0.05276075750589371, "rewards/VisualizationJSONCombinedORM/mean": 0.5650023221969604, "rewards/VisualizationJSONCombinedORM/std": 0.22179490327835083, "step": 1197, "train_speed(iter/s)": 0.026594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 355.375, "completions/min_length": 281.0, "epoch": 0.9909015715467329, "grad_norm": 0.17564953863620758, "kl": 0.0595703125, "learning_rate": 2.5219171892110207e-09, "loss": 0.0005935803055763245, "memory(GiB)": 39.09, "reward": 0.36485105752944946, "reward_std": 0.05633539706468582, "rewards/VisualizationJSONCombinedORM/mean": 0.36485105752944946, "rewards/VisualizationJSONCombinedORM/std": 0.13545724749565125, "step": 1198, "train_speed(iter/s)": 0.026597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 292.125, "completions/min_length": 241.0, "epoch": 0.9917287014061208, "grad_norm": 0.20432402193546295, "kl": 0.040283203125, "learning_rate": 2.0842594933140338e-09, "loss": 0.00040296465158462524, "memory(GiB)": 39.09, "reward": 0.6934071779251099, "reward_std": 0.11299766600131989, "rewards/VisualizationJSONCombinedORM/mean": 0.6934071779251099, "rewards/VisualizationJSONCombinedORM/std": 0.11017844080924988, "step": 1199, "train_speed(iter/s)": 0.026603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 307.5, "completions/min_length": 231.0, "epoch": 0.9925558312655087, "grad_norm": 0.17111170291900635, "kl": 0.0479736328125, "learning_rate": 1.688272476986219e-09, "loss": 0.0004793331027030945, "memory(GiB)": 39.09, "reward": 0.5969382524490356, "reward_std": 0.12606078386306763, "rewards/VisualizationJSONCombinedORM/mean": 0.5969382524490356, "rewards/VisualizationJSONCombinedORM/std": 0.12803034484386444, "step": 1200, "train_speed(iter/s)": 0.02661 }, { "epoch": 0.9925558312655087, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 356.7916666666667, "eval_completions/mean_length": 303.2083333333333, "eval_completions/min_length": 257.0, "eval_kl": 0.045995076497395836, "eval_loss": 0.00046126171946525574, "eval_reward": 0.4788203357408444, "eval_reward_std": 0.06871241525126, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4788203357408444, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06871241886013497, "eval_runtime": 307.1325, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 284.625, "completions/min_length": 231.0, "epoch": 0.9933829611248967, "grad_norm": 0.15481118857860565, "kl": 0.03143310546875, "learning_rate": 1.3339594418138036e-09, "loss": 0.0003135334700345993, "memory(GiB)": 39.09, "reward": 0.3328458368778229, "reward_std": 0.03186637908220291, "rewards/VisualizationJSONCombinedORM/mean": 0.3328458368778229, "rewards/VisualizationJSONCombinedORM/std": 0.03786103054881096, "step": 1201, "train_speed(iter/s)": 0.026439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 312.8125, "completions/min_length": 250.0, "epoch": 0.9942100909842845, "grad_norm": 0.18094168603420258, "kl": 0.0286865234375, "learning_rate": 1.0213233419203994e-09, "loss": 0.000286836177110672, "memory(GiB)": 39.09, "reward": 0.6400636434555054, "reward_std": 0.09700540453195572, "rewards/VisualizationJSONCombinedORM/mean": 0.6400636434555054, "rewards/VisualizationJSONCombinedORM/std": 0.13403229415416718, "step": 1202, "train_speed(iter/s)": 0.026445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 293.8125, "completions/min_length": 215.0, "epoch": 0.9950372208436724, "grad_norm": 0.20013684034347534, "kl": 0.0631103515625, "learning_rate": 7.503667839453555e-10, "loss": 0.0006314627826213837, "memory(GiB)": 39.09, "reward": 0.3517315983772278, "reward_std": 0.05238666385412216, "rewards/VisualizationJSONCombinedORM/mean": 0.3517315983772278, "rewards/VisualizationJSONCombinedORM/std": 0.07725053280591965, "step": 1203, "train_speed(iter/s)": 0.026454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 308.3125, "completions/min_length": 248.0, "epoch": 0.9958643507030603, "grad_norm": 0.15273621678352356, "kl": 0.03656005859375, "learning_rate": 5.210920270187769e-10, "loss": 0.0003665126860141754, "memory(GiB)": 39.09, "reward": 0.7821630239486694, "reward_std": 0.1086881160736084, "rewards/VisualizationJSONCombinedORM/mean": 0.7821630239486694, "rewards/VisualizationJSONCombinedORM/std": 0.11756663769483566, "step": 1204, "train_speed(iter/s)": 0.026461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 296.1875, "completions/min_length": 236.0, "epoch": 0.9966914805624483, "grad_norm": 0.14672057330608368, "kl": 0.08087158203125, "learning_rate": 3.335009827437619e-10, "loss": 0.0008085351437330246, "memory(GiB)": 39.09, "reward": 0.5872822999954224, "reward_std": 0.07647685706615448, "rewards/VisualizationJSONCombinedORM/mean": 0.5872822999954224, "rewards/VisualizationJSONCombinedORM/std": 0.29687514901161194, "step": 1205, "train_speed(iter/s)": 0.026469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 300.4375, "completions/min_length": 250.0, "epoch": 0.9975186104218362, "grad_norm": 0.1717642843723297, "kl": 0.0654296875, "learning_rate": 1.8759521518307845e-10, "loss": 0.0006547793745994568, "memory(GiB)": 39.09, "reward": 0.5031977891921997, "reward_std": 0.06845414638519287, "rewards/VisualizationJSONCombinedORM/mean": 0.5031977891921997, "rewards/VisualizationJSONCombinedORM/std": 0.2066461145877838, "step": 1206, "train_speed(iter/s)": 0.026476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 272.875, "completions/min_length": 204.0, "epoch": 0.9983457402812241, "grad_norm": 0.17114879190921783, "kl": 0.02398681640625, "learning_rate": 8.337594084084633e-11, "loss": 0.00023970752954483032, "memory(GiB)": 39.09, "reward": 0.22596418857574463, "reward_std": 0.018089644610881805, "rewards/VisualizationJSONCombinedORM/mean": 0.22596418857574463, "rewards/VisualizationJSONCombinedORM/std": 0.05652984604239464, "step": 1207, "train_speed(iter/s)": 0.026484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 322.0, "completions/min_length": 270.0, "epoch": 0.9991728701406121, "grad_norm": 0.16362051665782928, "kl": 0.0625, "learning_rate": 2.084402865754065e-11, "loss": 0.0006237868219614029, "memory(GiB)": 39.09, "reward": 0.613468587398529, "reward_std": 0.12067510187625885, "rewards/VisualizationJSONCombinedORM/mean": 0.613468587398529, "rewards/VisualizationJSONCombinedORM/std": 0.14074929058551788, "step": 1208, "train_speed(iter/s)": 0.026493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 321.0, "completions/min_length": 258.0, "epoch": 1.0, "grad_norm": 0.2152615487575531, "kl": 0.0494384765625, "learning_rate": 0.0, "loss": 0.0004954859614372253, "memory(GiB)": 39.09, "reward": 0.490122526884079, "reward_std": 0.10128406435251236, "rewards/VisualizationJSONCombinedORM/mean": 0.490122526884079, "rewards/VisualizationJSONCombinedORM/std": 0.19171246886253357, "step": 1209, "train_speed(iter/s)": 0.026499 }, { "epoch": 1.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 358.0, "eval_completions/mean_length": 305.1770833333333, "eval_completions/min_length": 253.83333333333334, "eval_kl": 0.0477752685546875, "eval_loss": 0.000482252478832379, "eval_reward": 0.47262550704181194, "eval_reward_std": 0.07642011670395732, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47262550704181194, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07642011961434036, "eval_runtime": 307.6216, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 302.875, "completions/min_length": 252.0, "epoch": 1.0008271298593878, "grad_norm": 0.17715801298618317, "kl": 0.0732421875, "learning_rate": 5.861921006795522e-06, "loss": 0.000731639564037323, "memory(GiB)": 36.66, "reward": 0.4135704040527344, "reward_std": 0.07117979973554611, "rewards/VisualizationJSONCombinedORM/mean": 0.4135704040527344, "rewards/VisualizationJSONCombinedORM/std": 0.1194431260228157, "step": 1210, "train_speed(iter/s)": 5.544208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 268.0625, "completions/min_length": 224.0, "epoch": 1.0016542597187759, "grad_norm": 0.1691797971725464, "kl": 0.053497314453125, "learning_rate": 5.8548094438015065e-06, "loss": 0.0005338964983820915, "memory(GiB)": 37.19, "reward": 0.45723938941955566, "reward_std": 0.06486387550830841, "rewards/VisualizationJSONCombinedORM/mean": 0.45723938941955566, "rewards/VisualizationJSONCombinedORM/std": 0.09616053849458694, "step": 1211, "train_speed(iter/s)": 5.013184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 303.25, "completions/min_length": 259.0, "epoch": 1.0024813895781637, "grad_norm": 0.1565793752670288, "kl": 0.024169921875, "learning_rate": 5.8476960990393085e-06, "loss": 0.00024194270372390747, "memory(GiB)": 37.41, "reward": 0.48957186937332153, "reward_std": 0.04609108716249466, "rewards/VisualizationJSONCombinedORM/mean": 0.48957186937332153, "rewards/VisualizationJSONCombinedORM/std": 0.24382509291172028, "step": 1212, "train_speed(iter/s)": 4.547421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 290.3125, "completions/min_length": 246.0, "epoch": 1.0033085194375517, "grad_norm": 0.15844881534576416, "kl": 0.04644775390625, "learning_rate": 5.840580987336013e-06, "loss": 0.00046313926577568054, "memory(GiB)": 37.41, "reward": 0.6117954850196838, "reward_std": 0.08096249401569366, "rewards/VisualizationJSONCombinedORM/mean": 0.6117954850196838, "rewards/VisualizationJSONCombinedORM/std": 0.08230149000883102, "step": 1213, "train_speed(iter/s)": 4.170285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 307.25, "completions/min_length": 266.0, "epoch": 1.0041356492969395, "grad_norm": 0.16899432241916656, "kl": 0.04791259765625, "learning_rate": 5.833464123522384e-06, "loss": 0.00047802552580833435, "memory(GiB)": 37.41, "reward": 0.5180898904800415, "reward_std": 0.06980901211500168, "rewards/VisualizationJSONCombinedORM/mean": 0.5180898904800415, "rewards/VisualizationJSONCombinedORM/std": 0.10822641849517822, "step": 1214, "train_speed(iter/s)": 3.861599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 294.75, "completions/min_length": 261.0, "epoch": 1.0049627791563276, "grad_norm": 0.2119135856628418, "kl": 0.071044921875, "learning_rate": 5.826345522432843e-06, "loss": 0.0007089227437973022, "memory(GiB)": 37.41, "reward": 0.37049126625061035, "reward_std": 0.09577213227748871, "rewards/VisualizationJSONCombinedORM/mean": 0.37049126625061035, "rewards/VisualizationJSONCombinedORM/std": 0.09693054109811783, "step": 1215, "train_speed(iter/s)": 3.623311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 275.5625, "completions/min_length": 226.0, "epoch": 1.0057899090157154, "grad_norm": 0.15346674621105194, "kl": 0.07025146484375, "learning_rate": 5.819225198905429e-06, "loss": 0.00070161372423172, "memory(GiB)": 37.41, "reward": 0.717415452003479, "reward_std": 0.09435096383094788, "rewards/VisualizationJSONCombinedORM/mean": 0.717415452003479, "rewards/VisualizationJSONCombinedORM/std": 0.09185086190700531, "step": 1216, "train_speed(iter/s)": 3.385501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 285.75, "completions/min_length": 243.0, "epoch": 1.0066170388751035, "grad_norm": 0.18639419972896576, "kl": 0.04638671875, "learning_rate": 5.812103167781773e-06, "loss": 0.00046489015221595764, "memory(GiB)": 37.41, "reward": 0.3285345435142517, "reward_std": 0.05274650454521179, "rewards/VisualizationJSONCombinedORM/mean": 0.3285345435142517, "rewards/VisualizationJSONCombinedORM/std": 0.0847339779138565, "step": 1217, "train_speed(iter/s)": 3.196773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 300.125, "completions/min_length": 244.0, "epoch": 1.0074441687344913, "grad_norm": 0.21752935647964478, "kl": 0.0516357421875, "learning_rate": 5.804979443907065e-06, "loss": 0.0005157962441444397, "memory(GiB)": 37.62, "reward": 0.5438832640647888, "reward_std": 0.12144842743873596, "rewards/VisualizationJSONCombinedORM/mean": 0.5438832640647888, "rewards/VisualizationJSONCombinedORM/std": 0.1837529093027115, "step": 1218, "train_speed(iter/s)": 2.982386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 300.6875, "completions/min_length": 255.0, "epoch": 1.0082712985938793, "grad_norm": 0.18380390107631683, "kl": 0.0340576171875, "learning_rate": 5.797854042130022e-06, "loss": 0.0003406926989555359, "memory(GiB)": 37.62, "reward": 0.406447172164917, "reward_std": 0.08265960216522217, "rewards/VisualizationJSONCombinedORM/mean": 0.406447172164917, "rewards/VisualizationJSONCombinedORM/std": 0.08104000240564346, "step": 1219, "train_speed(iter/s)": 2.834168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 299.0, "completions/min_length": 223.0, "epoch": 1.0090984284532671, "grad_norm": 0.18626560270786285, "kl": 0.1083984375, "learning_rate": 5.790726977302862e-06, "loss": 0.0010832473635673523, "memory(GiB)": 37.62, "reward": 0.4829208254814148, "reward_std": 0.08672874420881271, "rewards/VisualizationJSONCombinedORM/mean": 0.4829208254814148, "rewards/VisualizationJSONCombinedORM/std": 0.17059831321239471, "step": 1220, "train_speed(iter/s)": 2.699622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 293.6875, "completions/min_length": 240.0, "epoch": 1.0099255583126552, "grad_norm": 0.17298603057861328, "kl": 0.0628662109375, "learning_rate": 5.7835982642812645e-06, "loss": 0.0006272792816162109, "memory(GiB)": 37.62, "reward": 0.36994558572769165, "reward_std": 0.05705097317695618, "rewards/VisualizationJSONCombinedORM/mean": 0.36994558572769165, "rewards/VisualizationJSONCombinedORM/std": 0.06641466170549393, "step": 1221, "train_speed(iter/s)": 2.545054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 289.625, "completions/min_length": 247.0, "epoch": 1.010752688172043, "grad_norm": 0.1784573346376419, "kl": 0.08013916015625, "learning_rate": 5.7764679179243485e-06, "loss": 0.0008014645427465439, "memory(GiB)": 37.62, "reward": 0.38922804594039917, "reward_std": 0.0631697028875351, "rewards/VisualizationJSONCombinedORM/mean": 0.38922804594039917, "rewards/VisualizationJSONCombinedORM/std": 0.08841218799352646, "step": 1222, "train_speed(iter/s)": 2.424884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 294.375, "completions/min_length": 241.0, "epoch": 1.011579818031431, "grad_norm": 0.2574887275695801, "kl": 0.042236328125, "learning_rate": 5.769335953094636e-06, "loss": 0.00042228400707244873, "memory(GiB)": 37.62, "reward": 0.5360121726989746, "reward_std": 0.06875459104776382, "rewards/VisualizationJSONCombinedORM/mean": 0.5360121726989746, "rewards/VisualizationJSONCombinedORM/std": 0.15048207342624664, "step": 1223, "train_speed(iter/s)": 2.31247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 301.125, "completions/min_length": 235.0, "epoch": 1.0124069478908189, "grad_norm": 0.15957842767238617, "kl": 0.05230712890625, "learning_rate": 5.762202384658021e-06, "loss": 0.0005225725471973419, "memory(GiB)": 37.62, "reward": 0.6143765449523926, "reward_std": 0.11283661425113678, "rewards/VisualizationJSONCombinedORM/mean": 0.6143765449523926, "rewards/VisualizationJSONCombinedORM/std": 0.1264781504869461, "step": 1224, "train_speed(iter/s)": 2.208031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 268.5, "completions/min_length": 247.0, "epoch": 1.0132340777502067, "grad_norm": 0.20855611562728882, "kl": 0.057861328125, "learning_rate": 5.7550672274837464e-06, "loss": 0.0005785524845123291, "memory(GiB)": 37.92, "reward": 0.6148176789283752, "reward_std": 0.10854832828044891, "rewards/VisualizationJSONCombinedORM/mean": 0.6148176789283752, "rewards/VisualizationJSONCombinedORM/std": 0.1065981388092041, "step": 1225, "train_speed(iter/s)": 2.098039 }, { "epoch": 1.0132340777502067, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 365.25, "eval_completions/mean_length": 311.4010416666667, "eval_completions/min_length": 265.4583333333333, "eval_kl": 0.0638275146484375, "eval_loss": 0.0006397739052772522, "eval_reward": 0.4538269353409608, "eval_reward_std": 0.07011209133391579, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4538269353409608, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07011209280850987, "eval_runtime": 311.827, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 304.6875, "completions/min_length": 241.0, "epoch": 1.0140612076095947, "grad_norm": 0.16462989151477814, "kl": 0.05731201171875, "learning_rate": 5.747930496444356e-06, "loss": 0.000572502613067627, "memory(GiB)": 37.92, "reward": 0.6325632333755493, "reward_std": 0.10780994594097137, "rewards/VisualizationJSONCombinedORM/mean": 0.6325632333755493, "rewards/VisualizationJSONCombinedORM/std": 0.11371516436338425, "step": 1226, "train_speed(iter/s)": 1.329291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 294.0, "completions/min_length": 225.0, "epoch": 1.0148883374689825, "grad_norm": 0.21175897121429443, "kl": 0.1207275390625, "learning_rate": 5.740792206415685e-06, "loss": 0.0012058988213539124, "memory(GiB)": 37.92, "reward": 0.655917227268219, "reward_std": 0.08840613812208176, "rewards/VisualizationJSONCombinedORM/mean": 0.655917227268219, "rewards/VisualizationJSONCombinedORM/std": 0.1570747345685959, "step": 1227, "train_speed(iter/s)": 1.297559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 294.0625, "completions/min_length": 245.0, "epoch": 1.0157154673283706, "grad_norm": 0.156963512301445, "kl": 0.07220458984375, "learning_rate": 5.733652372276809e-06, "loss": 0.0007240027189254761, "memory(GiB)": 37.92, "reward": 0.7806665897369385, "reward_std": 0.06575147062540054, "rewards/VisualizationJSONCombinedORM/mean": 0.7806665897369385, "rewards/VisualizationJSONCombinedORM/std": 0.07040601223707199, "step": 1228, "train_speed(iter/s)": 1.262579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 313.875, "completions/min_length": 229.0, "epoch": 1.0165425971877584, "grad_norm": 0.19776462018489838, "kl": 0.04718017578125, "learning_rate": 5.7265110089100304e-06, "loss": 0.00047085434198379517, "memory(GiB)": 37.92, "reward": 0.5700386166572571, "reward_std": 0.09698891639709473, "rewards/VisualizationJSONCombinedORM/mean": 0.5700386166572571, "rewards/VisualizationJSONCombinedORM/std": 0.18816564977169037, "step": 1229, "train_speed(iter/s)": 1.22939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 302.4375, "completions/min_length": 258.0, "epoch": 1.0173697270471465, "grad_norm": 0.2640192210674286, "kl": 0.1015625, "learning_rate": 5.719368131200834e-06, "loss": 0.0010140854865312576, "memory(GiB)": 37.92, "reward": 0.39767491817474365, "reward_std": 0.087995246052742, "rewards/VisualizationJSONCombinedORM/mean": 0.39767491817474365, "rewards/VisualizationJSONCombinedORM/std": 0.0978526845574379, "step": 1230, "train_speed(iter/s)": 1.200551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 306.0, "completions/min_length": 239.0, "epoch": 1.0181968569065343, "grad_norm": 0.196650892496109, "kl": 0.0743408203125, "learning_rate": 5.712223754037861e-06, "loss": 0.0007444247603416443, "memory(GiB)": 37.92, "reward": 0.48555219173431396, "reward_std": 0.07940113544464111, "rewards/VisualizationJSONCombinedORM/mean": 0.48555219173431396, "rewards/VisualizationJSONCombinedORM/std": 0.10615678876638412, "step": 1231, "train_speed(iter/s)": 1.169575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 299.875, "completions/min_length": 245.0, "epoch": 1.0190239867659223, "grad_norm": 0.1942160725593567, "kl": 0.1046142578125, "learning_rate": 5.705077892312881e-06, "loss": 0.0010468438267707825, "memory(GiB)": 37.92, "reward": 0.5834579467773438, "reward_std": 0.11602403223514557, "rewards/VisualizationJSONCombinedORM/mean": 0.5834579467773438, "rewards/VisualizationJSONCombinedORM/std": 0.11520040035247803, "step": 1232, "train_speed(iter/s)": 1.141392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 311.6875, "completions/min_length": 240.0, "epoch": 1.0198511166253101, "grad_norm": 0.18351426720619202, "kl": 0.0693359375, "learning_rate": 5.697930560920757e-06, "loss": 0.0006927847862243652, "memory(GiB)": 37.92, "reward": 0.6010810136795044, "reward_std": 0.1012636050581932, "rewards/VisualizationJSONCombinedORM/mean": 0.6010810136795044, "rewards/VisualizationJSONCombinedORM/std": 0.1001477837562561, "step": 1233, "train_speed(iter/s)": 1.112575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 289.5625, "completions/min_length": 251.0, "epoch": 1.0206782464846982, "grad_norm": 0.20634278655052185, "kl": 0.0555419921875, "learning_rate": 5.690781774759412e-06, "loss": 0.0005568675696849823, "memory(GiB)": 37.92, "reward": 0.6226322650909424, "reward_std": 0.0951564610004425, "rewards/VisualizationJSONCombinedORM/mean": 0.6226322650909424, "rewards/VisualizationJSONCombinedORM/std": 0.10323471575975418, "step": 1234, "train_speed(iter/s)": 1.085258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 299.375, "completions/min_length": 242.0, "epoch": 1.021505376344086, "grad_norm": 0.20180954039096832, "kl": 0.0693359375, "learning_rate": 5.683631548729806e-06, "loss": 0.0006947629153728485, "memory(GiB)": 37.92, "reward": 0.6159632205963135, "reward_std": 0.06060376018285751, "rewards/VisualizationJSONCombinedORM/mean": 0.6159632205963135, "rewards/VisualizationJSONCombinedORM/std": 0.19046197831630707, "step": 1235, "train_speed(iter/s)": 1.065821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 325.3125, "completions/min_length": 235.0, "epoch": 1.022332506203474, "grad_norm": 0.17237454652786255, "kl": 0.06207275390625, "learning_rate": 5.676479897735899e-06, "loss": 0.0006204023957252502, "memory(GiB)": 37.92, "reward": 0.4366855323314667, "reward_std": 0.06578350067138672, "rewards/VisualizationJSONCombinedORM/mean": 0.4366855323314667, "rewards/VisualizationJSONCombinedORM/std": 0.24418684840202332, "step": 1236, "train_speed(iter/s)": 1.0405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 300.1875, "completions/min_length": 228.0, "epoch": 1.0231596360628619, "grad_norm": 0.18434187769889832, "kl": 0.0496826171875, "learning_rate": 5.669326836684617e-06, "loss": 0.0004972442984580994, "memory(GiB)": 37.92, "reward": 0.22446416318416595, "reward_std": 0.02678728848695755, "rewards/VisualizationJSONCombinedORM/mean": 0.22446416318416595, "rewards/VisualizationJSONCombinedORM/std": 0.07963862270116806, "step": 1237, "train_speed(iter/s)": 1.020229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 293.25, "completions/min_length": 211.0, "epoch": 1.0239867659222497, "grad_norm": 0.19057421386241913, "kl": 0.07891845703125, "learning_rate": 5.662172380485835e-06, "loss": 0.0007898174226284027, "memory(GiB)": 37.92, "reward": 0.4845866858959198, "reward_std": 0.14293037354946136, "rewards/VisualizationJSONCombinedORM/mean": 0.4845866858959198, "rewards/VisualizationJSONCombinedORM/std": 0.2172773778438568, "step": 1238, "train_speed(iter/s)": 0.9938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 307.25, "completions/min_length": 244.0, "epoch": 1.0248138957816377, "grad_norm": 0.15315598249435425, "kl": 0.0408935546875, "learning_rate": 5.6550165440523246e-06, "loss": 0.00040963292121887207, "memory(GiB)": 37.92, "reward": 0.6066790819168091, "reward_std": 0.1481037139892578, "rewards/VisualizationJSONCombinedORM/mean": 0.6066790819168091, "rewards/VisualizationJSONCombinedORM/std": 0.1615193486213684, "step": 1239, "train_speed(iter/s)": 0.975152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 302.4375, "completions/min_length": 249.0, "epoch": 1.0256410256410255, "grad_norm": 0.22497981786727905, "kl": 0.05908203125, "learning_rate": 5.647859342299743e-06, "loss": 0.0005908533930778503, "memory(GiB)": 37.92, "reward": 0.5494370460510254, "reward_std": 0.10930772125720978, "rewards/VisualizationJSONCombinedORM/mean": 0.5494370460510254, "rewards/VisualizationJSONCombinedORM/std": 0.19938687980175018, "step": 1240, "train_speed(iter/s)": 0.956811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 293.5625, "completions/min_length": 231.0, "epoch": 1.0264681555004136, "grad_norm": 0.2099321484565735, "kl": 0.07098388671875, "learning_rate": 5.640700790146585e-06, "loss": 0.0007088705897331238, "memory(GiB)": 37.92, "reward": 0.609173059463501, "reward_std": 0.11895179748535156, "rewards/VisualizationJSONCombinedORM/mean": 0.609173059463501, "rewards/VisualizationJSONCombinedORM/std": 0.14417895674705505, "step": 1241, "train_speed(iter/s)": 0.938708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 323.75, "completions/min_length": 243.0, "epoch": 1.0272952853598014, "grad_norm": 0.19271206855773926, "kl": 0.05450439453125, "learning_rate": 5.63354090251417e-06, "loss": 0.0005441084504127502, "memory(GiB)": 37.92, "reward": 0.5584574341773987, "reward_std": 0.07496966421604156, "rewards/VisualizationJSONCombinedORM/mean": 0.5584574341773987, "rewards/VisualizationJSONCombinedORM/std": 0.22335858643054962, "step": 1242, "train_speed(iter/s)": 0.920742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 335.3125, "completions/min_length": 231.0, "epoch": 1.0281224152191895, "grad_norm": 0.17624405026435852, "kl": 0.04852294921875, "learning_rate": 5.626379694326593e-06, "loss": 0.0004863366484642029, "memory(GiB)": 37.92, "reward": 0.47673720121383667, "reward_std": 0.08110669255256653, "rewards/VisualizationJSONCombinedORM/mean": 0.47673720121383667, "rewards/VisualizationJSONCombinedORM/std": 0.21519167721271515, "step": 1243, "train_speed(iter/s)": 0.904971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 309.25, "completions/min_length": 233.0, "epoch": 1.0289495450785773, "grad_norm": 0.15722137689590454, "kl": 0.03765869140625, "learning_rate": 5.619217180510706e-06, "loss": 0.00037670135498046875, "memory(GiB)": 37.92, "reward": 0.6288631558418274, "reward_std": 0.09482774138450623, "rewards/VisualizationJSONCombinedORM/mean": 0.6288631558418274, "rewards/VisualizationJSONCombinedORM/std": 0.1545896679162979, "step": 1244, "train_speed(iter/s)": 0.888947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 302.75, "completions/min_length": 232.0, "epoch": 1.0297766749379653, "grad_norm": 0.1718335896730423, "kl": 0.06512451171875, "learning_rate": 5.612053375996082e-06, "loss": 0.0006510019302368164, "memory(GiB)": 37.92, "reward": 0.5228729248046875, "reward_std": 0.042491473257541656, "rewards/VisualizationJSONCombinedORM/mean": 0.5228729248046875, "rewards/VisualizationJSONCombinedORM/std": 0.3017578125, "step": 1245, "train_speed(iter/s)": 0.874129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 290.4375, "completions/min_length": 233.0, "epoch": 1.0306038047973531, "grad_norm": 0.1975950449705124, "kl": 0.0501708984375, "learning_rate": 5.60488829571498e-06, "loss": 0.0005019493401050568, "memory(GiB)": 37.92, "reward": 0.39166873693466187, "reward_std": 0.06543870270252228, "rewards/VisualizationJSONCombinedORM/mean": 0.39166873693466187, "rewards/VisualizationJSONCombinedORM/std": 0.14617027342319489, "step": 1246, "train_speed(iter/s)": 0.859708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 271.4375, "completions/min_length": 215.0, "epoch": 1.0314309346567412, "grad_norm": 0.16970722377300262, "kl": 0.0643310546875, "learning_rate": 5.597721954602326e-06, "loss": 0.0006443262100219727, "memory(GiB)": 37.92, "reward": 0.48064547777175903, "reward_std": 0.058407463133335114, "rewards/VisualizationJSONCombinedORM/mean": 0.48064547777175903, "rewards/VisualizationJSONCombinedORM/std": 0.2590925395488739, "step": 1247, "train_speed(iter/s)": 0.845816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 267.5625, "completions/min_length": 234.0, "epoch": 1.032258064516129, "grad_norm": 0.18637600541114807, "kl": 0.068603515625, "learning_rate": 5.590554367595666e-06, "loss": 0.0006862063892185688, "memory(GiB)": 37.92, "reward": 0.7275428175926208, "reward_std": 0.08677364140748978, "rewards/VisualizationJSONCombinedORM/mean": 0.7275428175926208, "rewards/VisualizationJSONCombinedORM/std": 0.09515520930290222, "step": 1248, "train_speed(iter/s)": 0.834212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 278.5625, "completions/min_length": 212.0, "epoch": 1.033085194375517, "grad_norm": 0.20553989708423615, "kl": 0.0731201171875, "learning_rate": 5.583385549635152e-06, "loss": 0.0007331706583499908, "memory(GiB)": 37.92, "reward": 0.41592538356781006, "reward_std": 0.0902034193277359, "rewards/VisualizationJSONCombinedORM/mean": 0.41592538356781006, "rewards/VisualizationJSONCombinedORM/std": 0.15871207416057587, "step": 1249, "train_speed(iter/s)": 0.820599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 328.8125, "completions/min_length": 220.0, "epoch": 1.0339123242349049, "grad_norm": 0.19819375872612, "kl": 0.07061767578125, "learning_rate": 5.576215515663489e-06, "loss": 0.0007073059678077698, "memory(GiB)": 37.92, "reward": 0.5277552604675293, "reward_std": 0.14103266596794128, "rewards/VisualizationJSONCombinedORM/mean": 0.5277552604675293, "rewards/VisualizationJSONCombinedORM/std": 0.1541101187467575, "step": 1250, "train_speed(iter/s)": 0.807985 }, { "epoch": 1.0339123242349049, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 355.75, "eval_completions/mean_length": 294.5260416666667, "eval_completions/min_length": 246.66666666666666, "eval_kl": 0.05938720703125, "eval_loss": 0.0005944461445324123, "eval_reward": 0.434822969759504, "eval_reward_std": 0.0639732499839738, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.434822969759504, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06397325409731518, "eval_runtime": 305.5218, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 314.625, "completions/min_length": 262.0, "epoch": 1.034739454094293, "grad_norm": 0.19564902782440186, "kl": 0.0758056640625, "learning_rate": 5.56904428062593e-06, "loss": 0.0007578358054161072, "memory(GiB)": 37.92, "reward": 0.5114705562591553, "reward_std": 0.05214817821979523, "rewards/VisualizationJSONCombinedORM/mean": 0.5114705562591553, "rewards/VisualizationJSONCombinedORM/std": 0.2024839073419571, "step": 1251, "train_speed(iter/s)": 0.66584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 299.125, "completions/min_length": 205.0, "epoch": 1.0355665839536807, "grad_norm": 0.17642123997211456, "kl": 0.033172607421875, "learning_rate": 5.561871859470222e-06, "loss": 0.00033104419708251953, "memory(GiB)": 37.92, "reward": 0.6390673518180847, "reward_std": 0.0944756343960762, "rewards/VisualizationJSONCombinedORM/mean": 0.6390673518180847, "rewards/VisualizationJSONCombinedORM/std": 0.17866721749305725, "step": 1252, "train_speed(iter/s)": 0.65689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 275.5625, "completions/min_length": 207.0, "epoch": 1.0363937138130686, "grad_norm": 0.24686719477176666, "kl": 0.05181884765625, "learning_rate": 5.55469826714659e-06, "loss": 0.0005185529589653015, "memory(GiB)": 37.92, "reward": 0.4524264335632324, "reward_std": 0.08362123370170593, "rewards/VisualizationJSONCombinedORM/mean": 0.4524264335632324, "rewards/VisualizationJSONCombinedORM/std": 0.26028603315353394, "step": 1253, "train_speed(iter/s)": 0.648346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 287.75, "completions/min_length": 218.0, "epoch": 1.0372208436724566, "grad_norm": 0.17860636115074158, "kl": 0.13885498046875, "learning_rate": 5.5475235186076985e-06, "loss": 0.0013833977282047272, "memory(GiB)": 37.92, "reward": 0.5131673216819763, "reward_std": 0.0545930415391922, "rewards/VisualizationJSONCombinedORM/mean": 0.5131673216819763, "rewards/VisualizationJSONCombinedORM/std": 0.19002635776996613, "step": 1254, "train_speed(iter/s)": 0.640639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 282.4375, "completions/min_length": 228.0, "epoch": 1.0380479735318444, "grad_norm": 0.1828332096338272, "kl": 0.06591796875, "learning_rate": 5.540347628808621e-06, "loss": 0.0006591975688934326, "memory(GiB)": 37.92, "reward": 0.7290569543838501, "reward_std": 0.09357413649559021, "rewards/VisualizationJSONCombinedORM/mean": 0.7290569543838501, "rewards/VisualizationJSONCombinedORM/std": 0.09442517161369324, "step": 1255, "train_speed(iter/s)": 0.63371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 312.625, "completions/min_length": 281.0, "epoch": 1.0388751033912325, "grad_norm": 0.158203586935997, "kl": 0.06854248046875, "learning_rate": 5.53317061270681e-06, "loss": 0.0006845742464065552, "memory(GiB)": 37.92, "reward": 0.7007938027381897, "reward_std": 0.05669635534286499, "rewards/VisualizationJSONCombinedORM/mean": 0.7007938027381897, "rewards/VisualizationJSONCombinedORM/std": 0.07852993160486221, "step": 1256, "train_speed(iter/s)": 0.625683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 292.0, "completions/min_length": 241.0, "epoch": 1.0397022332506203, "grad_norm": 0.15957672894001007, "kl": 0.067138671875, "learning_rate": 5.525992485262068e-06, "loss": 0.0006715953350067139, "memory(GiB)": 37.92, "reward": 0.6943203210830688, "reward_std": 0.0734109878540039, "rewards/VisualizationJSONCombinedORM/mean": 0.6943203210830688, "rewards/VisualizationJSONCombinedORM/std": 0.07839101552963257, "step": 1257, "train_speed(iter/s)": 0.618669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 298.375, "completions/min_length": 244.0, "epoch": 1.0405293631100083, "grad_norm": 0.19212886691093445, "kl": 0.0694580078125, "learning_rate": 5.51881326143651e-06, "loss": 0.0006958860903978348, "memory(GiB)": 37.92, "reward": 0.3697343170642853, "reward_std": 0.09209985285997391, "rewards/VisualizationJSONCombinedORM/mean": 0.3697343170642853, "rewards/VisualizationJSONCombinedORM/std": 0.0915742889046669, "step": 1258, "train_speed(iter/s)": 0.610075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 328.125, "completions/min_length": 259.0, "epoch": 1.0413564929693961, "grad_norm": 0.1796991527080536, "kl": 0.102783203125, "learning_rate": 5.51163295619454e-06, "loss": 0.0010295584797859192, "memory(GiB)": 37.92, "reward": 0.5091083645820618, "reward_std": 0.0645853728055954, "rewards/VisualizationJSONCombinedORM/mean": 0.5091083645820618, "rewards/VisualizationJSONCombinedORM/std": 0.25177323818206787, "step": 1259, "train_speed(iter/s)": 0.601377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 277.375, "completions/min_length": 217.0, "epoch": 1.0421836228287842, "grad_norm": 0.17011842131614685, "kl": 0.040771484375, "learning_rate": 5.504451584502813e-06, "loss": 0.0004085823893547058, "memory(GiB)": 37.92, "reward": 0.3986177444458008, "reward_std": 0.047955043613910675, "rewards/VisualizationJSONCombinedORM/mean": 0.3986177444458008, "rewards/VisualizationJSONCombinedORM/std": 0.04836346209049225, "step": 1260, "train_speed(iter/s)": 0.594015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 287.9375, "completions/min_length": 227.0, "epoch": 1.043010752688172, "grad_norm": 0.18781323730945587, "kl": 0.0885009765625, "learning_rate": 5.497269161330212e-06, "loss": 0.0008853450417518616, "memory(GiB)": 37.92, "reward": 0.5736551284790039, "reward_std": 0.11444227397441864, "rewards/VisualizationJSONCombinedORM/mean": 0.5736551284790039, "rewards/VisualizationJSONCombinedORM/std": 0.12320837378501892, "step": 1261, "train_speed(iter/s)": 0.587773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 300.125, "completions/min_length": 231.0, "epoch": 1.04383788254756, "grad_norm": 0.20556101202964783, "kl": 0.0743408203125, "learning_rate": 5.490085701647805e-06, "loss": 0.0007414519786834717, "memory(GiB)": 37.92, "reward": 0.617462158203125, "reward_std": 0.11405248939990997, "rewards/VisualizationJSONCombinedORM/mean": 0.617462158203125, "rewards/VisualizationJSONCombinedORM/std": 0.11343149840831757, "step": 1262, "train_speed(iter/s)": 0.581653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 301.625, "completions/min_length": 247.0, "epoch": 1.0446650124069479, "grad_norm": 0.22329968214035034, "kl": 0.05712890625, "learning_rate": 5.4829012204288225e-06, "loss": 0.000570688396692276, "memory(GiB)": 37.92, "reward": 0.6366982460021973, "reward_std": 0.08284594118595123, "rewards/VisualizationJSONCombinedORM/mean": 0.6366982460021973, "rewards/VisualizationJSONCombinedORM/std": 0.08709104359149933, "step": 1263, "train_speed(iter/s)": 0.57529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 307.5625, "completions/min_length": 231.0, "epoch": 1.045492142266336, "grad_norm": 0.20295578241348267, "kl": 0.1353759765625, "learning_rate": 5.47571573264863e-06, "loss": 0.0013549327850341797, "memory(GiB)": 37.92, "reward": 0.5139955282211304, "reward_std": 0.06655310094356537, "rewards/VisualizationJSONCombinedORM/mean": 0.5139955282211304, "rewards/VisualizationJSONCombinedORM/std": 0.2648533582687378, "step": 1264, "train_speed(iter/s)": 0.568343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 282.5625, "completions/min_length": 209.0, "epoch": 1.0463192721257237, "grad_norm": 0.17485620081424713, "kl": 0.11669921875, "learning_rate": 5.468529253284683e-06, "loss": 0.0011646151542663574, "memory(GiB)": 37.92, "reward": 0.7023907899856567, "reward_std": 0.11001241207122803, "rewards/VisualizationJSONCombinedORM/mean": 0.7023907899856567, "rewards/VisualizationJSONCombinedORM/std": 0.12486238777637482, "step": 1265, "train_speed(iter/s)": 0.562925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 300.875, "completions/min_length": 240.0, "epoch": 1.0471464019851116, "grad_norm": 0.19356992840766907, "kl": 0.103759765625, "learning_rate": 5.46134179731651e-06, "loss": 0.0010370798408985138, "memory(GiB)": 37.98, "reward": 0.4551306664943695, "reward_std": 0.08472061157226562, "rewards/VisualizationJSONCombinedORM/mean": 0.4551306664943695, "rewards/VisualizationJSONCombinedORM/std": 0.26623791456222534, "step": 1266, "train_speed(iter/s)": 0.555437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 319.1875, "completions/min_length": 224.0, "epoch": 1.0479735318444996, "grad_norm": 0.2178688943386078, "kl": 0.1424560546875, "learning_rate": 5.4541533797256715e-06, "loss": 0.0014228001236915588, "memory(GiB)": 37.98, "reward": 0.6343098878860474, "reward_std": 0.13355137407779694, "rewards/VisualizationJSONCombinedORM/mean": 0.6343098878860474, "rewards/VisualizationJSONCombinedORM/std": 0.15097030997276306, "step": 1267, "train_speed(iter/s)": 0.549473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 308.75, "completions/min_length": 246.0, "epoch": 1.0488006617038874, "grad_norm": 0.1783687025308609, "kl": 0.0972900390625, "learning_rate": 5.446964015495734e-06, "loss": 0.000973045825958252, "memory(GiB)": 37.98, "reward": 0.647121787071228, "reward_std": 0.11129710078239441, "rewards/VisualizationJSONCombinedORM/mean": 0.647121787071228, "rewards/VisualizationJSONCombinedORM/std": 0.11153063178062439, "step": 1268, "train_speed(iter/s)": 0.544974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 283.5, "completions/min_length": 236.0, "epoch": 1.0496277915632755, "grad_norm": 0.1989145576953888, "kl": 0.0777587890625, "learning_rate": 5.4397737196122355e-06, "loss": 0.0007777437567710876, "memory(GiB)": 37.98, "reward": 0.5868438482284546, "reward_std": 0.09064146876335144, "rewards/VisualizationJSONCombinedORM/mean": 0.5868438482284546, "rewards/VisualizationJSONCombinedORM/std": 0.11514510214328766, "step": 1269, "train_speed(iter/s)": 0.539914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 294.375, "completions/min_length": 245.0, "epoch": 1.0504549214226633, "grad_norm": 0.1875082105398178, "kl": 0.0888671875, "learning_rate": 5.432582507062658e-06, "loss": 0.0008875131607055664, "memory(GiB)": 37.98, "reward": 0.47176337242126465, "reward_std": 0.03642597049474716, "rewards/VisualizationJSONCombinedORM/mean": 0.47176337242126465, "rewards/VisualizationJSONCombinedORM/std": 0.29863330721855164, "step": 1270, "train_speed(iter/s)": 0.533963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 286.0625, "completions/min_length": 219.0, "epoch": 1.0512820512820513, "grad_norm": 0.19117586314678192, "kl": 0.08392333984375, "learning_rate": 5.425390392836393e-06, "loss": 0.0008372291922569275, "memory(GiB)": 37.98, "reward": 0.48216867446899414, "reward_std": 0.18275873363018036, "rewards/VisualizationJSONCombinedORM/mean": 0.48216867446899414, "rewards/VisualizationJSONCombinedORM/std": 0.19149626791477203, "step": 1271, "train_speed(iter/s)": 0.528989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 285.625, "completions/min_length": 249.0, "epoch": 1.0521091811414391, "grad_norm": 0.25081923604011536, "kl": 0.1484375, "learning_rate": 5.418197391924712e-06, "loss": 0.0014855265617370605, "memory(GiB)": 37.98, "reward": 0.2953140437602997, "reward_std": 0.04312264919281006, "rewards/VisualizationJSONCombinedORM/mean": 0.2953140437602997, "rewards/VisualizationJSONCombinedORM/std": 0.1186266615986824, "step": 1272, "train_speed(iter/s)": 0.523761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 280.6875, "completions/min_length": 224.0, "epoch": 1.0529363110008272, "grad_norm": 0.16589419543743134, "kl": 0.073974609375, "learning_rate": 5.411003519320733e-06, "loss": 0.0007419046014547348, "memory(GiB)": 37.98, "reward": 0.4896909296512604, "reward_std": 0.07398533821105957, "rewards/VisualizationJSONCombinedORM/mean": 0.4896909296512604, "rewards/VisualizationJSONCombinedORM/std": 0.10718618333339691, "step": 1273, "train_speed(iter/s)": 0.518788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 281.625, "completions/min_length": 240.0, "epoch": 1.053763440860215, "grad_norm": 0.2058386206626892, "kl": 0.09423828125, "learning_rate": 5.4038087900193985e-06, "loss": 0.0009408742189407349, "memory(GiB)": 37.98, "reward": 0.2873231768608093, "reward_std": 0.05771665647625923, "rewards/VisualizationJSONCombinedORM/mean": 0.2873231768608093, "rewards/VisualizationJSONCombinedORM/std": 0.14944252371788025, "step": 1274, "train_speed(iter/s)": 0.514191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 295.125, "completions/min_length": 234.0, "epoch": 1.054590570719603, "grad_norm": 0.21684883534908295, "kl": 0.0970458984375, "learning_rate": 5.396613219017422e-06, "loss": 0.0009676888585090637, "memory(GiB)": 37.98, "reward": 0.36099010705947876, "reward_std": 0.07767125964164734, "rewards/VisualizationJSONCombinedORM/mean": 0.36099010705947876, "rewards/VisualizationJSONCombinedORM/std": 0.07570172101259232, "step": 1275, "train_speed(iter/s)": 0.508453 }, { "epoch": 1.054590570719603, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 339.375, "eval_completions/mean_length": 286.359375, "eval_completions/min_length": 246.125, "eval_kl": 0.09154256184895833, "eval_loss": 0.0009132586419582367, "eval_reward": 0.5034036909540495, "eval_reward_std": 0.08190944658902784, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.5034036909540495, "eval_rewards/VisualizationJSONCombinedORM/std": 0.08190944767557085, "eval_runtime": 295.1055, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 296.0625, "completions/min_length": 257.0, "epoch": 1.0554177005789909, "grad_norm": 0.20741493999958038, "kl": 0.09033203125, "learning_rate": 5.3894168213132865e-06, "loss": 0.0009061060845851898, "memory(GiB)": 37.98, "reward": 0.683337926864624, "reward_std": 0.09561892598867416, "rewards/VisualizationJSONCombinedORM/mean": 0.683337926864624, "rewards/VisualizationJSONCombinedORM/std": 0.10490003973245621, "step": 1276, "train_speed(iter/s)": 0.450876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 307.6875, "completions/min_length": 227.0, "epoch": 1.056244830438379, "grad_norm": 0.18259787559509277, "kl": 0.0628662109375, "learning_rate": 5.382219611907189e-06, "loss": 0.0006292089819908142, "memory(GiB)": 37.98, "reward": 0.6904152631759644, "reward_std": 0.08010877668857574, "rewards/VisualizationJSONCombinedORM/mean": 0.6904152631759644, "rewards/VisualizationJSONCombinedORM/std": 0.08247547596693039, "step": 1277, "train_speed(iter/s)": 0.446899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 298.6875, "completions/min_length": 241.0, "epoch": 1.0570719602977667, "grad_norm": 0.1783825010061264, "kl": 0.0880126953125, "learning_rate": 5.375021605801023e-06, "loss": 0.0008775815367698669, "memory(GiB)": 37.98, "reward": 0.7003966569900513, "reward_std": 0.09575694799423218, "rewards/VisualizationJSONCombinedORM/mean": 0.7003966569900513, "rewards/VisualizationJSONCombinedORM/std": 0.09352102875709534, "step": 1278, "train_speed(iter/s)": 0.44341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 302.9375, "completions/min_length": 223.0, "epoch": 1.0578990901571548, "grad_norm": 0.18593114614486694, "kl": 0.0677490234375, "learning_rate": 5.367822817998338e-06, "loss": 0.0006778687238693237, "memory(GiB)": 37.98, "reward": 0.7041593790054321, "reward_std": 0.09428437054157257, "rewards/VisualizationJSONCombinedORM/mean": 0.7041593790054321, "rewards/VisualizationJSONCombinedORM/std": 0.09910793602466583, "step": 1279, "train_speed(iter/s)": 0.439488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 255.0, "completions/min_length": 197.0, "epoch": 1.0587262200165426, "grad_norm": 0.17925907671451569, "kl": 0.0811767578125, "learning_rate": 5.3606232635043185e-06, "loss": 0.0008106529712677002, "memory(GiB)": 37.98, "reward": 0.5937128067016602, "reward_std": 0.10687753558158875, "rewards/VisualizationJSONCombinedORM/mean": 0.5937128067016602, "rewards/VisualizationJSONCombinedORM/std": 0.10485517233610153, "step": 1280, "train_speed(iter/s)": 0.435388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 286.3125, "completions/min_length": 242.0, "epoch": 1.0595533498759304, "grad_norm": 0.17926572263240814, "kl": 0.087890625, "learning_rate": 5.353422957325743e-06, "loss": 0.0008776886388659477, "memory(GiB)": 37.98, "reward": 0.7095719575881958, "reward_std": 0.08644579350948334, "rewards/VisualizationJSONCombinedORM/mean": 0.7095719575881958, "rewards/VisualizationJSONCombinedORM/std": 0.08558753877878189, "step": 1281, "train_speed(iter/s)": 0.4327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 266.125, "completions/min_length": 206.0, "epoch": 1.0603804797353185, "grad_norm": 0.21963751316070557, "kl": 0.0750732421875, "learning_rate": 5.346221914470959e-06, "loss": 0.000750809907913208, "memory(GiB)": 37.98, "reward": 0.37950795888900757, "reward_std": 0.09091612696647644, "rewards/VisualizationJSONCombinedORM/mean": 0.37950795888900757, "rewards/VisualizationJSONCombinedORM/std": 0.10265789926052094, "step": 1282, "train_speed(iter/s)": 0.429138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 274.25, "completions/min_length": 223.0, "epoch": 1.0612076095947063, "grad_norm": 0.24266603589057922, "kl": 0.07275390625, "learning_rate": 5.3390201499498485e-06, "loss": 0.0007286639884114265, "memory(GiB)": 37.98, "reward": 0.4706989526748657, "reward_std": 0.07617723941802979, "rewards/VisualizationJSONCombinedORM/mean": 0.4706989526748657, "rewards/VisualizationJSONCombinedORM/std": 0.19835735857486725, "step": 1283, "train_speed(iter/s)": 0.426548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 294.75, "completions/min_length": 250.0, "epoch": 1.0620347394540943, "grad_norm": 0.1675860583782196, "kl": 0.05120849609375, "learning_rate": 5.331817678773796e-06, "loss": 0.0005107540637254715, "memory(GiB)": 37.98, "reward": 0.45747455954551697, "reward_std": 0.09069395810365677, "rewards/VisualizationJSONCombinedORM/mean": 0.45747455954551697, "rewards/VisualizationJSONCombinedORM/std": 0.1290530562400818, "step": 1284, "train_speed(iter/s)": 0.42305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 297.375, "completions/min_length": 237.0, "epoch": 1.0628618693134821, "grad_norm": 0.2007301300764084, "kl": 0.0484619140625, "learning_rate": 5.324614515955665e-06, "loss": 0.0004841648042201996, "memory(GiB)": 37.98, "reward": 0.569645881652832, "reward_std": 0.09676957130432129, "rewards/VisualizationJSONCombinedORM/mean": 0.569645881652832, "rewards/VisualizationJSONCombinedORM/std": 0.11365417391061783, "step": 1285, "train_speed(iter/s)": 0.419212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 315.0625, "completions/min_length": 255.0, "epoch": 1.0636889991728702, "grad_norm": 0.16786961257457733, "kl": 0.10205078125, "learning_rate": 5.317410676509752e-06, "loss": 0.0010178722441196442, "memory(GiB)": 37.98, "reward": 0.304243266582489, "reward_std": 0.04228207468986511, "rewards/VisualizationJSONCombinedORM/mean": 0.304243266582489, "rewards/VisualizationJSONCombinedORM/std": 0.07123679667711258, "step": 1286, "train_speed(iter/s)": 0.41595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 288.0625, "completions/min_length": 242.0, "epoch": 1.064516129032258, "grad_norm": 0.1768297553062439, "kl": 0.08026123046875, "learning_rate": 5.310206175451772e-06, "loss": 0.0008017942309379578, "memory(GiB)": 37.98, "reward": 0.319139301776886, "reward_std": 0.04608002305030823, "rewards/VisualizationJSONCombinedORM/mean": 0.319139301776886, "rewards/VisualizationJSONCombinedORM/std": 0.12307731807231903, "step": 1287, "train_speed(iter/s)": 0.413168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 286.75, "completions/min_length": 220.0, "epoch": 1.065343258891646, "grad_norm": 0.1952805370092392, "kl": 0.0731201171875, "learning_rate": 5.303001027798813e-06, "loss": 0.0007331632077693939, "memory(GiB)": 37.98, "reward": 0.6051028966903687, "reward_std": 0.09085497260093689, "rewards/VisualizationJSONCombinedORM/mean": 0.6051028966903687, "rewards/VisualizationJSONCombinedORM/std": 0.08924760669469833, "step": 1288, "train_speed(iter/s)": 0.410287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 282.0625, "completions/min_length": 245.0, "epoch": 1.0661703887510339, "grad_norm": 0.15929098427295685, "kl": 0.0579833984375, "learning_rate": 5.295795248569315e-06, "loss": 0.00058012455701828, "memory(GiB)": 37.98, "reward": 0.729354739189148, "reward_std": 0.07799667119979858, "rewards/VisualizationJSONCombinedORM/mean": 0.729354739189148, "rewards/VisualizationJSONCombinedORM/std": 0.08276572078466415, "step": 1289, "train_speed(iter/s)": 0.407619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 278.6875, "completions/min_length": 223.0, "epoch": 1.066997518610422, "grad_norm": 0.1796453446149826, "kl": 0.0628662109375, "learning_rate": 5.288588852783031e-06, "loss": 0.0006294175982475281, "memory(GiB)": 37.98, "reward": 0.5383371114730835, "reward_std": 0.07522208988666534, "rewards/VisualizationJSONCombinedORM/mean": 0.5383371114730835, "rewards/VisualizationJSONCombinedORM/std": 0.18649862706661224, "step": 1290, "train_speed(iter/s)": 0.404217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 279.75, "completions/min_length": 225.0, "epoch": 1.0678246484698097, "grad_norm": 0.18602782487869263, "kl": 0.05072021484375, "learning_rate": 5.281381855461002e-06, "loss": 0.0005074441432952881, "memory(GiB)": 37.98, "reward": 0.5908218622207642, "reward_std": 0.07333792746067047, "rewards/VisualizationJSONCombinedORM/mean": 0.5908218622207642, "rewards/VisualizationJSONCombinedORM/std": 0.14757230877876282, "step": 1291, "train_speed(iter/s)": 0.401485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 278.4375, "completions/min_length": 235.0, "epoch": 1.0686517783291978, "grad_norm": 0.18618039786815643, "kl": 0.07177734375, "learning_rate": 5.274174271625522e-06, "loss": 0.0007180795073509216, "memory(GiB)": 37.98, "reward": 0.46228522062301636, "reward_std": 0.08853095769882202, "rewards/VisualizationJSONCombinedORM/mean": 0.46228522062301636, "rewards/VisualizationJSONCombinedORM/std": 0.08574651926755905, "step": 1292, "train_speed(iter/s)": 0.398745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 270.4375, "completions/min_length": 232.0, "epoch": 1.0694789081885856, "grad_norm": 0.17421755194664001, "kl": 0.04522705078125, "learning_rate": 5.266966116300106e-06, "loss": 0.0004533827304840088, "memory(GiB)": 37.98, "reward": 0.4430079460144043, "reward_std": 0.0543622151017189, "rewards/VisualizationJSONCombinedORM/mean": 0.4430079460144043, "rewards/VisualizationJSONCombinedORM/std": 0.2591833770275116, "step": 1293, "train_speed(iter/s)": 0.396133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 287.9375, "completions/min_length": 227.0, "epoch": 1.0703060380479736, "grad_norm": 0.17708416283130646, "kl": 0.0947265625, "learning_rate": 5.259757404509463e-06, "loss": 0.0009486451745033264, "memory(GiB)": 37.98, "reward": 0.5997579097747803, "reward_std": 0.10936230421066284, "rewards/VisualizationJSONCombinedORM/mean": 0.5997579097747803, "rewards/VisualizationJSONCombinedORM/std": 0.11471012234687805, "step": 1294, "train_speed(iter/s)": 0.393296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 255.625, "completions/min_length": 216.0, "epoch": 1.0711331679073615, "grad_norm": 0.1943846046924591, "kl": 0.0947265625, "learning_rate": 5.25254815127946e-06, "loss": 0.0009459294378757477, "memory(GiB)": 37.98, "reward": 0.5248243808746338, "reward_std": 0.08245056867599487, "rewards/VisualizationJSONCombinedORM/mean": 0.5248243808746338, "rewards/VisualizationJSONCombinedORM/std": 0.27607643604278564, "step": 1295, "train_speed(iter/s)": 0.389974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 282.5, "completions/min_length": 221.0, "epoch": 1.0719602977667493, "grad_norm": 0.17121729254722595, "kl": 0.0389404296875, "learning_rate": 5.245338371637091e-06, "loss": 0.0003893524408340454, "memory(GiB)": 37.98, "reward": 0.40126001834869385, "reward_std": 0.08433860540390015, "rewards/VisualizationJSONCombinedORM/mean": 0.40126001834869385, "rewards/VisualizationJSONCombinedORM/std": 0.13371317088603973, "step": 1296, "train_speed(iter/s)": 0.387252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 282.625, "completions/min_length": 227.0, "epoch": 1.0727874276261373, "grad_norm": 0.1794484704732895, "kl": 0.04376220703125, "learning_rate": 5.238128080610451e-06, "loss": 0.0004376024007797241, "memory(GiB)": 37.98, "reward": 0.5436617136001587, "reward_std": 0.06359460204839706, "rewards/VisualizationJSONCombinedORM/mean": 0.5436617136001587, "rewards/VisualizationJSONCombinedORM/std": 0.1408856213092804, "step": 1297, "train_speed(iter/s)": 0.38454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 284.875, "completions/min_length": 241.0, "epoch": 1.0736145574855251, "grad_norm": 0.17190605401992798, "kl": 0.0777587890625, "learning_rate": 5.230917293228699e-06, "loss": 0.0007778219878673553, "memory(GiB)": 37.98, "reward": 0.3635798394680023, "reward_std": 0.048113882541656494, "rewards/VisualizationJSONCombinedORM/mean": 0.3635798394680023, "rewards/VisualizationJSONCombinedORM/std": 0.08069364726543427, "step": 1298, "train_speed(iter/s)": 0.382285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 280.5625, "completions/min_length": 232.0, "epoch": 1.0744416873449132, "grad_norm": 0.19659745693206787, "kl": 0.079833984375, "learning_rate": 5.2237060245220276e-06, "loss": 0.0007971674203872681, "memory(GiB)": 37.98, "reward": 0.382132887840271, "reward_std": 0.0701473131775856, "rewards/VisualizationJSONCombinedORM/mean": 0.382132887840271, "rewards/VisualizationJSONCombinedORM/std": 0.22346660494804382, "step": 1299, "train_speed(iter/s)": 0.379339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 258.5, "completions/min_length": 227.0, "epoch": 1.075268817204301, "grad_norm": 0.17708732187747955, "kl": 0.03582763671875, "learning_rate": 5.216494289521637e-06, "loss": 0.0003590881824493408, "memory(GiB)": 37.98, "reward": 0.48243746161460876, "reward_std": 0.07587999105453491, "rewards/VisualizationJSONCombinedORM/mean": 0.48243746161460876, "rewards/VisualizationJSONCombinedORM/std": 0.13546080887317657, "step": 1300, "train_speed(iter/s)": 0.377191 }, { "epoch": 1.075268817204301, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 327.75, "eval_completions/mean_length": 280.1770833333333, "eval_completions/min_length": 236.375, "eval_kl": 0.061787923177083336, "eval_loss": 0.000616674602497369, "eval_reward": 0.4507333878427744, "eval_reward_std": 0.061784800491295755, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4507333878427744, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06178480258677155, "eval_runtime": 288.3989, "eval_samples_per_second": 0.083, "eval_steps_per_second": 0.01, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 299.4375, "completions/min_length": 251.0, "epoch": 1.076095947063689, "grad_norm": 0.17126557230949402, "kl": 0.05126953125, "learning_rate": 5.2092821032596895e-06, "loss": 0.000512540340423584, "memory(GiB)": 37.98, "reward": 0.32708466053009033, "reward_std": 0.06325694173574448, "rewards/VisualizationJSONCombinedORM/mean": 0.32708466053009033, "rewards/VisualizationJSONCombinedORM/std": 0.07984737306833267, "step": 1301, "train_speed(iter/s)": 0.346049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 272.75, "completions/min_length": 199.0, "epoch": 1.0769230769230769, "grad_norm": 0.17482225596904755, "kl": 0.03790283203125, "learning_rate": 5.2020694807693015e-06, "loss": 0.0003785640001296997, "memory(GiB)": 37.98, "reward": 0.5977441668510437, "reward_std": 0.052531659603118896, "rewards/VisualizationJSONCombinedORM/mean": 0.5977441668510437, "rewards/VisualizationJSONCombinedORM/std": 0.17391225695610046, "step": 1302, "train_speed(iter/s)": 0.344035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 274.5625, "completions/min_length": 219.0, "epoch": 1.077750206782465, "grad_norm": 0.15925335884094238, "kl": 0.03692626953125, "learning_rate": 5.19485643708449e-06, "loss": 0.000369437038898468, "memory(GiB)": 37.98, "reward": 0.7700279355049133, "reward_std": 0.03964584693312645, "rewards/VisualizationJSONCombinedORM/mean": 0.7700279355049133, "rewards/VisualizationJSONCombinedORM/std": 0.041545044630765915, "step": 1303, "train_speed(iter/s)": 0.342099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 267.625, "completions/min_length": 215.0, "epoch": 1.0785773366418527, "grad_norm": 0.17543797194957733, "kl": 0.038330078125, "learning_rate": 5.18764298724015e-06, "loss": 0.0003829747438430786, "memory(GiB)": 37.98, "reward": 0.7348547577857971, "reward_std": 0.09418436884880066, "rewards/VisualizationJSONCombinedORM/mean": 0.7348547577857971, "rewards/VisualizationJSONCombinedORM/std": 0.09666773676872253, "step": 1304, "train_speed(iter/s)": 0.340483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 288.1875, "completions/min_length": 224.0, "epoch": 1.0794044665012408, "grad_norm": 0.1772185117006302, "kl": 0.03643798828125, "learning_rate": 5.1804291462720255e-06, "loss": 0.0003645569086074829, "memory(GiB)": 37.98, "reward": 0.5179892778396606, "reward_std": 0.09396199882030487, "rewards/VisualizationJSONCombinedORM/mean": 0.5179892778396606, "rewards/VisualizationJSONCombinedORM/std": 0.11803922802209854, "step": 1305, "train_speed(iter/s)": 0.338629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 254.8125, "completions/min_length": 212.0, "epoch": 1.0802315963606286, "grad_norm": 0.1878267377614975, "kl": 0.05810546875, "learning_rate": 5.173214929216677e-06, "loss": 0.0005817487835884094, "memory(GiB)": 37.98, "reward": 0.6131793260574341, "reward_std": 0.1441194862127304, "rewards/VisualizationJSONCombinedORM/mean": 0.6131793260574341, "rewards/VisualizationJSONCombinedORM/std": 0.179266557097435, "step": 1306, "train_speed(iter/s)": 0.33626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 261.375, "completions/min_length": 217.0, "epoch": 1.0810587262200166, "grad_norm": 0.15483415126800537, "kl": 0.03955078125, "learning_rate": 5.166000351111444e-06, "loss": 0.0003958567976951599, "memory(GiB)": 37.98, "reward": 0.7221636176109314, "reward_std": 0.08491586148738861, "rewards/VisualizationJSONCombinedORM/mean": 0.7221636176109314, "rewards/VisualizationJSONCombinedORM/std": 0.10990879684686661, "step": 1307, "train_speed(iter/s)": 0.334636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 279.375, "completions/min_length": 236.0, "epoch": 1.0818858560794045, "grad_norm": 0.14313074946403503, "kl": 0.0350341796875, "learning_rate": 5.158785426994423e-06, "loss": 0.00035064294934272766, "memory(GiB)": 37.98, "reward": 0.49445196986198425, "reward_std": 0.07847151160240173, "rewards/VisualizationJSONCombinedORM/mean": 0.49445196986198425, "rewards/VisualizationJSONCombinedORM/std": 0.2306872010231018, "step": 1308, "train_speed(iter/s)": 0.332564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 279.875, "completions/min_length": 231.0, "epoch": 1.0827129859387923, "grad_norm": 0.16975460946559906, "kl": 0.06573486328125, "learning_rate": 5.151570171904432e-06, "loss": 0.0006570033729076385, "memory(GiB)": 37.98, "reward": 0.4868873655796051, "reward_std": 0.05760357156395912, "rewards/VisualizationJSONCombinedORM/mean": 0.4868873655796051, "rewards/VisualizationJSONCombinedORM/std": 0.24366486072540283, "step": 1309, "train_speed(iter/s)": 0.330778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 272.0625, "completions/min_length": 214.0, "epoch": 1.0835401157981803, "grad_norm": 0.18958421051502228, "kl": 0.06256103515625, "learning_rate": 5.144354600880974e-06, "loss": 0.0006248615682125092, "memory(GiB)": 37.98, "reward": 0.6445052623748779, "reward_std": 0.0637863278388977, "rewards/VisualizationJSONCombinedORM/mean": 0.6445052623748779, "rewards/VisualizationJSONCombinedORM/std": 0.15532074868679047, "step": 1310, "train_speed(iter/s)": 0.329051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 279.5625, "completions/min_length": 225.0, "epoch": 1.0843672456575681, "grad_norm": 0.18715620040893555, "kl": 0.0970458984375, "learning_rate": 5.137138728964215e-06, "loss": 0.0009682327508926392, "memory(GiB)": 37.98, "reward": 0.5877296328544617, "reward_std": 0.09683650732040405, "rewards/VisualizationJSONCombinedORM/mean": 0.5877296328544617, "rewards/VisualizationJSONCombinedORM/std": 0.1376788169145584, "step": 1311, "train_speed(iter/s)": 0.326948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 270.0, "completions/min_length": 214.0, "epoch": 1.0851943755169562, "grad_norm": 0.18852943181991577, "kl": 0.06280517578125, "learning_rate": 5.129922571194949e-06, "loss": 0.000628884881734848, "memory(GiB)": 37.98, "reward": 0.6233605742454529, "reward_std": 0.07462414354085922, "rewards/VisualizationJSONCombinedORM/mean": 0.6233605742454529, "rewards/VisualizationJSONCombinedORM/std": 0.10931910574436188, "step": 1312, "train_speed(iter/s)": 0.325045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 276.875, "completions/min_length": 222.0, "epoch": 1.086021505376344, "grad_norm": 0.16567900776863098, "kl": 0.0543212890625, "learning_rate": 5.122706142614562e-06, "loss": 0.0005437135696411133, "memory(GiB)": 37.98, "reward": 0.7221307754516602, "reward_std": 0.10431058704853058, "rewards/VisualizationJSONCombinedORM/mean": 0.7221307754516602, "rewards/VisualizationJSONCombinedORM/std": 0.13847920298576355, "step": 1313, "train_speed(iter/s)": 0.323277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 290.8125, "completions/min_length": 217.0, "epoch": 1.086848635235732, "grad_norm": 0.16012004017829895, "kl": 0.05755615234375, "learning_rate": 5.115489458265006e-06, "loss": 0.0005753226578235626, "memory(GiB)": 37.98, "reward": 0.2702125310897827, "reward_std": 0.024030199274420738, "rewards/VisualizationJSONCombinedORM/mean": 0.2702125310897827, "rewards/VisualizationJSONCombinedORM/std": 0.02607799880206585, "step": 1314, "train_speed(iter/s)": 0.321421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 285.875, "completions/min_length": 226.0, "epoch": 1.0876757650951199, "grad_norm": 0.1741834282875061, "kl": 0.0716552734375, "learning_rate": 5.108272533188767e-06, "loss": 0.0007183440029621124, "memory(GiB)": 37.98, "reward": 0.5282192230224609, "reward_std": 0.07665836811065674, "rewards/VisualizationJSONCombinedORM/mean": 0.5282192230224609, "rewards/VisualizationJSONCombinedORM/std": 0.11307768523693085, "step": 1315, "train_speed(iter/s)": 0.319512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 283.6875, "completions/min_length": 217.0, "epoch": 1.088502894954508, "grad_norm": 0.19376085698604584, "kl": 0.0819091796875, "learning_rate": 5.101055382428831e-06, "loss": 0.0008205324411392212, "memory(GiB)": 37.98, "reward": 0.29626575112342834, "reward_std": 0.038926634937524796, "rewards/VisualizationJSONCombinedORM/mean": 0.29626575112342834, "rewards/VisualizationJSONCombinedORM/std": 0.045890819281339645, "step": 1316, "train_speed(iter/s)": 0.318124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 287.3125, "completions/min_length": 215.0, "epoch": 1.0893300248138957, "grad_norm": 0.21770508587360382, "kl": 0.065673828125, "learning_rate": 5.093838021028658e-06, "loss": 0.0006582066416740417, "memory(GiB)": 37.98, "reward": 0.5823431015014648, "reward_std": 0.07953428477048874, "rewards/VisualizationJSONCombinedORM/mean": 0.5823431015014648, "rewards/VisualizationJSONCombinedORM/std": 0.16794274747371674, "step": 1317, "train_speed(iter/s)": 0.316378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 272.9375, "completions/min_length": 218.0, "epoch": 1.0901571546732838, "grad_norm": 0.13905517756938934, "kl": 0.05865478515625, "learning_rate": 5.086620464032143e-06, "loss": 0.0005863644182682037, "memory(GiB)": 37.98, "reward": 0.49525922536849976, "reward_std": 0.041120126843452454, "rewards/VisualizationJSONCombinedORM/mean": 0.49525922536849976, "rewards/VisualizationJSONCombinedORM/std": 0.040588684380054474, "step": 1318, "train_speed(iter/s)": 0.315012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 286.6875, "completions/min_length": 254.0, "epoch": 1.0909842845326716, "grad_norm": 0.16468898952007294, "kl": 0.072021484375, "learning_rate": 5.07940272648359e-06, "loss": 0.0007185712456703186, "memory(GiB)": 37.98, "reward": 0.7237578630447388, "reward_std": 0.08670791983604431, "rewards/VisualizationJSONCombinedORM/mean": 0.7237578630447388, "rewards/VisualizationJSONCombinedORM/std": 0.11529985070228577, "step": 1319, "train_speed(iter/s)": 0.312903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 288.4375, "completions/min_length": 227.0, "epoch": 1.0918114143920596, "grad_norm": 0.16609354317188263, "kl": 0.06719970703125, "learning_rate": 5.07218482342768e-06, "loss": 0.000671684741973877, "memory(GiB)": 37.98, "reward": 0.47018277645111084, "reward_std": 0.061288438737392426, "rewards/VisualizationJSONCombinedORM/mean": 0.47018277645111084, "rewards/VisualizationJSONCombinedORM/std": 0.0822649747133255, "step": 1320, "train_speed(iter/s)": 0.311311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 259.125, "completions/min_length": 208.0, "epoch": 1.0926385442514475, "grad_norm": 0.17699085175991058, "kl": 0.06298828125, "learning_rate": 5.064966769909439e-06, "loss": 0.000630602240562439, "memory(GiB)": 37.98, "reward": 0.5219871997833252, "reward_std": 0.08573177456855774, "rewards/VisualizationJSONCombinedORM/mean": 0.5219871997833252, "rewards/VisualizationJSONCombinedORM/std": 0.2409020960330963, "step": 1321, "train_speed(iter/s)": 0.309646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 243.5625, "completions/min_length": 184.0, "epoch": 1.0934656741108353, "grad_norm": 0.2192925661802292, "kl": 0.06854248046875, "learning_rate": 5.057748580974204e-06, "loss": 0.0006856322288513184, "memory(GiB)": 37.98, "reward": 0.5642529129981995, "reward_std": 0.08623175323009491, "rewards/VisualizationJSONCombinedORM/mean": 0.5642529129981995, "rewards/VisualizationJSONCombinedORM/std": 0.17407269775867462, "step": 1322, "train_speed(iter/s)": 0.30835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 285.5, "completions/min_length": 224.0, "epoch": 1.0942928039702233, "grad_norm": 0.22931581735610962, "kl": 0.0733642578125, "learning_rate": 5.050530271667602e-06, "loss": 0.0007334202527999878, "memory(GiB)": 37.98, "reward": 0.5718088150024414, "reward_std": 0.139789879322052, "rewards/VisualizationJSONCombinedORM/mean": 0.5718088150024414, "rewards/VisualizationJSONCombinedORM/std": 0.1401406228542328, "step": 1323, "train_speed(iter/s)": 0.306394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 297.0625, "completions/min_length": 239.0, "epoch": 1.0951199338296111, "grad_norm": 0.17940549552440643, "kl": 0.06982421875, "learning_rate": 5.043311857035499e-06, "loss": 0.0006987005472183228, "memory(GiB)": 37.98, "reward": 0.6513657569885254, "reward_std": 0.11374007165431976, "rewards/VisualizationJSONCombinedORM/mean": 0.6513657569885254, "rewards/VisualizationJSONCombinedORM/std": 0.11168383806943893, "step": 1324, "train_speed(iter/s)": 0.304972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 278.5625, "completions/min_length": 220.0, "epoch": 1.0959470636889992, "grad_norm": 0.2024819254875183, "kl": 0.055419921875, "learning_rate": 5.036093352123993e-06, "loss": 0.0005542188882827759, "memory(GiB)": 37.98, "reward": 0.5287127494812012, "reward_std": 0.09873200953006744, "rewards/VisualizationJSONCombinedORM/mean": 0.5287127494812012, "rewards/VisualizationJSONCombinedORM/std": 0.1978137195110321, "step": 1325, "train_speed(iter/s)": 0.30342 }, { "epoch": 1.0959470636889992, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 325.5833333333333, "eval_completions/mean_length": 276.5, "eval_completions/min_length": 234.91666666666666, "eval_kl": 0.076385498046875, "eval_loss": 0.0007686440949328244, "eval_reward": 0.4982845392078161, "eval_reward_std": 0.07000929703159879, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4982845392078161, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07000929703159879, "eval_runtime": 287.0728, "eval_samples_per_second": 0.084, "eval_steps_per_second": 0.01, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 275.875, "completions/min_length": 216.0, "epoch": 1.096774193548387, "grad_norm": 0.22491824626922607, "kl": 0.0802001953125, "learning_rate": 5.0288747719793584e-06, "loss": 0.000802062451839447, "memory(GiB)": 37.98, "reward": 0.5966269969940186, "reward_std": 0.0753573551774025, "rewards/VisualizationJSONCombinedORM/mean": 0.5966269969940186, "rewards/VisualizationJSONCombinedORM/std": 0.142076775431633, "step": 1326, "train_speed(iter/s)": 0.283459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 272.1875, "completions/min_length": 206.0, "epoch": 1.097601323407775, "grad_norm": 0.16699416935443878, "kl": 0.1026611328125, "learning_rate": 5.021656131648037e-06, "loss": 0.001027185469865799, "memory(GiB)": 37.98, "reward": 0.5556639432907104, "reward_std": 0.10155126452445984, "rewards/VisualizationJSONCombinedORM/mean": 0.5556639432907104, "rewards/VisualizationJSONCombinedORM/std": 0.1102810725569725, "step": 1327, "train_speed(iter/s)": 0.282242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 290.8125, "completions/min_length": 226.0, "epoch": 1.0984284532671629, "grad_norm": 0.1764785349369049, "kl": 0.04620361328125, "learning_rate": 5.014437446176588e-06, "loss": 0.00046147406101226807, "memory(GiB)": 37.98, "reward": 0.4634740650653839, "reward_std": 0.11371748149394989, "rewards/VisualizationJSONCombinedORM/mean": 0.4634740650653839, "rewards/VisualizationJSONCombinedORM/std": 0.15742865204811096, "step": 1328, "train_speed(iter/s)": 0.280802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 261.1875, "completions/min_length": 209.0, "epoch": 1.099255583126551, "grad_norm": 0.1808522492647171, "kl": 0.09033203125, "learning_rate": 5.00721873061167e-06, "loss": 0.0009035691618919373, "memory(GiB)": 37.98, "reward": 0.4853322505950928, "reward_std": 0.07043579965829849, "rewards/VisualizationJSONCombinedORM/mean": 0.4853322505950928, "rewards/VisualizationJSONCombinedORM/std": 0.09685930609703064, "step": 1329, "train_speed(iter/s)": 0.27941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 252.625, "completions/min_length": 200.0, "epoch": 1.1000827129859387, "grad_norm": 0.16917684674263, "kl": 0.0623779296875, "learning_rate": 5e-06, "loss": 0.0006241276860237122, "memory(GiB)": 37.98, "reward": 0.5111358165740967, "reward_std": 0.07162147760391235, "rewards/VisualizationJSONCombinedORM/mean": 0.5111358165740967, "rewards/VisualizationJSONCombinedORM/std": 0.18122482299804688, "step": 1330, "train_speed(iter/s)": 0.278186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 268.5625, "completions/min_length": 187.0, "epoch": 1.1009098428453268, "grad_norm": 0.17984603345394135, "kl": 0.062744140625, "learning_rate": 4.992781269388331e-06, "loss": 0.0006267577409744263, "memory(GiB)": 37.98, "reward": 0.45979076623916626, "reward_std": 0.0796041488647461, "rewards/VisualizationJSONCombinedORM/mean": 0.45979076623916626, "rewards/VisualizationJSONCombinedORM/std": 0.08744293451309204, "step": 1331, "train_speed(iter/s)": 0.276908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 286.5, "completions/min_length": 227.0, "epoch": 1.1017369727047146, "grad_norm": 0.1762429177761078, "kl": 0.0718994140625, "learning_rate": 4.985562553823413e-06, "loss": 0.0007203128188848495, "memory(GiB)": 37.98, "reward": 0.38689205050468445, "reward_std": 0.04450703039765358, "rewards/VisualizationJSONCombinedORM/mean": 0.38689205050468445, "rewards/VisualizationJSONCombinedORM/std": 0.06501595675945282, "step": 1332, "train_speed(iter/s)": 0.275649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 286.75, "completions/min_length": 243.0, "epoch": 1.1025641025641026, "grad_norm": 0.1799640953540802, "kl": 0.06085205078125, "learning_rate": 4.978343868351966e-06, "loss": 0.0006097331643104553, "memory(GiB)": 37.98, "reward": 0.5648082494735718, "reward_std": 0.0924636572599411, "rewards/VisualizationJSONCombinedORM/mean": 0.5648082494735718, "rewards/VisualizationJSONCombinedORM/std": 0.09972797334194183, "step": 1333, "train_speed(iter/s)": 0.274204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 282.875, "completions/min_length": 206.0, "epoch": 1.1033912324234905, "grad_norm": 0.18512371182441711, "kl": 0.04583740234375, "learning_rate": 4.971125228020643e-06, "loss": 0.00045941025018692017, "memory(GiB)": 37.98, "reward": 0.5583035349845886, "reward_std": 0.09490984678268433, "rewards/VisualizationJSONCombinedORM/mean": 0.5583035349845886, "rewards/VisualizationJSONCombinedORM/std": 0.1827264428138733, "step": 1334, "train_speed(iter/s)": 0.272878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 233.875, "completions/min_length": 194.0, "epoch": 1.1042183622828785, "grad_norm": 0.17027245461940765, "kl": 0.1385498046875, "learning_rate": 4.963906647876008e-06, "loss": 0.0013855546712875366, "memory(GiB)": 37.98, "reward": 0.21812087297439575, "reward_std": 0.02107204496860504, "rewards/VisualizationJSONCombinedORM/mean": 0.21812087297439575, "rewards/VisualizationJSONCombinedORM/std": 0.020749595016241074, "step": 1335, "train_speed(iter/s)": 0.271855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 265.8125, "completions/min_length": 219.0, "epoch": 1.1050454921422663, "grad_norm": 0.2288043349981308, "kl": 0.078857421875, "learning_rate": 4.956688142964501e-06, "loss": 0.0007897093892097473, "memory(GiB)": 37.98, "reward": 0.6297169923782349, "reward_std": 0.12536853551864624, "rewards/VisualizationJSONCombinedORM/mean": 0.6297169923782349, "rewards/VisualizationJSONCombinedORM/std": 0.1469685137271881, "step": 1336, "train_speed(iter/s)": 0.270774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 297.0625, "completions/min_length": 260.0, "epoch": 1.1058726220016544, "grad_norm": 0.16490799188613892, "kl": 0.066650390625, "learning_rate": 4.9494697283324e-06, "loss": 0.0006671138107776642, "memory(GiB)": 37.98, "reward": 0.538932204246521, "reward_std": 0.047548696398735046, "rewards/VisualizationJSONCombinedORM/mean": 0.538932204246521, "rewards/VisualizationJSONCombinedORM/std": 0.2618958652019501, "step": 1337, "train_speed(iter/s)": 0.269529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 262.4375, "completions/min_length": 213.0, "epoch": 1.1066997518610422, "grad_norm": 0.17024411261081696, "kl": 0.0660400390625, "learning_rate": 4.942251419025797e-06, "loss": 0.0006591677665710449, "memory(GiB)": 37.98, "reward": 0.6996368169784546, "reward_std": 0.07590799033641815, "rewards/VisualizationJSONCombinedORM/mean": 0.6996368169784546, "rewards/VisualizationJSONCombinedORM/std": 0.08864907175302505, "step": 1338, "train_speed(iter/s)": 0.268484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 262.5625, "completions/min_length": 212.0, "epoch": 1.10752688172043, "grad_norm": 0.1815950870513916, "kl": 0.08203125, "learning_rate": 4.935033230090563e-06, "loss": 0.0008198991417884827, "memory(GiB)": 37.98, "reward": 0.4520835280418396, "reward_std": 0.057771701365709305, "rewards/VisualizationJSONCombinedORM/mean": 0.4520835280418396, "rewards/VisualizationJSONCombinedORM/std": 0.2458367496728897, "step": 1339, "train_speed(iter/s)": 0.267333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/mean_length": 230.0625, "completions/min_length": 207.0, "epoch": 1.108354011579818, "grad_norm": 0.17701901495456696, "kl": 0.056396484375, "learning_rate": 4.927815176572322e-06, "loss": 0.0005645044147968292, "memory(GiB)": 37.98, "reward": 0.5827950239181519, "reward_std": 0.0862504094839096, "rewards/VisualizationJSONCombinedORM/mean": 0.5827950239181519, "rewards/VisualizationJSONCombinedORM/std": 0.1648837924003601, "step": 1340, "train_speed(iter/s)": 0.266533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 260.0625, "completions/min_length": 195.0, "epoch": 1.1091811414392059, "grad_norm": 0.2200683057308197, "kl": 0.08349609375, "learning_rate": 4.920597273516413e-06, "loss": 0.0008357018232345581, "memory(GiB)": 37.98, "reward": 0.6579097509384155, "reward_std": 0.10745485872030258, "rewards/VisualizationJSONCombinedORM/mean": 0.6579097509384155, "rewards/VisualizationJSONCombinedORM/std": 0.16116942465305328, "step": 1341, "train_speed(iter/s)": 0.265301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 253.3125, "completions/min_length": 192.0, "epoch": 1.110008271298594, "grad_norm": 0.17612282931804657, "kl": 0.05755615234375, "learning_rate": 4.913379535967859e-06, "loss": 0.000575728714466095, "memory(GiB)": 37.98, "reward": 0.3946908712387085, "reward_std": 0.03181274235248566, "rewards/VisualizationJSONCombinedORM/mean": 0.3946908712387085, "rewards/VisualizationJSONCombinedORM/std": 0.039023078978061676, "step": 1342, "train_speed(iter/s)": 0.264101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 253.4375, "completions/min_length": 209.0, "epoch": 1.1108354011579817, "grad_norm": 0.2191014289855957, "kl": 0.06201171875, "learning_rate": 4.906161978971343e-06, "loss": 0.0006203465163707733, "memory(GiB)": 37.98, "reward": 0.44595733284950256, "reward_std": 0.09371253848075867, "rewards/VisualizationJSONCombinedORM/mean": 0.44595733284950256, "rewards/VisualizationJSONCombinedORM/std": 0.09150831401348114, "step": 1343, "train_speed(iter/s)": 0.262791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 260.0, "completions/min_length": 197.0, "epoch": 1.1116625310173698, "grad_norm": 0.17372749745845795, "kl": 0.08087158203125, "learning_rate": 4.898944617571169e-06, "loss": 0.0008075721561908722, "memory(GiB)": 37.98, "reward": 0.3008368909358978, "reward_std": 0.02733401395380497, "rewards/VisualizationJSONCombinedORM/mean": 0.3008368909358978, "rewards/VisualizationJSONCombinedORM/std": 0.031828321516513824, "step": 1344, "train_speed(iter/s)": 0.261928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 265.1875, "completions/min_length": 200.0, "epoch": 1.1124896608767576, "grad_norm": 0.16763387620449066, "kl": 0.04791259765625, "learning_rate": 4.891727466811236e-06, "loss": 0.0004782751202583313, "memory(GiB)": 37.98, "reward": 0.49913549423217773, "reward_std": 0.06758499145507812, "rewards/VisualizationJSONCombinedORM/mean": 0.49913549423217773, "rewards/VisualizationJSONCombinedORM/std": 0.0695546492934227, "step": 1345, "train_speed(iter/s)": 0.260809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 271.6875, "completions/min_length": 228.0, "epoch": 1.1133167907361456, "grad_norm": 0.1793185919523239, "kl": 0.05767822265625, "learning_rate": 4.8845105417349955e-06, "loss": 0.0005769599229097366, "memory(GiB)": 37.98, "reward": 0.7141029238700867, "reward_std": 0.10391422361135483, "rewards/VisualizationJSONCombinedORM/mean": 0.7141029238700867, "rewards/VisualizationJSONCombinedORM/std": 0.10749677568674088, "step": 1346, "train_speed(iter/s)": 0.25992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 282.0, "completions/min_length": 238.0, "epoch": 1.1141439205955335, "grad_norm": 0.17278918623924255, "kl": 0.0723876953125, "learning_rate": 4.87729385738544e-06, "loss": 0.0007244795560836792, "memory(GiB)": 37.98, "reward": 0.3767238259315491, "reward_std": 0.05362355709075928, "rewards/VisualizationJSONCombinedORM/mean": 0.3767238259315491, "rewards/VisualizationJSONCombinedORM/std": 0.09479695558547974, "step": 1347, "train_speed(iter/s)": 0.258758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 251.375, "completions/min_length": 214.0, "epoch": 1.1149710504549215, "grad_norm": 0.1677435338497162, "kl": 0.05438232421875, "learning_rate": 4.8700774288050515e-06, "loss": 0.0005436278879642487, "memory(GiB)": 37.98, "reward": 0.4209352433681488, "reward_std": 0.059860728681087494, "rewards/VisualizationJSONCombinedORM/mean": 0.4209352433681488, "rewards/VisualizationJSONCombinedORM/std": 0.06185629963874817, "step": 1348, "train_speed(iter/s)": 0.257935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 235.75, "completions/min_length": 206.0, "epoch": 1.1157981803143093, "grad_norm": 0.20023728907108307, "kl": 0.067626953125, "learning_rate": 4.862861271035785e-06, "loss": 0.000674741342663765, "memory(GiB)": 37.98, "reward": 0.4479791224002838, "reward_std": 0.07780607044696808, "rewards/VisualizationJSONCombinedORM/mean": 0.4479791224002838, "rewards/VisualizationJSONCombinedORM/std": 0.1618761420249939, "step": 1349, "train_speed(iter/s)": 0.256916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 273.1875, "completions/min_length": 210.0, "epoch": 1.1166253101736974, "grad_norm": 0.16603493690490723, "kl": 0.06280517578125, "learning_rate": 4.855645399119028e-06, "loss": 0.0006284266710281372, "memory(GiB)": 37.98, "reward": 0.41956043243408203, "reward_std": 0.050250642001628876, "rewards/VisualizationJSONCombinedORM/mean": 0.41956043243408203, "rewards/VisualizationJSONCombinedORM/std": 0.05019953101873398, "step": 1350, "train_speed(iter/s)": 0.255915 }, { "epoch": 1.1166253101736974, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 302.0833333333333, "eval_completions/mean_length": 258.6979166666667, "eval_completions/min_length": 221.70833333333334, "eval_kl": 0.060750325520833336, "eval_loss": 0.0006123576313257217, "eval_reward": 0.44208585284650326, "eval_reward_std": 0.06370949054447313, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44208585284650326, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06370949376529704, "eval_runtime": 272.4243, "eval_samples_per_second": 0.088, "eval_steps_per_second": 0.011, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 258.75, "completions/min_length": 210.0, "epoch": 1.1174524400330852, "grad_norm": 0.16789275407791138, "kl": 0.08544921875, "learning_rate": 4.84842982809557e-06, "loss": 0.00085483118891716, "memory(GiB)": 37.98, "reward": 0.6396117210388184, "reward_std": 0.12433646619319916, "rewards/VisualizationJSONCombinedORM/mean": 0.6396117210388184, "rewards/VisualizationJSONCombinedORM/std": 0.15560223162174225, "step": 1351, "train_speed(iter/s)": 0.242329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 257.0, "completions/min_length": 212.0, "epoch": 1.118279569892473, "grad_norm": 0.20313142240047455, "kl": 0.083984375, "learning_rate": 4.841214573005578e-06, "loss": 0.0008389130234718323, "memory(GiB)": 37.98, "reward": 0.4572947025299072, "reward_std": 0.07127489149570465, "rewards/VisualizationJSONCombinedORM/mean": 0.4572947025299072, "rewards/VisualizationJSONCombinedORM/std": 0.2643832564353943, "step": 1352, "train_speed(iter/s)": 0.241242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 263.75, "completions/min_length": 207.0, "epoch": 1.119106699751861, "grad_norm": 0.1640724390745163, "kl": 0.0390625, "learning_rate": 4.833999648888556e-06, "loss": 0.0003910362720489502, "memory(GiB)": 37.98, "reward": 0.6136648654937744, "reward_std": 0.07207350432872772, "rewards/VisualizationJSONCombinedORM/mean": 0.6136648654937744, "rewards/VisualizationJSONCombinedORM/std": 0.16662396490573883, "step": 1353, "train_speed(iter/s)": 0.240279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 255.25, "completions/min_length": 216.0, "epoch": 1.1199338296112489, "grad_norm": 0.1514604538679123, "kl": 0.07427978515625, "learning_rate": 4.826785070783326e-06, "loss": 0.0007415711879730225, "memory(GiB)": 37.98, "reward": 0.655237078666687, "reward_std": 0.07235237210988998, "rewards/VisualizationJSONCombinedORM/mean": 0.655237078666687, "rewards/VisualizationJSONCombinedORM/std": 0.1006702408194542, "step": 1354, "train_speed(iter/s)": 0.239403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 264.3125, "completions/min_length": 209.0, "epoch": 1.120760959470637, "grad_norm": 0.1474527269601822, "kl": 0.047119140625, "learning_rate": 4.819570853727975e-06, "loss": 0.0004714652895927429, "memory(GiB)": 37.98, "reward": 0.47612833976745605, "reward_std": 0.07584845274686813, "rewards/VisualizationJSONCombinedORM/mean": 0.47612833976745605, "rewards/VisualizationJSONCombinedORM/std": 0.19012890756130219, "step": 1355, "train_speed(iter/s)": 0.23855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 248.5625, "completions/min_length": 221.0, "epoch": 1.1215880893300247, "grad_norm": 0.18531093001365662, "kl": 0.0548095703125, "learning_rate": 4.8123570127598514e-06, "loss": 0.0005483254790306091, "memory(GiB)": 37.98, "reward": 0.3392525315284729, "reward_std": 0.03702482581138611, "rewards/VisualizationJSONCombinedORM/mean": 0.3392525315284729, "rewards/VisualizationJSONCombinedORM/std": 0.03653538227081299, "step": 1356, "train_speed(iter/s)": 0.237598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 241.6875, "completions/min_length": 203.0, "epoch": 1.1224152191894128, "grad_norm": 0.20048578083515167, "kl": 0.0482177734375, "learning_rate": 4.805143562915511e-06, "loss": 0.00048182904720306396, "memory(GiB)": 37.98, "reward": 0.6263347864151001, "reward_std": 0.0838780626654625, "rewards/VisualizationJSONCombinedORM/mean": 0.6263347864151001, "rewards/VisualizationJSONCombinedORM/std": 0.20226158201694489, "step": 1357, "train_speed(iter/s)": 0.236675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 256.1875, "completions/min_length": 207.0, "epoch": 1.1232423490488006, "grad_norm": 0.20570354163646698, "kl": 0.05084228515625, "learning_rate": 4.797930519230699e-06, "loss": 0.0005080029368400574, "memory(GiB)": 37.98, "reward": 0.3421611189842224, "reward_std": 0.05014488846063614, "rewards/VisualizationJSONCombinedORM/mean": 0.3421611189842224, "rewards/VisualizationJSONCombinedORM/std": 0.052205126732587814, "step": 1358, "train_speed(iter/s)": 0.235813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 267.75, "completions/min_length": 205.0, "epoch": 1.1240694789081886, "grad_norm": 0.17535312473773956, "kl": 0.04315185546875, "learning_rate": 4.790717896740311e-06, "loss": 0.0004324205219745636, "memory(GiB)": 37.98, "reward": 0.6154934167861938, "reward_std": 0.0480395071208477, "rewards/VisualizationJSONCombinedORM/mean": 0.6154934167861938, "rewards/VisualizationJSONCombinedORM/std": 0.12658703327178955, "step": 1359, "train_speed(iter/s)": 0.235004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 272.9375, "completions/min_length": 231.0, "epoch": 1.1248966087675765, "grad_norm": 0.17733509838581085, "kl": 0.05279541015625, "learning_rate": 4.783505710478366e-06, "loss": 0.0005278438329696655, "memory(GiB)": 37.98, "reward": 0.5423397421836853, "reward_std": 0.08328409492969513, "rewards/VisualizationJSONCombinedORM/mean": 0.5423397421836853, "rewards/VisualizationJSONCombinedORM/std": 0.13878686726093292, "step": 1360, "train_speed(iter/s)": 0.23427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 262.6875, "completions/min_length": 194.0, "epoch": 1.1257237386269645, "grad_norm": 0.16835276782512665, "kl": 0.0491943359375, "learning_rate": 4.776293975477973e-06, "loss": 0.000491628423333168, "memory(GiB)": 37.98, "reward": 0.5044094324111938, "reward_std": 0.08829016983509064, "rewards/VisualizationJSONCombinedORM/mean": 0.5044094324111938, "rewards/VisualizationJSONCombinedORM/std": 0.14950591325759888, "step": 1361, "train_speed(iter/s)": 0.233616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 258.9375, "completions/min_length": 225.0, "epoch": 1.1265508684863523, "grad_norm": 0.18750444054603577, "kl": 0.0567626953125, "learning_rate": 4.7690827067713035e-06, "loss": 0.0005679372698068619, "memory(GiB)": 37.98, "reward": 0.4832191467285156, "reward_std": 0.06202990561723709, "rewards/VisualizationJSONCombinedORM/mean": 0.4832191467285156, "rewards/VisualizationJSONCombinedORM/std": 0.07742120325565338, "step": 1362, "train_speed(iter/s)": 0.232771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 242.375, "completions/min_length": 215.0, "epoch": 1.1273779983457404, "grad_norm": 0.1755717247724533, "kl": 0.0662841796875, "learning_rate": 4.761871919389552e-06, "loss": 0.000663459300994873, "memory(GiB)": 37.98, "reward": 0.6572356820106506, "reward_std": 0.1033051609992981, "rewards/VisualizationJSONCombinedORM/mean": 0.6572356820106506, "rewards/VisualizationJSONCombinedORM/std": 0.14769351482391357, "step": 1363, "train_speed(iter/s)": 0.231843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/mean_length": 239.0, "completions/min_length": 210.0, "epoch": 1.1282051282051282, "grad_norm": 0.18687006831169128, "kl": 0.06671142578125, "learning_rate": 4.75466162836291e-06, "loss": 0.0006677061319351196, "memory(GiB)": 37.98, "reward": 0.4899383783340454, "reward_std": 0.07267414778470993, "rewards/VisualizationJSONCombinedORM/mean": 0.4899383783340454, "rewards/VisualizationJSONCombinedORM/std": 0.0711788609623909, "step": 1364, "train_speed(iter/s)": 0.231048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 262.0625, "completions/min_length": 204.0, "epoch": 1.129032258064516, "grad_norm": 0.14455771446228027, "kl": 0.052978515625, "learning_rate": 4.747451848720542e-06, "loss": 0.0005297139286994934, "memory(GiB)": 37.98, "reward": 0.5864415168762207, "reward_std": 0.06953907012939453, "rewards/VisualizationJSONCombinedORM/mean": 0.5864415168762207, "rewards/VisualizationJSONCombinedORM/std": 0.1985940784215927, "step": 1365, "train_speed(iter/s)": 0.230145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 276.375, "completions/min_length": 216.0, "epoch": 1.129859387923904, "grad_norm": 0.1704527735710144, "kl": 0.05865478515625, "learning_rate": 4.740242595490537e-06, "loss": 0.0005868338048458099, "memory(GiB)": 37.98, "reward": 0.40379396080970764, "reward_std": 0.045033544301986694, "rewards/VisualizationJSONCombinedORM/mean": 0.40379396080970764, "rewards/VisualizationJSONCombinedORM/std": 0.07011591643095016, "step": 1366, "train_speed(iter/s)": 0.229297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 237.625, "completions/min_length": 192.0, "epoch": 1.130686517783292, "grad_norm": 0.15099820494651794, "kl": 0.0802001953125, "learning_rate": 4.7330338836998945e-06, "loss": 0.0008007809519767761, "memory(GiB)": 37.98, "reward": 0.7391449213027954, "reward_std": 0.07361815124750137, "rewards/VisualizationJSONCombinedORM/mean": 0.7391449213027954, "rewards/VisualizationJSONCombinedORM/std": 0.08100573718547821, "step": 1367, "train_speed(iter/s)": 0.228485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 260.8125, "completions/min_length": 221.0, "epoch": 1.13151364764268, "grad_norm": 0.1811073124408722, "kl": 0.07208251953125, "learning_rate": 4.725825728374479e-06, "loss": 0.0007195174694061279, "memory(GiB)": 38.02, "reward": 0.5035949945449829, "reward_std": 0.07808913290500641, "rewards/VisualizationJSONCombinedORM/mean": 0.5035949945449829, "rewards/VisualizationJSONCombinedORM/std": 0.07669095695018768, "step": 1368, "train_speed(iter/s)": 0.227477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 261.875, "completions/min_length": 203.0, "epoch": 1.1323407775020677, "grad_norm": 0.18357695639133453, "kl": 0.0975341796875, "learning_rate": 4.718618144538999e-06, "loss": 0.0009750723838806152, "memory(GiB)": 38.02, "reward": 0.45407000184059143, "reward_std": 0.07853133231401443, "rewards/VisualizationJSONCombinedORM/mean": 0.45407000184059143, "rewards/VisualizationJSONCombinedORM/std": 0.1446046084165573, "step": 1369, "train_speed(iter/s)": 0.226671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 256.375, "completions/min_length": 207.0, "epoch": 1.1331679073614558, "grad_norm": 0.1875678151845932, "kl": 0.144287109375, "learning_rate": 4.711411147216969e-06, "loss": 0.0014435239136219025, "memory(GiB)": 38.02, "reward": 0.5090552568435669, "reward_std": 0.11350700259208679, "rewards/VisualizationJSONCombinedORM/mean": 0.5090552568435669, "rewards/VisualizationJSONCombinedORM/std": 0.13303035497665405, "step": 1370, "train_speed(iter/s)": 0.22612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 245.75, "completions/min_length": 215.0, "epoch": 1.1339950372208436, "grad_norm": 0.18679837882518768, "kl": 0.05157470703125, "learning_rate": 4.704204751430687e-06, "loss": 0.0005158185958862305, "memory(GiB)": 38.02, "reward": 0.7107547521591187, "reward_std": 0.06669695675373077, "rewards/VisualizationJSONCombinedORM/mean": 0.7107547521591187, "rewards/VisualizationJSONCombinedORM/std": 0.13681966066360474, "step": 1371, "train_speed(iter/s)": 0.225423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 266.3125, "completions/min_length": 209.0, "epoch": 1.1348221670802316, "grad_norm": 0.19733649492263794, "kl": 0.04840087890625, "learning_rate": 4.696998972201189e-06, "loss": 0.00048451870679855347, "memory(GiB)": 38.02, "reward": 0.5223507881164551, "reward_std": 0.07183882594108582, "rewards/VisualizationJSONCombinedORM/mean": 0.5223507881164551, "rewards/VisualizationJSONCombinedORM/std": 0.21646033227443695, "step": 1372, "train_speed(iter/s)": 0.224481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 266.5, "completions/min_length": 227.0, "epoch": 1.1356492969396195, "grad_norm": 0.19230544567108154, "kl": 0.0626220703125, "learning_rate": 4.6897938245482285e-06, "loss": 0.0006259791553020477, "memory(GiB)": 38.02, "reward": 0.24744966626167297, "reward_std": 0.0273408442735672, "rewards/VisualizationJSONCombinedORM/mean": 0.24744966626167297, "rewards/VisualizationJSONCombinedORM/std": 0.028296368196606636, "step": 1373, "train_speed(iter/s)": 0.223539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 262.125, "completions/min_length": 206.0, "epoch": 1.1364764267990075, "grad_norm": 0.20433108508586884, "kl": 0.0556640625, "learning_rate": 4.6825893234902485e-06, "loss": 0.0005570538341999054, "memory(GiB)": 38.02, "reward": 0.3246763348579407, "reward_std": 0.06436659395694733, "rewards/VisualizationJSONCombinedORM/mean": 0.3246763348579407, "rewards/VisualizationJSONCombinedORM/std": 0.09238123148679733, "step": 1374, "train_speed(iter/s)": 0.222857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 281.25, "completions/min_length": 209.0, "epoch": 1.1373035566583953, "grad_norm": 0.22097280621528625, "kl": 0.06781005859375, "learning_rate": 4.6753854840443375e-06, "loss": 0.0006764531135559082, "memory(GiB)": 38.02, "reward": 0.573415219783783, "reward_std": 0.0865626186132431, "rewards/VisualizationJSONCombinedORM/mean": 0.573415219783783, "rewards/VisualizationJSONCombinedORM/std": 0.1985103338956833, "step": 1375, "train_speed(iter/s)": 0.22207 }, { "epoch": 1.1373035566583953, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 306.5416666666667, "eval_completions/mean_length": 258.4947916666667, "eval_completions/min_length": 225.33333333333334, "eval_kl": 0.082000732421875, "eval_loss": 0.0008283357019536197, "eval_reward": 0.49095556636651355, "eval_reward_std": 0.07182584377005696, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.49095556636651355, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07182584734012683, "eval_runtime": 274.6185, "eval_samples_per_second": 0.087, "eval_steps_per_second": 0.011, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 262.5625, "completions/min_length": 195.0, "epoch": 1.1381306865177834, "grad_norm": 0.17060130834579468, "kl": 0.070068359375, "learning_rate": 4.668182321226205e-06, "loss": 0.0007004216313362122, "memory(GiB)": 38.02, "reward": 0.5657873749732971, "reward_std": 0.045460958033800125, "rewards/VisualizationJSONCombinedORM/mean": 0.5657873749732971, "rewards/VisualizationJSONCombinedORM/std": 0.1623382270336151, "step": 1376, "train_speed(iter/s)": 0.212087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 257.8125, "completions/min_length": 204.0, "epoch": 1.1389578163771712, "grad_norm": 0.16749900579452515, "kl": 0.076904296875, "learning_rate": 4.660979850050153e-06, "loss": 0.0007688738405704498, "memory(GiB)": 38.02, "reward": 0.5873784422874451, "reward_std": 0.08959769457578659, "rewards/VisualizationJSONCombinedORM/mean": 0.5873784422874451, "rewards/VisualizationJSONCombinedORM/std": 0.10363254696130753, "step": 1377, "train_speed(iter/s)": 0.211578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 250.875, "completions/min_length": 212.0, "epoch": 1.139784946236559, "grad_norm": 0.2169182449579239, "kl": 0.1380615234375, "learning_rate": 4.653778085529043e-06, "loss": 0.0013816729187965393, "memory(GiB)": 38.02, "reward": 0.31117093563079834, "reward_std": 0.033523932099342346, "rewards/VisualizationJSONCombinedORM/mean": 0.31117093563079834, "rewards/VisualizationJSONCombinedORM/std": 0.12184394150972366, "step": 1378, "train_speed(iter/s)": 0.210931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 253.625, "completions/min_length": 204.0, "epoch": 1.140612076095947, "grad_norm": 0.22687430679798126, "kl": 0.07989501953125, "learning_rate": 4.6465770426742595e-06, "loss": 0.0007996931672096252, "memory(GiB)": 38.02, "reward": 0.570216953754425, "reward_std": 0.09489841759204865, "rewards/VisualizationJSONCombinedORM/mean": 0.570216953754425, "rewards/VisualizationJSONCombinedORM/std": 0.0953545868396759, "step": 1379, "train_speed(iter/s)": 0.210332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 299.375, "completions/min_length": 222.0, "epoch": 1.141439205955335, "grad_norm": 0.17799048125743866, "kl": 0.0657958984375, "learning_rate": 4.639376736495683e-06, "loss": 0.0006589889526367188, "memory(GiB)": 38.02, "reward": 0.7103986144065857, "reward_std": 0.10565871000289917, "rewards/VisualizationJSONCombinedORM/mean": 0.7103986144065857, "rewards/VisualizationJSONCombinedORM/std": 0.11824418604373932, "step": 1380, "train_speed(iter/s)": 0.209579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 250.125, "completions/min_length": 208.0, "epoch": 1.142266335814723, "grad_norm": 0.15472820401191711, "kl": 0.1002197265625, "learning_rate": 4.6321771820016635e-06, "loss": 0.0010013356804847717, "memory(GiB)": 38.02, "reward": 0.6759693622589111, "reward_std": 0.05495257303118706, "rewards/VisualizationJSONCombinedORM/mean": 0.6759693622589111, "rewards/VisualizationJSONCombinedORM/std": 0.06036243215203285, "step": 1381, "train_speed(iter/s)": 0.208938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 254.25, "completions/min_length": 204.0, "epoch": 1.1430934656741107, "grad_norm": 0.17202867567539215, "kl": 0.1025390625, "learning_rate": 4.624978394198978e-06, "loss": 0.0010259542614221573, "memory(GiB)": 38.02, "reward": 0.5756815075874329, "reward_std": 0.0918472409248352, "rewards/VisualizationJSONCombinedORM/mean": 0.5756815075874329, "rewards/VisualizationJSONCombinedORM/std": 0.22906967997550964, "step": 1382, "train_speed(iter/s)": 0.208326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 268.375, "completions/min_length": 186.0, "epoch": 1.1439205955334988, "grad_norm": 0.20300573110580444, "kl": 0.123779296875, "learning_rate": 4.617780388092812e-06, "loss": 0.001234591007232666, "memory(GiB)": 38.02, "reward": 0.3584992289543152, "reward_std": 0.07056065648794174, "rewards/VisualizationJSONCombinedORM/mean": 0.3584992289543152, "rewards/VisualizationJSONCombinedORM/std": 0.11171162128448486, "step": 1383, "train_speed(iter/s)": 0.207717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/mean_length": 232.0, "completions/min_length": 202.0, "epoch": 1.1447477253928866, "grad_norm": 0.17939800024032593, "kl": 0.0860595703125, "learning_rate": 4.610583178686715e-06, "loss": 0.0008590444922447205, "memory(GiB)": 38.02, "reward": 0.34926638007164, "reward_std": 0.045528165996074677, "rewards/VisualizationJSONCombinedORM/mean": 0.34926638007164, "rewards/VisualizationJSONCombinedORM/std": 0.04838109761476517, "step": 1384, "train_speed(iter/s)": 0.207069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 261.0625, "completions/min_length": 222.0, "epoch": 1.1455748552522746, "grad_norm": 0.15512706339359283, "kl": 0.07232666015625, "learning_rate": 4.603386780982579e-06, "loss": 0.0007226951420307159, "memory(GiB)": 38.02, "reward": 0.5800859928131104, "reward_std": 0.09438107907772064, "rewards/VisualizationJSONCombinedORM/mean": 0.5800859928131104, "rewards/VisualizationJSONCombinedORM/std": 0.11906915158033371, "step": 1385, "train_speed(iter/s)": 0.206417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/mean_length": 233.0625, "completions/min_length": 205.0, "epoch": 1.1464019851116625, "grad_norm": 0.19746576249599457, "kl": 0.08013916015625, "learning_rate": 4.596191209980604e-06, "loss": 0.0008014459162950516, "memory(GiB)": 38.02, "reward": 0.46113479137420654, "reward_std": 0.055200979113578796, "rewards/VisualizationJSONCombinedORM/mean": 0.46113479137420654, "rewards/VisualizationJSONCombinedORM/std": 0.18938402831554413, "step": 1386, "train_speed(iter/s)": 0.205832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/mean_length": 221.9375, "completions/min_length": 196.0, "epoch": 1.1472291149710505, "grad_norm": 0.24117514491081238, "kl": 0.0712890625, "learning_rate": 4.588996480679267e-06, "loss": 0.000712614506483078, "memory(GiB)": 38.02, "reward": 0.41023749113082886, "reward_std": 0.057492487132549286, "rewards/VisualizationJSONCombinedORM/mean": 0.41023749113082886, "rewards/VisualizationJSONCombinedORM/std": 0.13531212508678436, "step": 1387, "train_speed(iter/s)": 0.205263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 253.5625, "completions/min_length": 218.0, "epoch": 1.1480562448304383, "grad_norm": 0.1863030195236206, "kl": 0.08251953125, "learning_rate": 4.58180260807529e-06, "loss": 0.0008247233927249908, "memory(GiB)": 38.02, "reward": 0.5670797824859619, "reward_std": 0.11915924400091171, "rewards/VisualizationJSONCombinedORM/mean": 0.5670797824859619, "rewards/VisualizationJSONCombinedORM/std": 0.18005141615867615, "step": 1388, "train_speed(iter/s)": 0.204836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 266.5625, "completions/min_length": 206.0, "epoch": 1.1488833746898264, "grad_norm": 0.2110867202281952, "kl": 0.0804443359375, "learning_rate": 4.574609607163609e-06, "loss": 0.0008064545691013336, "memory(GiB)": 38.02, "reward": 0.5202140808105469, "reward_std": 0.07740309834480286, "rewards/VisualizationJSONCombinedORM/mean": 0.5202140808105469, "rewards/VisualizationJSONCombinedORM/std": 0.19509223103523254, "step": 1389, "train_speed(iter/s)": 0.204197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/mean_length": 236.5625, "completions/min_length": 195.0, "epoch": 1.1497105045492142, "grad_norm": 0.1548580527305603, "kl": 0.05694580078125, "learning_rate": 4.567417492937344e-06, "loss": 0.0005691871047019958, "memory(GiB)": 38.02, "reward": 0.6273612380027771, "reward_std": 0.0793706476688385, "rewards/VisualizationJSONCombinedORM/mean": 0.6273612380027771, "rewards/VisualizationJSONCombinedORM/std": 0.17226362228393555, "step": 1390, "train_speed(iter/s)": 0.20353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 249.125, "completions/min_length": 186.0, "epoch": 1.1505376344086022, "grad_norm": 0.1632174849510193, "kl": 0.07342529296875, "learning_rate": 4.560226280387766e-06, "loss": 0.0007334146648645401, "memory(GiB)": 38.02, "reward": 0.39388570189476013, "reward_std": 0.04347279295325279, "rewards/VisualizationJSONCombinedORM/mean": 0.39388570189476013, "rewards/VisualizationJSONCombinedORM/std": 0.1785379946231842, "step": 1391, "train_speed(iter/s)": 0.202974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/mean_length": 245.3125, "completions/min_length": 203.0, "epoch": 1.15136476426799, "grad_norm": 0.16929742693901062, "kl": 0.06170654296875, "learning_rate": 4.553035984504269e-06, "loss": 0.0006172731518745422, "memory(GiB)": 38.02, "reward": 0.462564617395401, "reward_std": 0.049321405589580536, "rewards/VisualizationJSONCombinedORM/mean": 0.462564617395401, "rewards/VisualizationJSONCombinedORM/std": 0.1941603571176529, "step": 1392, "train_speed(iter/s)": 0.202327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 251.9375, "completions/min_length": 209.0, "epoch": 1.152191894127378, "grad_norm": 0.1830049455165863, "kl": 0.0928955078125, "learning_rate": 4.545846620274329e-06, "loss": 0.0009279577061533928, "memory(GiB)": 38.02, "reward": 0.7115359306335449, "reward_std": 0.10804291814565659, "rewards/VisualizationJSONCombinedORM/mean": 0.7115359306335449, "rewards/VisualizationJSONCombinedORM/std": 0.10941699892282486, "step": 1393, "train_speed(iter/s)": 0.201784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 240.0625, "completions/min_length": 206.0, "epoch": 1.153019023986766, "grad_norm": 0.19913487136363983, "kl": 0.1043701171875, "learning_rate": 4.53865820268349e-06, "loss": 0.0010447930544614792, "memory(GiB)": 38.02, "reward": 0.5263898372650146, "reward_std": 0.08833712339401245, "rewards/VisualizationJSONCombinedORM/mean": 0.5263898372650146, "rewards/VisualizationJSONCombinedORM/std": 0.15875022113323212, "step": 1394, "train_speed(iter/s)": 0.20122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 233.125, "completions/min_length": 193.0, "epoch": 1.1538461538461537, "grad_norm": 0.18264129757881165, "kl": 0.11370849609375, "learning_rate": 4.531470746715317e-06, "loss": 0.0011380314826965332, "memory(GiB)": 38.02, "reward": 0.4347784221172333, "reward_std": 0.09031298011541367, "rewards/VisualizationJSONCombinedORM/mean": 0.4347784221172333, "rewards/VisualizationJSONCombinedORM/std": 0.09144066274166107, "step": 1395, "train_speed(iter/s)": 0.200788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/mean_length": 236.4375, "completions/min_length": 210.0, "epoch": 1.1546732837055418, "grad_norm": 0.18514497578144073, "kl": 0.1219482421875, "learning_rate": 4.524284267351372e-06, "loss": 0.0012195073068141937, "memory(GiB)": 38.02, "reward": 0.40127697587013245, "reward_std": 0.06805310398340225, "rewards/VisualizationJSONCombinedORM/mean": 0.40127697587013245, "rewards/VisualizationJSONCombinedORM/std": 0.14425566792488098, "step": 1396, "train_speed(iter/s)": 0.200283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 248.1875, "completions/min_length": 200.0, "epoch": 1.1555004135649296, "grad_norm": 0.18282605707645416, "kl": 0.0838623046875, "learning_rate": 4.517098779571179e-06, "loss": 0.0008396357297897339, "memory(GiB)": 38.02, "reward": 0.5241827964782715, "reward_std": 0.05488546937704086, "rewards/VisualizationJSONCombinedORM/mean": 0.5241827964782715, "rewards/VisualizationJSONCombinedORM/std": 0.15294213593006134, "step": 1397, "train_speed(iter/s)": 0.199787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 249.5625, "completions/min_length": 203.0, "epoch": 1.1563275434243176, "grad_norm": 0.19107475876808167, "kl": 0.1202392578125, "learning_rate": 4.509914298352197e-06, "loss": 0.0012003779411315918, "memory(GiB)": 38.02, "reward": 0.659572958946228, "reward_std": 0.08751149475574493, "rewards/VisualizationJSONCombinedORM/mean": 0.659572958946228, "rewards/VisualizationJSONCombinedORM/std": 0.10101944953203201, "step": 1398, "train_speed(iter/s)": 0.199124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 258.0, "completions/min_length": 212.0, "epoch": 1.1571546732837055, "grad_norm": 0.2638097107410431, "kl": 0.0738525390625, "learning_rate": 4.50273083866979e-06, "loss": 0.0007369071245193481, "memory(GiB)": 38.02, "reward": 0.5968149900436401, "reward_std": 0.09830321371555328, "rewards/VisualizationJSONCombinedORM/mean": 0.5968149900436401, "rewards/VisualizationJSONCombinedORM/std": 0.0980139970779419, "step": 1399, "train_speed(iter/s)": 0.198607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 236.125, "completions/min_length": 206.0, "epoch": 1.1579818031430935, "grad_norm": 0.1795675903558731, "kl": 0.1015625, "learning_rate": 4.4955484154971875e-06, "loss": 0.0010156556963920593, "memory(GiB)": 38.02, "reward": 0.47391486167907715, "reward_std": 0.06542296707630157, "rewards/VisualizationJSONCombinedORM/mean": 0.47391486167907715, "rewards/VisualizationJSONCombinedORM/std": 0.1857345849275589, "step": 1400, "train_speed(iter/s)": 0.198102 }, { "epoch": 1.1579818031430935, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 283.0833333333333, "eval_completions/mean_length": 243.48958333333334, "eval_completions/min_length": 213.91666666666666, "eval_kl": 0.09218343098958333, "eval_loss": 0.0009201386128552258, "eval_reward": 0.5177596838523945, "eval_reward_std": 0.0690578796978419, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.5177596838523945, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06905788245300452, "eval_runtime": 260.687, "eval_samples_per_second": 0.092, "eval_steps_per_second": 0.012, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/mean_length": 232.25, "completions/min_length": 200.0, "epoch": 1.1588089330024813, "grad_norm": 0.19632551074028015, "kl": 0.08642578125, "learning_rate": 4.488367043805462e-06, "loss": 0.0008665695786476135, "memory(GiB)": 38.02, "reward": 0.47281837463378906, "reward_std": 0.052630942314863205, "rewards/VisualizationJSONCombinedORM/mean": 0.47281837463378906, "rewards/VisualizationJSONCombinedORM/std": 0.05334394797682762, "step": 1401, "train_speed(iter/s)": 0.190675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 249.5, "completions/min_length": 200.0, "epoch": 1.1596360628618694, "grad_norm": 0.17059162259101868, "kl": 0.1593017578125, "learning_rate": 4.4811867385634916e-06, "loss": 0.001600801944732666, "memory(GiB)": 38.02, "reward": 0.7224850654602051, "reward_std": 0.08406111598014832, "rewards/VisualizationJSONCombinedORM/mean": 0.7224850654602051, "rewards/VisualizationJSONCombinedORM/std": 0.08644551783800125, "step": 1402, "train_speed(iter/s)": 0.190202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 235.625, "completions/min_length": 192.0, "epoch": 1.1604631927212572, "grad_norm": 0.21564525365829468, "kl": 0.07025146484375, "learning_rate": 4.474007514737933e-06, "loss": 0.0007027089595794678, "memory(GiB)": 38.02, "reward": 0.5944861173629761, "reward_std": 0.07594846189022064, "rewards/VisualizationJSONCombinedORM/mean": 0.5944861173629761, "rewards/VisualizationJSONCombinedORM/std": 0.1595478057861328, "step": 1403, "train_speed(iter/s)": 0.18972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 249.625, "completions/min_length": 211.0, "epoch": 1.1612903225806452, "grad_norm": 0.19837017357349396, "kl": 0.0994873046875, "learning_rate": 4.4668293872931904e-06, "loss": 0.0009953295812010765, "memory(GiB)": 38.02, "reward": 0.6205474138259888, "reward_std": 0.07231032848358154, "rewards/VisualizationJSONCombinedORM/mean": 0.6205474138259888, "rewards/VisualizationJSONCombinedORM/std": 0.1871754229068756, "step": 1404, "train_speed(iter/s)": 0.189231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 232.6875, "completions/min_length": 199.0, "epoch": 1.162117452440033, "grad_norm": 0.20165511965751648, "kl": 0.043212890625, "learning_rate": 4.459652371191381e-06, "loss": 0.0004322417080402374, "memory(GiB)": 38.02, "reward": 0.5195980072021484, "reward_std": 0.10485349595546722, "rewards/VisualizationJSONCombinedORM/mean": 0.5195980072021484, "rewards/VisualizationJSONCombinedORM/std": 0.13557429611682892, "step": 1405, "train_speed(iter/s)": 0.188711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 255.4375, "completions/min_length": 203.0, "epoch": 1.162944582299421, "grad_norm": 0.2250574827194214, "kl": 0.0980224609375, "learning_rate": 4.452476481392302e-06, "loss": 0.0009808503091335297, "memory(GiB)": 38.02, "reward": 0.4048880934715271, "reward_std": 0.05060012638568878, "rewards/VisualizationJSONCombinedORM/mean": 0.4048880934715271, "rewards/VisualizationJSONCombinedORM/std": 0.052541494369506836, "step": 1406, "train_speed(iter/s)": 0.188247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 247.75, "completions/min_length": 200.0, "epoch": 1.163771712158809, "grad_norm": 0.19329726696014404, "kl": 0.07977294921875, "learning_rate": 4.4453017328534115e-06, "loss": 0.0007974058389663696, "memory(GiB)": 38.02, "reward": 0.5909841060638428, "reward_std": 0.10860966891050339, "rewards/VisualizationJSONCombinedORM/mean": 0.5909841060638428, "rewards/VisualizationJSONCombinedORM/std": 0.11055408418178558, "step": 1407, "train_speed(iter/s)": 0.187773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 245.8125, "completions/min_length": 206.0, "epoch": 1.1645988420181967, "grad_norm": 0.26008540391921997, "kl": 0.095703125, "learning_rate": 4.438128140529779e-06, "loss": 0.0009551160037517548, "memory(GiB)": 38.02, "reward": 0.5880164504051208, "reward_std": 0.12039043009281158, "rewards/VisualizationJSONCombinedORM/mean": 0.5880164504051208, "rewards/VisualizationJSONCombinedORM/std": 0.19375504553318024, "step": 1408, "train_speed(iter/s)": 0.187328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 256.25, "completions/min_length": 198.0, "epoch": 1.1654259718775848, "grad_norm": 0.2058500349521637, "kl": 0.0750732421875, "learning_rate": 4.430955719374073e-06, "loss": 0.0007510408759117126, "memory(GiB)": 38.02, "reward": 0.4242664575576782, "reward_std": 0.10735665261745453, "rewards/VisualizationJSONCombinedORM/mean": 0.4242664575576782, "rewards/VisualizationJSONCombinedORM/std": 0.12717938423156738, "step": 1409, "train_speed(iter/s)": 0.186872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/mean_length": 226.0, "completions/min_length": 194.0, "epoch": 1.1662531017369726, "grad_norm": 0.16106398403644562, "kl": 0.0648193359375, "learning_rate": 4.4237844843365126e-06, "loss": 0.0006481409072875977, "memory(GiB)": 38.02, "reward": 0.5493170619010925, "reward_std": 0.04986713081598282, "rewards/VisualizationJSONCombinedORM/mean": 0.5493170619010925, "rewards/VisualizationJSONCombinedORM/std": 0.17394021153450012, "step": 1410, "train_speed(iter/s)": 0.186497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 227.0625, "completions/min_length": 186.0, "epoch": 1.1670802315963607, "grad_norm": 0.2445659190416336, "kl": 0.097900390625, "learning_rate": 4.41661445036485e-06, "loss": 0.000976957380771637, "memory(GiB)": 38.02, "reward": 0.5642956495285034, "reward_std": 0.11828990280628204, "rewards/VisualizationJSONCombinedORM/mean": 0.5642956495285034, "rewards/VisualizationJSONCombinedORM/std": 0.12179228663444519, "step": 1411, "train_speed(iter/s)": 0.186038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 257.0, "completions/min_length": 196.0, "epoch": 1.1679073614557485, "grad_norm": 0.19101713597774506, "kl": 0.0687255859375, "learning_rate": 4.409445632404334e-06, "loss": 0.0006860643625259399, "memory(GiB)": 38.02, "reward": 0.6283591985702515, "reward_std": 0.09818045794963837, "rewards/VisualizationJSONCombinedORM/mean": 0.6283591985702515, "rewards/VisualizationJSONCombinedORM/std": 0.13106383383274078, "step": 1412, "train_speed(iter/s)": 0.185603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 246.25, "completions/min_length": 195.0, "epoch": 1.1687344913151365, "grad_norm": 0.2077266126871109, "kl": 0.07025146484375, "learning_rate": 4.402278045397675e-06, "loss": 0.0007036328315734863, "memory(GiB)": 38.02, "reward": 0.5738422870635986, "reward_std": 0.04992235451936722, "rewards/VisualizationJSONCombinedORM/mean": 0.5738422870635986, "rewards/VisualizationJSONCombinedORM/std": 0.13826055824756622, "step": 1413, "train_speed(iter/s)": 0.185139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/mean_length": 234.8125, "completions/min_length": 198.0, "epoch": 1.1695616211745243, "grad_norm": 0.19863471388816833, "kl": 0.052978515625, "learning_rate": 4.395111704285021e-06, "loss": 0.0005285297520458698, "memory(GiB)": 38.02, "reward": 0.4626081585884094, "reward_std": 0.08411333709955215, "rewards/VisualizationJSONCombinedORM/mean": 0.4626081585884094, "rewards/VisualizationJSONCombinedORM/std": 0.12383021414279938, "step": 1414, "train_speed(iter/s)": 0.184609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 281.9375, "completions/min_length": 219.0, "epoch": 1.1703887510339124, "grad_norm": 0.16848663985729218, "kl": 0.07489013671875, "learning_rate": 4.38794662400392e-06, "loss": 0.0007484108209609985, "memory(GiB)": 38.02, "reward": 0.4143363833427429, "reward_std": 0.05464569106698036, "rewards/VisualizationJSONCombinedORM/mean": 0.4143363833427429, "rewards/VisualizationJSONCombinedORM/std": 0.07947078347206116, "step": 1415, "train_speed(iter/s)": 0.184259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 261.5, "completions/min_length": 218.0, "epoch": 1.1712158808933002, "grad_norm": 0.17306087911128998, "kl": 0.06317138671875, "learning_rate": 4.380782819489295e-06, "loss": 0.000631675124168396, "memory(GiB)": 38.02, "reward": 0.36561453342437744, "reward_std": 0.05990662798285484, "rewards/VisualizationJSONCombinedORM/mean": 0.36561453342437744, "rewards/VisualizationJSONCombinedORM/std": 0.07741416990756989, "step": 1416, "train_speed(iter/s)": 0.183931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 235.3125, "completions/min_length": 198.0, "epoch": 1.1720430107526882, "grad_norm": 0.1798991709947586, "kl": 0.044647216796875, "learning_rate": 4.3736203056734075e-06, "loss": 0.00044561177492141724, "memory(GiB)": 38.02, "reward": 0.47098636627197266, "reward_std": 0.06700433045625687, "rewards/VisualizationJSONCombinedORM/mean": 0.47098636627197266, "rewards/VisualizationJSONCombinedORM/std": 0.29982003569602966, "step": 1417, "train_speed(iter/s)": 0.183518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 245.0, "completions/min_length": 204.0, "epoch": 1.172870140612076, "grad_norm": 0.20881661772727966, "kl": 0.0582275390625, "learning_rate": 4.366459097485832e-06, "loss": 0.0005816519260406494, "memory(GiB)": 38.02, "reward": 0.39768487215042114, "reward_std": 0.053762905299663544, "rewards/VisualizationJSONCombinedORM/mean": 0.39768487215042114, "rewards/VisualizationJSONCombinedORM/std": 0.054337210953235626, "step": 1418, "train_speed(iter/s)": 0.183071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/mean_length": 225.9375, "completions/min_length": 202.0, "epoch": 1.173697270471464, "grad_norm": 0.19885119795799255, "kl": 0.0655517578125, "learning_rate": 4.359299209853416e-06, "loss": 0.0006553903222084045, "memory(GiB)": 38.02, "reward": 0.5289932489395142, "reward_std": 0.0686548501253128, "rewards/VisualizationJSONCombinedORM/mean": 0.5289932489395142, "rewards/VisualizationJSONCombinedORM/std": 0.09167952835559845, "step": 1419, "train_speed(iter/s)": 0.182631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/mean_length": 235.0625, "completions/min_length": 197.0, "epoch": 1.174524400330852, "grad_norm": 0.16978375613689423, "kl": 0.06024169921875, "learning_rate": 4.352140657700259e-06, "loss": 0.0006024129688739777, "memory(GiB)": 38.02, "reward": 0.49893197417259216, "reward_std": 0.060893069952726364, "rewards/VisualizationJSONCombinedORM/mean": 0.49893197417259216, "rewards/VisualizationJSONCombinedORM/std": 0.10096663981676102, "step": 1420, "train_speed(iter/s)": 0.182188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 248.0625, "completions/min_length": 210.0, "epoch": 1.1753515301902397, "grad_norm": 0.18990905582904816, "kl": 0.0947265625, "learning_rate": 4.344983455947675e-06, "loss": 0.0009461361914873123, "memory(GiB)": 38.02, "reward": 0.7452165484428406, "reward_std": 0.12673649191856384, "rewards/VisualizationJSONCombinedORM/mean": 0.7452165484428406, "rewards/VisualizationJSONCombinedORM/std": 0.1273745596408844, "step": 1421, "train_speed(iter/s)": 0.181828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 234.8125, "completions/min_length": 192.0, "epoch": 1.1761786600496278, "grad_norm": 0.1567462980747223, "kl": 0.0513916015625, "learning_rate": 4.3378276195141665e-06, "loss": 0.0005134977400302887, "memory(GiB)": 38.02, "reward": 0.5336131453514099, "reward_std": 0.11678257584571838, "rewards/VisualizationJSONCombinedORM/mean": 0.5336131453514099, "rewards/VisualizationJSONCombinedORM/std": 0.12201666086912155, "step": 1422, "train_speed(iter/s)": 0.181303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 255.6875, "completions/min_length": 217.0, "epoch": 1.1770057899090158, "grad_norm": 0.17672620713710785, "kl": 0.0921630859375, "learning_rate": 4.3306731633153835e-06, "loss": 0.0009210482239723206, "memory(GiB)": 38.02, "reward": 0.6572955846786499, "reward_std": 0.0857069194316864, "rewards/VisualizationJSONCombinedORM/mean": 0.6572955846786499, "rewards/VisualizationJSONCombinedORM/std": 0.10288871824741364, "step": 1423, "train_speed(iter/s)": 0.180976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/mean_length": 247.375, "completions/min_length": 182.0, "epoch": 1.1778329197684037, "grad_norm": 0.21648281812667847, "kl": 0.06689453125, "learning_rate": 4.323520102264103e-06, "loss": 0.0006700009107589722, "memory(GiB)": 38.02, "reward": 0.35398080945014954, "reward_std": 0.09122302383184433, "rewards/VisualizationJSONCombinedORM/mean": 0.35398080945014954, "rewards/VisualizationJSONCombinedORM/std": 0.15993453562259674, "step": 1424, "train_speed(iter/s)": 0.180512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 243.8125, "completions/min_length": 203.0, "epoch": 1.1786600496277915, "grad_norm": 0.16835592687129974, "kl": 0.07293701171875, "learning_rate": 4.316368451270195e-06, "loss": 0.0007287841290235519, "memory(GiB)": 38.02, "reward": 0.4327526390552521, "reward_std": 0.08573393523693085, "rewards/VisualizationJSONCombinedORM/mean": 0.4327526390552521, "rewards/VisualizationJSONCombinedORM/std": 0.08556566387414932, "step": 1425, "train_speed(iter/s)": 0.179956 }, { "epoch": 1.1786600496277915, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 283.6666666666667, "eval_completions/mean_length": 239.19791666666666, "eval_completions/min_length": 210.33333333333334, "eval_kl": 0.06610107421875, "eval_loss": 0.0006606814567930996, "eval_reward": 0.4216247747341792, "eval_reward_std": 0.06178472703322768, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4216247747341792, "eval_rewards/VisualizationJSONCombinedORM/std": 0.061784727576499186, "eval_runtime": 261.6752, "eval_samples_per_second": 0.092, "eval_steps_per_second": 0.011, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 249.9375, "completions/min_length": 214.0, "epoch": 1.1794871794871795, "grad_norm": 0.14560610055923462, "kl": 0.03564453125, "learning_rate": 4.309218225240591e-06, "loss": 0.0003562894416972995, "memory(GiB)": 38.02, "reward": 0.5365725755691528, "reward_std": 0.0649440661072731, "rewards/VisualizationJSONCombinedORM/mean": 0.5365725755691528, "rewards/VisualizationJSONCombinedORM/std": 0.15400360524654388, "step": 1426, "train_speed(iter/s)": 0.173867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 211.6875, "completions/min_length": 180.0, "epoch": 1.1803143093465673, "grad_norm": 0.19782884418964386, "kl": 0.056640625, "learning_rate": 4.302069439079245e-06, "loss": 0.0005663558840751648, "memory(GiB)": 38.02, "reward": 0.5210742950439453, "reward_std": 0.07845503091812134, "rewards/VisualizationJSONCombinedORM/mean": 0.5210742950439453, "rewards/VisualizationJSONCombinedORM/std": 0.11299736052751541, "step": 1427, "train_speed(iter/s)": 0.173548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 239.0625, "completions/min_length": 199.0, "epoch": 1.1811414392059554, "grad_norm": 0.2146797627210617, "kl": 0.1064453125, "learning_rate": 4.29492210768712e-06, "loss": 0.001065727323293686, "memory(GiB)": 38.02, "reward": 0.5393332242965698, "reward_std": 0.0922049731016159, "rewards/VisualizationJSONCombinedORM/mean": 0.5393332242965698, "rewards/VisualizationJSONCombinedORM/std": 0.19816412031650543, "step": 1428, "train_speed(iter/s)": 0.173181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/mean_length": 241.8125, "completions/min_length": 200.0, "epoch": 1.1819685690653432, "grad_norm": 0.1916283518075943, "kl": 0.03594970703125, "learning_rate": 4.28777624596214e-06, "loss": 0.0003596991300582886, "memory(GiB)": 38.02, "reward": 0.6688560247421265, "reward_std": 0.0834278091788292, "rewards/VisualizationJSONCombinedORM/mean": 0.6688560247421265, "rewards/VisualizationJSONCombinedORM/std": 0.1046217679977417, "step": 1429, "train_speed(iter/s)": 0.172781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 231.25, "completions/min_length": 202.0, "epoch": 1.1827956989247312, "grad_norm": 0.17946787178516388, "kl": 0.05804443359375, "learning_rate": 4.280631868799169e-06, "loss": 0.0005803331732749939, "memory(GiB)": 38.02, "reward": 0.43280988931655884, "reward_std": 0.05575360357761383, "rewards/VisualizationJSONCombinedORM/mean": 0.43280988931655884, "rewards/VisualizationJSONCombinedORM/std": 0.06426814943552017, "step": 1430, "train_speed(iter/s)": 0.172308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 240.6875, "completions/min_length": 205.0, "epoch": 1.183622828784119, "grad_norm": 0.194146990776062, "kl": 0.05706787109375, "learning_rate": 4.27348899108997e-06, "loss": 0.0005721338093280792, "memory(GiB)": 38.02, "reward": 0.4984644651412964, "reward_std": 0.04997306689620018, "rewards/VisualizationJSONCombinedORM/mean": 0.4984644651412964, "rewards/VisualizationJSONCombinedORM/std": 0.140182226896286, "step": 1431, "train_speed(iter/s)": 0.171938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 243.75, "completions/min_length": 204.0, "epoch": 1.184449958643507, "grad_norm": 0.17986342310905457, "kl": 0.135498046875, "learning_rate": 4.266347627723192e-06, "loss": 0.0013553202152252197, "memory(GiB)": 38.02, "reward": 0.449134886264801, "reward_std": 0.05735741928219795, "rewards/VisualizationJSONCombinedORM/mean": 0.449134886264801, "rewards/VisualizationJSONCombinedORM/std": 0.28035110235214233, "step": 1432, "train_speed(iter/s)": 0.171565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 229.0, "completions/min_length": 181.0, "epoch": 1.185277088502895, "grad_norm": 0.2386823296546936, "kl": 0.04718017578125, "learning_rate": 4.259207793584317e-06, "loss": 0.00047200173139572144, "memory(GiB)": 38.02, "reward": 0.4070718586444855, "reward_std": 0.06302762031555176, "rewards/VisualizationJSONCombinedORM/mean": 0.4070718586444855, "rewards/VisualizationJSONCombinedORM/std": 0.11770046502351761, "step": 1433, "train_speed(iter/s)": 0.171179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/mean_length": 228.9375, "completions/min_length": 196.0, "epoch": 1.186104218362283, "grad_norm": 0.21379777789115906, "kl": 0.0433349609375, "learning_rate": 4.252069503555645e-06, "loss": 0.000433117151260376, "memory(GiB)": 38.02, "reward": 0.5994774103164673, "reward_std": 0.10239563882350922, "rewards/VisualizationJSONCombinedORM/mean": 0.5994774103164673, "rewards/VisualizationJSONCombinedORM/std": 0.19088660180568695, "step": 1434, "train_speed(iter/s)": 0.170915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/mean_length": 227.0625, "completions/min_length": 186.0, "epoch": 1.1869313482216708, "grad_norm": 0.18724890053272247, "kl": 0.04986572265625, "learning_rate": 4.244932772516256e-06, "loss": 0.0004984773695468903, "memory(GiB)": 38.02, "reward": 0.37774544954299927, "reward_std": 0.02337237447500229, "rewards/VisualizationJSONCombinedORM/mean": 0.37774544954299927, "rewards/VisualizationJSONCombinedORM/std": 0.04690534248948097, "step": 1435, "train_speed(iter/s)": 0.170524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/mean_length": 220.8125, "completions/min_length": 183.0, "epoch": 1.1877584780810588, "grad_norm": 0.20135734975337982, "kl": 0.0762939453125, "learning_rate": 4.23779761534198e-06, "loss": 0.0007637050002813339, "memory(GiB)": 38.02, "reward": 0.5316795110702515, "reward_std": 0.06469672918319702, "rewards/VisualizationJSONCombinedORM/mean": 0.5316795110702515, "rewards/VisualizationJSONCombinedORM/std": 0.2375129908323288, "step": 1436, "train_speed(iter/s)": 0.170168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 230.625, "completions/min_length": 196.0, "epoch": 1.1885856079404467, "grad_norm": 0.15302269160747528, "kl": 0.048095703125, "learning_rate": 4.230664046905365e-06, "loss": 0.0004813903942704201, "memory(GiB)": 38.02, "reward": 0.6544721126556396, "reward_std": 0.08597204834222794, "rewards/VisualizationJSONCombinedORM/mean": 0.6544721126556396, "rewards/VisualizationJSONCombinedORM/std": 0.10476087778806686, "step": 1437, "train_speed(iter/s)": 0.169743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/mean_length": 228.875, "completions/min_length": 197.0, "epoch": 1.1894127377998345, "grad_norm": 0.18388551473617554, "kl": 0.06011962890625, "learning_rate": 4.223532082075652e-06, "loss": 0.0006017237901687622, "memory(GiB)": 38.02, "reward": 0.6973788738250732, "reward_std": 0.07102090865373611, "rewards/VisualizationJSONCombinedORM/mean": 0.6973788738250732, "rewards/VisualizationJSONCombinedORM/std": 0.14263613522052765, "step": 1438, "train_speed(iter/s)": 0.16939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/mean_length": 225.5625, "completions/min_length": 189.0, "epoch": 1.1902398676592225, "grad_norm": 0.17603835463523865, "kl": 0.06585693359375, "learning_rate": 4.216401735718738e-06, "loss": 0.0006573647260665894, "memory(GiB)": 38.02, "reward": 0.3712257146835327, "reward_std": 0.040762320160865784, "rewards/VisualizationJSONCombinedORM/mean": 0.3712257146835327, "rewards/VisualizationJSONCombinedORM/std": 0.03939913958311081, "step": 1439, "train_speed(iter/s)": 0.169086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 234.4375, "completions/min_length": 198.0, "epoch": 1.1910669975186103, "grad_norm": 0.2066819965839386, "kl": 0.07012939453125, "learning_rate": 4.20927302269714e-06, "loss": 0.0007032612338662148, "memory(GiB)": 38.02, "reward": 0.5371432304382324, "reward_std": 0.08678598701953888, "rewards/VisualizationJSONCombinedORM/mean": 0.5371432304382324, "rewards/VisualizationJSONCombinedORM/std": 0.2232365757226944, "step": 1440, "train_speed(iter/s)": 0.168707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/mean_length": 245.5625, "completions/min_length": 209.0, "epoch": 1.1918941273779984, "grad_norm": 0.24989597499370575, "kl": 0.0703125, "learning_rate": 4.202145957869979e-06, "loss": 0.000702936202287674, "memory(GiB)": 38.05, "reward": 0.5064882040023804, "reward_std": 0.07209077477455139, "rewards/VisualizationJSONCombinedORM/mean": 0.5064882040023804, "rewards/VisualizationJSONCombinedORM/std": 0.07440124452114105, "step": 1441, "train_speed(iter/s)": 0.168233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/mean_length": 229.25, "completions/min_length": 195.0, "epoch": 1.1927212572373862, "grad_norm": 0.19579322636127472, "kl": 0.0726318359375, "learning_rate": 4.195020556092935e-06, "loss": 0.0007266923785209656, "memory(GiB)": 38.05, "reward": 0.5578966736793518, "reward_std": 0.07525783777236938, "rewards/VisualizationJSONCombinedORM/mean": 0.5578966736793518, "rewards/VisualizationJSONCombinedORM/std": 0.20887190103530884, "step": 1442, "train_speed(iter/s)": 0.167896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 221.125, "completions/min_length": 177.0, "epoch": 1.1935483870967742, "grad_norm": 0.1847337931394577, "kl": 0.0819091796875, "learning_rate": 4.187896832218229e-06, "loss": 0.0008202865719795227, "memory(GiB)": 38.05, "reward": 0.5607190132141113, "reward_std": 0.1005067229270935, "rewards/VisualizationJSONCombinedORM/mean": 0.5607190132141113, "rewards/VisualizationJSONCombinedORM/std": 0.14187932014465332, "step": 1443, "train_speed(iter/s)": 0.167535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/mean_length": 232.625, "completions/min_length": 191.0, "epoch": 1.194375516956162, "grad_norm": 0.22258619964122772, "kl": 0.06719970703125, "learning_rate": 4.180774801094572e-06, "loss": 0.0006705261766910553, "memory(GiB)": 38.05, "reward": 0.36319980025291443, "reward_std": 0.0497654564678669, "rewards/VisualizationJSONCombinedORM/mean": 0.36319980025291443, "rewards/VisualizationJSONCombinedORM/std": 0.057020388543605804, "step": 1444, "train_speed(iter/s)": 0.167292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 239.5, "completions/min_length": 194.0, "epoch": 1.19520264681555, "grad_norm": 0.17611442506313324, "kl": 0.0579833984375, "learning_rate": 4.173654477567158e-06, "loss": 0.0005798041820526123, "memory(GiB)": 38.05, "reward": 0.5355870127677917, "reward_std": 0.04839511215686798, "rewards/VisualizationJSONCombinedORM/mean": 0.5355870127677917, "rewards/VisualizationJSONCombinedORM/std": 0.26579663157463074, "step": 1445, "train_speed(iter/s)": 0.166936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/mean_length": 223.875, "completions/min_length": 189.0, "epoch": 1.196029776674938, "grad_norm": 0.2588716745376587, "kl": 0.1370849609375, "learning_rate": 4.166535876477616e-06, "loss": 0.0013724416494369507, "memory(GiB)": 38.05, "reward": 0.42058494687080383, "reward_std": 0.11245040595531464, "rewards/VisualizationJSONCombinedORM/mean": 0.42058494687080383, "rewards/VisualizationJSONCombinedORM/std": 0.14594069123268127, "step": 1446, "train_speed(iter/s)": 0.166607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 238.0, "completions/min_length": 198.0, "epoch": 1.196856906534326, "grad_norm": 0.24374796450138092, "kl": 0.1209716796875, "learning_rate": 4.1594190126639886e-06, "loss": 0.001210734248161316, "memory(GiB)": 38.05, "reward": 0.628265917301178, "reward_std": 0.11990253627300262, "rewards/VisualizationJSONCombinedORM/mean": 0.628265917301178, "rewards/VisualizationJSONCombinedORM/std": 0.12447185069322586, "step": 1447, "train_speed(iter/s)": 0.166295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/mean_length": 229.4375, "completions/min_length": 193.0, "epoch": 1.1976840363937138, "grad_norm": 0.18613697588443756, "kl": 0.06488037109375, "learning_rate": 4.152303900960692e-06, "loss": 0.000649530440568924, "memory(GiB)": 38.05, "reward": 0.49249181151390076, "reward_std": 0.08354051411151886, "rewards/VisualizationJSONCombinedORM/mean": 0.49249181151390076, "rewards/VisualizationJSONCombinedORM/std": 0.08165857195854187, "step": 1448, "train_speed(iter/s)": 0.165943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 261.8125, "completions/min_length": 220.0, "epoch": 1.1985111662531018, "grad_norm": 0.19507858157157898, "kl": 0.1070556640625, "learning_rate": 4.145190556198494e-06, "loss": 0.0010725334286689758, "memory(GiB)": 38.05, "reward": 0.5741149187088013, "reward_std": 0.1180696189403534, "rewards/VisualizationJSONCombinedORM/mean": 0.5741149187088013, "rewards/VisualizationJSONCombinedORM/std": 0.19114020466804504, "step": 1449, "train_speed(iter/s)": 0.16561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 240.125, "completions/min_length": 187.0, "epoch": 1.1993382961124897, "grad_norm": 0.15805713832378387, "kl": 0.05938720703125, "learning_rate": 4.1380789932044794e-06, "loss": 0.0005930885672569275, "memory(GiB)": 38.05, "reward": 0.45505377650260925, "reward_std": 0.08716581761837006, "rewards/VisualizationJSONCombinedORM/mean": 0.45505377650260925, "rewards/VisualizationJSONCombinedORM/std": 0.24230331182479858, "step": 1450, "train_speed(iter/s)": 0.165266 }, { "epoch": 1.1993382961124897, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 276.3333333333333, "eval_completions/mean_length": 236.11458333333334, "eval_completions/min_length": 207.54166666666666, "eval_kl": 0.08848063151041667, "eval_loss": 0.0008876065840013325, "eval_reward": 0.46109408140182495, "eval_reward_std": 0.07158139107438426, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46109408140182495, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07158139351910602, "eval_runtime": 256.7195, "eval_samples_per_second": 0.093, "eval_steps_per_second": 0.012, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/mean_length": 222.6875, "completions/min_length": 208.0, "epoch": 1.2001654259718775, "grad_norm": 0.23631691932678223, "kl": 0.06561279296875, "learning_rate": 4.130969226802016e-06, "loss": 0.0006544962525367737, "memory(GiB)": 38.05, "reward": 0.6183534860610962, "reward_std": 0.09855446219444275, "rewards/VisualizationJSONCombinedORM/mean": 0.6183534860610962, "rewards/VisualizationJSONCombinedORM/std": 0.12979590892791748, "step": 1451, "train_speed(iter/s)": 0.16035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/mean_length": 234.6875, "completions/min_length": 188.0, "epoch": 1.2009925558312655, "grad_norm": 0.17854991555213928, "kl": 0.0579833984375, "learning_rate": 4.123861271810735e-06, "loss": 0.0005795955657958984, "memory(GiB)": 38.05, "reward": 0.7385267019271851, "reward_std": 0.12833748757839203, "rewards/VisualizationJSONCombinedORM/mean": 0.7385267019271851, "rewards/VisualizationJSONCombinedORM/std": 0.146851047873497, "step": 1452, "train_speed(iter/s)": 0.160026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 237.9375, "completions/min_length": 198.0, "epoch": 1.2018196856906533, "grad_norm": 0.17106500267982483, "kl": 0.0950927734375, "learning_rate": 4.116755143046477e-06, "loss": 0.000953279435634613, "memory(GiB)": 38.05, "reward": 0.44210702180862427, "reward_std": 0.06656581908464432, "rewards/VisualizationJSONCombinedORM/mean": 0.44210702180862427, "rewards/VisualizationJSONCombinedORM/std": 0.26399967074394226, "step": 1453, "train_speed(iter/s)": 0.159696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 243.75, "completions/min_length": 191.0, "epoch": 1.2026468155500414, "grad_norm": 0.1689673811197281, "kl": 0.06231689453125, "learning_rate": 4.109650855321291e-06, "loss": 0.0006235688924789429, "memory(GiB)": 38.05, "reward": 0.6859372854232788, "reward_std": 0.07560276985168457, "rewards/VisualizationJSONCombinedORM/mean": 0.6859372854232788, "rewards/VisualizationJSONCombinedORM/std": 0.11309721320867538, "step": 1454, "train_speed(iter/s)": 0.159286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/mean_length": 211.9375, "completions/min_length": 182.0, "epoch": 1.2034739454094292, "grad_norm": 0.15855734050273895, "kl": 0.0457763671875, "learning_rate": 4.10254842344338e-06, "loss": 0.0004581138491630554, "memory(GiB)": 38.05, "reward": 0.5767593383789062, "reward_std": 0.06650029867887497, "rewards/VisualizationJSONCombinedORM/mean": 0.5767593383789062, "rewards/VisualizationJSONCombinedORM/std": 0.09313514083623886, "step": 1455, "train_speed(iter/s)": 0.159074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 236.5625, "completions/min_length": 196.0, "epoch": 1.2043010752688172, "grad_norm": 0.21666984260082245, "kl": 0.109130859375, "learning_rate": 4.095447862217084e-06, "loss": 0.0010929293930530548, "memory(GiB)": 38.05, "reward": 0.4409661591053009, "reward_std": 0.09470333158969879, "rewards/VisualizationJSONCombinedORM/mean": 0.4409661591053009, "rewards/VisualizationJSONCombinedORM/std": 0.17876005172729492, "step": 1456, "train_speed(iter/s)": 0.158797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 238.25, "completions/min_length": 214.0, "epoch": 1.205128205128205, "grad_norm": 0.20176255702972412, "kl": 0.0931396484375, "learning_rate": 4.088349186442838e-06, "loss": 0.0009328983724117279, "memory(GiB)": 38.05, "reward": 0.5725448727607727, "reward_std": 0.12010863423347473, "rewards/VisualizationJSONCombinedORM/mean": 0.5725448727607727, "rewards/VisualizationJSONCombinedORM/std": 0.12480315566062927, "step": 1457, "train_speed(iter/s)": 0.158566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 243.9375, "completions/min_length": 214.0, "epoch": 1.205955334987593, "grad_norm": 0.20350104570388794, "kl": 0.1033935546875, "learning_rate": 4.081252410917148e-06, "loss": 0.0010342225432395935, "memory(GiB)": 38.05, "reward": 0.6216487884521484, "reward_std": 0.07993534207344055, "rewards/VisualizationJSONCombinedORM/mean": 0.6216487884521484, "rewards/VisualizationJSONCombinedORM/std": 0.17987224459648132, "step": 1458, "train_speed(iter/s)": 0.158336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/mean_length": 224.8125, "completions/min_length": 205.0, "epoch": 1.206782464846981, "grad_norm": 0.21295149624347687, "kl": 0.05804443359375, "learning_rate": 4.074157550432566e-06, "loss": 0.0005805175751447678, "memory(GiB)": 38.05, "reward": 0.6085090041160583, "reward_std": 0.07716235518455505, "rewards/VisualizationJSONCombinedORM/mean": 0.6085090041160583, "rewards/VisualizationJSONCombinedORM/std": 0.090894415974617, "step": 1459, "train_speed(iter/s)": 0.158082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 244.0625, "completions/min_length": 200.0, "epoch": 1.207609594706369, "grad_norm": 0.14252497255802155, "kl": 0.088623046875, "learning_rate": 4.067064619777645e-06, "loss": 0.0008873827755451202, "memory(GiB)": 38.05, "reward": 0.6510655879974365, "reward_std": 0.06869448721408844, "rewards/VisualizationJSONCombinedORM/mean": 0.6510655879974365, "rewards/VisualizationJSONCombinedORM/std": 0.07405266165733337, "step": 1460, "train_speed(iter/s)": 0.157734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/mean_length": 224.0625, "completions/min_length": 186.0, "epoch": 1.2084367245657568, "grad_norm": 0.21952582895755768, "kl": 0.060791015625, "learning_rate": 4.059973633736917e-06, "loss": 0.000607617199420929, "memory(GiB)": 38.05, "reward": 0.33878207206726074, "reward_std": 0.03119828552007675, "rewards/VisualizationJSONCombinedORM/mean": 0.33878207206726074, "rewards/VisualizationJSONCombinedORM/std": 0.03402355685830116, "step": 1461, "train_speed(iter/s)": 0.157358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 234.0, "completions/min_length": 196.0, "epoch": 1.2092638544251448, "grad_norm": 0.20456992089748383, "kl": 0.0677490234375, "learning_rate": 4.05288460709086e-06, "loss": 0.0006765648722648621, "memory(GiB)": 38.05, "reward": 0.594763457775116, "reward_std": 0.08256061375141144, "rewards/VisualizationJSONCombinedORM/mean": 0.594763457775116, "rewards/VisualizationJSONCombinedORM/std": 0.14493927359580994, "step": 1462, "train_speed(iter/s)": 0.157064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 243.4375, "completions/min_length": 207.0, "epoch": 1.2100909842845327, "grad_norm": 0.17076316475868225, "kl": 0.05987548828125, "learning_rate": 4.045797554615872e-06, "loss": 0.0005989596247673035, "memory(GiB)": 38.05, "reward": 0.47807520627975464, "reward_std": 0.07505636662244797, "rewards/VisualizationJSONCombinedORM/mean": 0.47807520627975464, "rewards/VisualizationJSONCombinedORM/std": 0.24032598733901978, "step": 1463, "train_speed(iter/s)": 0.156752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 242.125, "completions/min_length": 206.0, "epoch": 1.2109181141439205, "grad_norm": 0.20941397547721863, "kl": 0.04339599609375, "learning_rate": 4.038712491084234e-06, "loss": 0.0004341527819633484, "memory(GiB)": 38.05, "reward": 0.45206642150878906, "reward_std": 0.057211242616176605, "rewards/VisualizationJSONCombinedORM/mean": 0.45206642150878906, "rewards/VisualizationJSONCombinedORM/std": 0.12949275970458984, "step": 1464, "train_speed(iter/s)": 0.156432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 247.25, "completions/min_length": 219.0, "epoch": 1.2117452440033085, "grad_norm": 0.185725137591362, "kl": 0.109375, "learning_rate": 4.0316294312640754e-06, "loss": 0.0010926220566034317, "memory(GiB)": 38.05, "reward": 0.47056227922439575, "reward_std": 0.05950641632080078, "rewards/VisualizationJSONCombinedORM/mean": 0.47056227922439575, "rewards/VisualizationJSONCombinedORM/std": 0.11532986164093018, "step": 1465, "train_speed(iter/s)": 0.156056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 236.5625, "completions/min_length": 194.0, "epoch": 1.2125723738626966, "grad_norm": 0.18966957926750183, "kl": 0.0904541015625, "learning_rate": 4.02454838991936e-06, "loss": 0.0009045724291354418, "memory(GiB)": 38.05, "reward": 0.5984811782836914, "reward_std": 0.07522246241569519, "rewards/VisualizationJSONCombinedORM/mean": 0.5984811782836914, "rewards/VisualizationJSONCombinedORM/std": 0.10320962965488434, "step": 1466, "train_speed(iter/s)": 0.155812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 239.3125, "completions/min_length": 199.0, "epoch": 1.2133995037220844, "grad_norm": 0.1902977079153061, "kl": 0.088623046875, "learning_rate": 4.017469381809834e-06, "loss": 0.0008875355124473572, "memory(GiB)": 38.05, "reward": 0.7317471504211426, "reward_std": 0.08587075769901276, "rewards/VisualizationJSONCombinedORM/mean": 0.7317471504211426, "rewards/VisualizationJSONCombinedORM/std": 0.08743046224117279, "step": 1467, "train_speed(iter/s)": 0.155495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 233.5625, "completions/min_length": 201.0, "epoch": 1.2142266335814722, "grad_norm": 0.22604703903198242, "kl": 0.060302734375, "learning_rate": 4.0103924216910104e-06, "loss": 0.0006026662886142731, "memory(GiB)": 38.05, "reward": 0.6076282262802124, "reward_std": 0.06723535805940628, "rewards/VisualizationJSONCombinedORM/mean": 0.6076282262802124, "rewards/VisualizationJSONCombinedORM/std": 0.11419175565242767, "step": 1468, "train_speed(iter/s)": 0.15522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/mean_length": 218.1875, "completions/min_length": 177.0, "epoch": 1.2150537634408602, "grad_norm": 0.2181006669998169, "kl": 0.05572509765625, "learning_rate": 4.0033175243141365e-06, "loss": 0.0005575492978096008, "memory(GiB)": 38.05, "reward": 0.5101737976074219, "reward_std": 0.0654263123869896, "rewards/VisualizationJSONCombinedORM/mean": 0.5101737976074219, "rewards/VisualizationJSONCombinedORM/std": 0.27423095703125, "step": 1469, "train_speed(iter/s)": 0.154934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/mean_length": 218.3125, "completions/min_length": 188.0, "epoch": 1.215880893300248, "grad_norm": 0.19247044622898102, "kl": 0.08740234375, "learning_rate": 3.996244704426153e-06, "loss": 0.0008726343512535095, "memory(GiB)": 38.05, "reward": 0.35691601037979126, "reward_std": 0.06894867867231369, "rewards/VisualizationJSONCombinedORM/mean": 0.35691601037979126, "rewards/VisualizationJSONCombinedORM/std": 0.13213029503822327, "step": 1470, "train_speed(iter/s)": 0.154638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/mean_length": 235.0625, "completions/min_length": 185.0, "epoch": 1.216708023159636, "grad_norm": 0.16410039365291595, "kl": 0.084716796875, "learning_rate": 3.989173976769672e-06, "loss": 0.000846564769744873, "memory(GiB)": 38.05, "reward": 0.3553447425365448, "reward_std": 0.041688039898872375, "rewards/VisualizationJSONCombinedORM/mean": 0.3553447425365448, "rewards/VisualizationJSONCombinedORM/std": 0.15282988548278809, "step": 1471, "train_speed(iter/s)": 0.154344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 229.375, "completions/min_length": 206.0, "epoch": 1.217535153019024, "grad_norm": 0.13807179033756256, "kl": 0.05926513671875, "learning_rate": 3.982105356082951e-06, "loss": 0.0005913600325584412, "memory(GiB)": 38.05, "reward": 0.4747282564640045, "reward_std": 0.05908409506082535, "rewards/VisualizationJSONCombinedORM/mean": 0.4747282564640045, "rewards/VisualizationJSONCombinedORM/std": 0.05852256342768669, "step": 1472, "train_speed(iter/s)": 0.154052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 231.0625, "completions/min_length": 206.0, "epoch": 1.218362282878412, "grad_norm": 0.180860698223114, "kl": 0.0546875, "learning_rate": 3.975038857099849e-06, "loss": 0.000546582043170929, "memory(GiB)": 38.05, "reward": 0.6385626196861267, "reward_std": 0.0409461185336113, "rewards/VisualizationJSONCombinedORM/mean": 0.6385626196861267, "rewards/VisualizationJSONCombinedORM/std": 0.08734478056430817, "step": 1473, "train_speed(iter/s)": 0.15374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 245.875, "completions/min_length": 205.0, "epoch": 1.2191894127377998, "grad_norm": 0.20818699896335602, "kl": 0.1009521484375, "learning_rate": 3.967974494549803e-06, "loss": 0.0010087676346302032, "memory(GiB)": 38.05, "reward": 0.5248838663101196, "reward_std": 0.07477031648159027, "rewards/VisualizationJSONCombinedORM/mean": 0.5248838663101196, "rewards/VisualizationJSONCombinedORM/std": 0.12189238518476486, "step": 1474, "train_speed(iter/s)": 0.153367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/mean_length": 236.625, "completions/min_length": 200.0, "epoch": 1.2200165425971878, "grad_norm": 0.16975386440753937, "kl": 0.1375732421875, "learning_rate": 3.960912283157797e-06, "loss": 0.001374424435198307, "memory(GiB)": 38.05, "reward": 0.715583086013794, "reward_std": 0.07980190217494965, "rewards/VisualizationJSONCombinedORM/mean": 0.715583086013794, "rewards/VisualizationJSONCombinedORM/std": 0.09412398934364319, "step": 1475, "train_speed(iter/s)": 0.15309 }, { "epoch": 1.2200165425971878, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 276.0833333333333, "eval_completions/mean_length": 235.22916666666666, "eval_completions/min_length": 203.375, "eval_kl": 0.09340413411458333, "eval_loss": 0.0009385297889821231, "eval_reward": 0.47195088490843773, "eval_reward_std": 0.07784408253307144, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47195088490843773, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07784408369722466, "eval_runtime": 256.7623, "eval_samples_per_second": 0.093, "eval_steps_per_second": 0.012, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 239.25, "completions/min_length": 211.0, "epoch": 1.2208436724565757, "grad_norm": 0.1936580091714859, "kl": 0.097900390625, "learning_rate": 3.953852237644337e-06, "loss": 0.000979635864496231, "memory(GiB)": 38.05, "reward": 0.246337890625, "reward_std": 0.04218892753124237, "rewards/VisualizationJSONCombinedORM/mean": 0.246337890625, "rewards/VisualizationJSONCombinedORM/std": 0.0637759193778038, "step": 1476, "train_speed(iter/s)": 0.148861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/mean_length": 221.5, "completions/min_length": 190.0, "epoch": 1.2216708023159637, "grad_norm": 0.1451890915632248, "kl": 0.046630859375, "learning_rate": 3.946794372725407e-06, "loss": 0.000465981662273407, "memory(GiB)": 38.05, "reward": 0.7541232109069824, "reward_std": 0.0858570784330368, "rewards/VisualizationJSONCombinedORM/mean": 0.7541232109069824, "rewards/VisualizationJSONCombinedORM/std": 0.08300001174211502, "step": 1477, "train_speed(iter/s)": 0.148656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/mean_length": 214.9375, "completions/min_length": 186.0, "epoch": 1.2224979321753515, "grad_norm": 0.17141509056091309, "kl": 0.078369140625, "learning_rate": 3.939738703112447e-06, "loss": 0.0007836967706680298, "memory(GiB)": 38.05, "reward": 0.4020969867706299, "reward_std": 0.05039478838443756, "rewards/VisualizationJSONCombinedORM/mean": 0.4020969867706299, "rewards/VisualizationJSONCombinedORM/std": 0.07057519257068634, "step": 1478, "train_speed(iter/s)": 0.148435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 238.375, "completions/min_length": 216.0, "epoch": 1.2233250620347396, "grad_norm": 0.18144568800926208, "kl": 0.1060791015625, "learning_rate": 3.932685243512326e-06, "loss": 0.0010599717497825623, "memory(GiB)": 38.05, "reward": 0.49827927350997925, "reward_std": 0.06297747045755386, "rewards/VisualizationJSONCombinedORM/mean": 0.49827927350997925, "rewards/VisualizationJSONCombinedORM/std": 0.12461795657873154, "step": 1479, "train_speed(iter/s)": 0.148212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 236.4375, "completions/min_length": 206.0, "epoch": 1.2241521918941274, "grad_norm": 0.17889520525932312, "kl": 0.10986328125, "learning_rate": 3.925634008627299e-06, "loss": 0.0010988786816596985, "memory(GiB)": 38.05, "reward": 0.5287104845046997, "reward_std": 0.0799657553434372, "rewards/VisualizationJSONCombinedORM/mean": 0.5287104845046997, "rewards/VisualizationJSONCombinedORM/std": 0.1207556277513504, "step": 1480, "train_speed(iter/s)": 0.147986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/mean_length": 226.25, "completions/min_length": 201.0, "epoch": 1.2249793217535152, "grad_norm": 0.23540203273296356, "kl": 0.1114501953125, "learning_rate": 3.918585013154995e-06, "loss": 0.0011146292090415955, "memory(GiB)": 38.05, "reward": 0.45824331045150757, "reward_std": 0.09662685543298721, "rewards/VisualizationJSONCombinedORM/mean": 0.45824331045150757, "rewards/VisualizationJSONCombinedORM/std": 0.23165497183799744, "step": 1481, "train_speed(iter/s)": 0.14778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 229.625, "completions/min_length": 163.0, "epoch": 1.2258064516129032, "grad_norm": 0.17161008715629578, "kl": 0.091796875, "learning_rate": 3.911538271788359e-06, "loss": 0.0009181611239910126, "memory(GiB)": 38.05, "reward": 0.6659939289093018, "reward_std": 0.08188705146312714, "rewards/VisualizationJSONCombinedORM/mean": 0.6659939289093018, "rewards/VisualizationJSONCombinedORM/std": 0.09253255277872086, "step": 1482, "train_speed(iter/s)": 0.147548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 245.1875, "completions/min_length": 203.0, "epoch": 1.226633581472291, "grad_norm": 0.1941443234682083, "kl": 0.094970703125, "learning_rate": 3.904493799215652e-06, "loss": 0.0009470544755458832, "memory(GiB)": 38.05, "reward": 0.6481664776802063, "reward_std": 0.11166489869356155, "rewards/VisualizationJSONCombinedORM/mean": 0.6481664776802063, "rewards/VisualizationJSONCombinedORM/std": 0.11726035177707672, "step": 1483, "train_speed(iter/s)": 0.147299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 238.5625, "completions/min_length": 192.0, "epoch": 1.227460711331679, "grad_norm": 0.1897123008966446, "kl": 0.0972900390625, "learning_rate": 3.897451610120399e-06, "loss": 0.0009716935455799103, "memory(GiB)": 38.05, "reward": 0.5592600107192993, "reward_std": 0.07439710199832916, "rewards/VisualizationJSONCombinedORM/mean": 0.5592600107192993, "rewards/VisualizationJSONCombinedORM/std": 0.16930150985717773, "step": 1484, "train_speed(iter/s)": 0.147001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/mean_length": 233.375, "completions/min_length": 192.0, "epoch": 1.228287841191067, "grad_norm": 0.20380911231040955, "kl": 0.1092529296875, "learning_rate": 3.890411719181367e-06, "loss": 0.001092277467250824, "memory(GiB)": 38.05, "reward": 0.4900546073913574, "reward_std": 0.07764360308647156, "rewards/VisualizationJSONCombinedORM/mean": 0.4900546073913574, "rewards/VisualizationJSONCombinedORM/std": 0.22539323568344116, "step": 1485, "train_speed(iter/s)": 0.146838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 234.0625, "completions/min_length": 187.0, "epoch": 1.229114971050455, "grad_norm": 0.2205846905708313, "kl": 0.068603515625, "learning_rate": 3.883374141072534e-06, "loss": 0.0006867628544569016, "memory(GiB)": 38.05, "reward": 0.4680154323577881, "reward_std": 0.05964882671833038, "rewards/VisualizationJSONCombinedORM/mean": 0.4680154323577881, "rewards/VisualizationJSONCombinedORM/std": 0.21675896644592285, "step": 1486, "train_speed(iter/s)": 0.146611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/mean_length": 217.125, "completions/min_length": 187.0, "epoch": 1.2299421009098428, "grad_norm": 0.28537771105766296, "kl": 0.129150390625, "learning_rate": 3.8763388904630525e-06, "loss": 0.0012911856174468994, "memory(GiB)": 38.05, "reward": 0.6093443036079407, "reward_std": 0.17156526446342468, "rewards/VisualizationJSONCombinedORM/mean": 0.6093443036079407, "rewards/VisualizationJSONCombinedORM/std": 0.16681841015815735, "step": 1487, "train_speed(iter/s)": 0.146307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/mean_length": 235.875, "completions/min_length": 191.0, "epoch": 1.2307692307692308, "grad_norm": 0.1658446490764618, "kl": 0.10205078125, "learning_rate": 3.869305982017229e-06, "loss": 0.0010202601552009583, "memory(GiB)": 38.05, "reward": 0.543757438659668, "reward_std": 0.06722109019756317, "rewards/VisualizationJSONCombinedORM/mean": 0.543757438659668, "rewards/VisualizationJSONCombinedORM/std": 0.07218381017446518, "step": 1488, "train_speed(iter/s)": 0.146059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 248.5, "completions/min_length": 194.0, "epoch": 1.2315963606286187, "grad_norm": 0.21001099050045013, "kl": 0.0782470703125, "learning_rate": 3.862275430394484e-06, "loss": 0.000782303512096405, "memory(GiB)": 38.05, "reward": 0.5317912101745605, "reward_std": 0.07599713653326035, "rewards/VisualizationJSONCombinedORM/mean": 0.5317912101745605, "rewards/VisualizationJSONCombinedORM/std": 0.11350049823522568, "step": 1489, "train_speed(iter/s)": 0.145806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 228.75, "completions/min_length": 176.0, "epoch": 1.2324234904880067, "grad_norm": 0.2211875021457672, "kl": 0.0950927734375, "learning_rate": 3.855247250249331e-06, "loss": 0.0009484067559242249, "memory(GiB)": 38.05, "reward": 0.44082319736480713, "reward_std": 0.0997694581747055, "rewards/VisualizationJSONCombinedORM/mean": 0.44082319736480713, "rewards/VisualizationJSONCombinedORM/std": 0.17765121161937714, "step": 1490, "train_speed(iter/s)": 0.145537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/mean_length": 251.4375, "completions/min_length": 208.0, "epoch": 1.2332506203473945, "grad_norm": 0.20088213682174683, "kl": 0.142578125, "learning_rate": 3.848221456231331e-06, "loss": 0.0014229491353034973, "memory(GiB)": 38.05, "reward": 0.45482930541038513, "reward_std": 0.06462115049362183, "rewards/VisualizationJSONCombinedORM/mean": 0.45482930541038513, "rewards/VisualizationJSONCombinedORM/std": 0.27300822734832764, "step": 1491, "train_speed(iter/s)": 0.145331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 237.25, "completions/min_length": 195.0, "epoch": 1.2340777502067826, "grad_norm": 0.21543455123901367, "kl": 0.05828857421875, "learning_rate": 3.84119806298508e-06, "loss": 0.0005834735929965973, "memory(GiB)": 38.05, "reward": 0.5629885196685791, "reward_std": 0.04181261360645294, "rewards/VisualizationJSONCombinedORM/mean": 0.5629885196685791, "rewards/VisualizationJSONCombinedORM/std": 0.2890433371067047, "step": 1492, "train_speed(iter/s)": 0.145077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/mean_length": 235.1875, "completions/min_length": 197.0, "epoch": 1.2349048800661704, "grad_norm": 0.1789354532957077, "kl": 0.0849609375, "learning_rate": 3.834177085150166e-06, "loss": 0.0008486583828926086, "memory(GiB)": 38.05, "reward": 0.32817018032073975, "reward_std": 0.05555626377463341, "rewards/VisualizationJSONCombinedORM/mean": 0.32817018032073975, "rewards/VisualizationJSONCombinedORM/std": 0.05740649625658989, "step": 1493, "train_speed(iter/s)": 0.144914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 257.5, "completions/min_length": 228.0, "epoch": 1.2357320099255582, "grad_norm": 0.11986903101205826, "kl": 0.1007080078125, "learning_rate": 3.827158537361144e-06, "loss": 0.0010057613253593445, "memory(GiB)": 38.05, "reward": 0.5639333724975586, "reward_std": 0.06167598441243172, "rewards/VisualizationJSONCombinedORM/mean": 0.5639333724975586, "rewards/VisualizationJSONCombinedORM/std": 0.278977632522583, "step": 1494, "train_speed(iter/s)": 0.144684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 230.625, "completions/min_length": 194.0, "epoch": 1.2365591397849462, "grad_norm": 0.19968263804912567, "kl": 0.069580078125, "learning_rate": 3.8201424342475e-06, "loss": 0.0006954632699489594, "memory(GiB)": 38.05, "reward": 0.503677487373352, "reward_std": 0.06649050116539001, "rewards/VisualizationJSONCombinedORM/mean": 0.503677487373352, "rewards/VisualizationJSONCombinedORM/std": 0.10782203078269958, "step": 1495, "train_speed(iter/s)": 0.144465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 225.75, "completions/min_length": 187.0, "epoch": 1.237386269644334, "grad_norm": 0.19321702420711517, "kl": 0.09521484375, "learning_rate": 3.8131287904336288e-06, "loss": 0.0009498968720436096, "memory(GiB)": 38.05, "reward": 0.5252761840820312, "reward_std": 0.057333171367645264, "rewards/VisualizationJSONCombinedORM/mean": 0.5252761840820312, "rewards/VisualizationJSONCombinedORM/std": 0.12168631702661514, "step": 1496, "train_speed(iter/s)": 0.144216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 226.0, "completions/min_length": 183.0, "epoch": 1.2382133995037221, "grad_norm": 0.19406181573867798, "kl": 0.062255859375, "learning_rate": 3.8061176205387983e-06, "loss": 0.0006226412951946259, "memory(GiB)": 38.05, "reward": 0.6434000730514526, "reward_std": 0.09103932976722717, "rewards/VisualizationJSONCombinedORM/mean": 0.6434000730514526, "rewards/VisualizationJSONCombinedORM/std": 0.15364305675029755, "step": 1497, "train_speed(iter/s)": 0.144035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/mean_length": 236.6875, "completions/min_length": 187.0, "epoch": 1.23904052936311, "grad_norm": 0.17974677681922913, "kl": 0.0899658203125, "learning_rate": 3.7991089391771185e-06, "loss": 0.0008999332785606384, "memory(GiB)": 38.05, "reward": 0.5022174715995789, "reward_std": 0.07613567262887955, "rewards/VisualizationJSONCombinedORM/mean": 0.5022174715995789, "rewards/VisualizationJSONCombinedORM/std": 0.2297707498073578, "step": 1498, "train_speed(iter/s)": 0.143802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 244.9375, "completions/min_length": 205.0, "epoch": 1.239867659222498, "grad_norm": 0.172148659825325, "kl": 0.0819091796875, "learning_rate": 3.7921027609575114e-06, "loss": 0.000819031149148941, "memory(GiB)": 38.05, "reward": 0.6764435768127441, "reward_std": 0.10788485407829285, "rewards/VisualizationJSONCombinedORM/mean": 0.6764435768127441, "rewards/VisualizationJSONCombinedORM/std": 0.1593678742647171, "step": 1499, "train_speed(iter/s)": 0.143483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 246.125, "completions/min_length": 205.0, "epoch": 1.2406947890818858, "grad_norm": 0.19131921231746674, "kl": 0.1070556640625, "learning_rate": 3.7850991004836813e-06, "loss": 0.0010728128254413605, "memory(GiB)": 38.05, "reward": 0.32447507977485657, "reward_std": 0.03536493703722954, "rewards/VisualizationJSONCombinedORM/mean": 0.32447507977485657, "rewards/VisualizationJSONCombinedORM/std": 0.10785625129938126, "step": 1500, "train_speed(iter/s)": 0.143287 }, { "epoch": 1.2406947890818858, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 272.4583333333333, "eval_completions/mean_length": 233.02083333333334, "eval_completions/min_length": 203.54166666666666, "eval_kl": 0.08221435546875, "eval_loss": 0.0008299567853100598, "eval_reward": 0.4690913117180268, "eval_reward_std": 0.07198860408971086, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4690913117180268, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07198860575833048, "eval_runtime": 254.5159, "eval_samples_per_second": 0.094, "eval_steps_per_second": 0.012, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/mean_length": 235.9375, "completions/min_length": 202.0, "epoch": 1.2415219189412738, "grad_norm": 0.1890418529510498, "kl": 0.072998046875, "learning_rate": 3.778097972354088e-06, "loss": 0.0007297247648239136, "memory(GiB)": 38.05, "reward": 0.39172592759132385, "reward_std": 0.05283260717988014, "rewards/VisualizationJSONCombinedORM/mean": 0.39172592759132385, "rewards/VisualizationJSONCombinedORM/std": 0.24327318370342255, "step": 1501, "train_speed(iter/s)": 0.139706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/mean_length": 225.9375, "completions/min_length": 193.0, "epoch": 1.2423490488006617, "grad_norm": 0.17998576164245605, "kl": 0.07208251953125, "learning_rate": 3.7710993911619093e-06, "loss": 0.0007190890610218048, "memory(GiB)": 38.05, "reward": 0.6256597638130188, "reward_std": 0.07190732657909393, "rewards/VisualizationJSONCombinedORM/mean": 0.6256597638130188, "rewards/VisualizationJSONCombinedORM/std": 0.1850803792476654, "step": 1502, "train_speed(iter/s)": 0.139521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 226.6875, "completions/min_length": 191.0, "epoch": 1.2431761786600497, "grad_norm": 0.19388669729232788, "kl": 0.0545654296875, "learning_rate": 3.764103371495018e-06, "loss": 0.0005453824996948242, "memory(GiB)": 38.05, "reward": 0.5720444321632385, "reward_std": 0.08841346204280853, "rewards/VisualizationJSONCombinedORM/mean": 0.5720444321632385, "rewards/VisualizationJSONCombinedORM/std": 0.105216383934021, "step": 1503, "train_speed(iter/s)": 0.139341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 250.6875, "completions/min_length": 218.0, "epoch": 1.2440033085194375, "grad_norm": 0.19023717939853668, "kl": 0.12451171875, "learning_rate": 3.757109927935943e-06, "loss": 0.00124395452439785, "memory(GiB)": 38.05, "reward": 0.6037633419036865, "reward_std": 0.09657241404056549, "rewards/VisualizationJSONCombinedORM/mean": 0.6037633419036865, "rewards/VisualizationJSONCombinedORM/std": 0.1331566423177719, "step": 1504, "train_speed(iter/s)": 0.139115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/mean_length": 226.0625, "completions/min_length": 185.0, "epoch": 1.2448304383788256, "grad_norm": 0.18927821516990662, "kl": 0.08251953125, "learning_rate": 3.7501190750618454e-06, "loss": 0.0008246749639511108, "memory(GiB)": 38.05, "reward": 0.6014382243156433, "reward_std": 0.09639449417591095, "rewards/VisualizationJSONCombinedORM/mean": 0.6014382243156433, "rewards/VisualizationJSONCombinedORM/std": 0.2230077087879181, "step": 1505, "train_speed(iter/s)": 0.138919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 242.3125, "completions/min_length": 205.0, "epoch": 1.2456575682382134, "grad_norm": 0.15448927879333496, "kl": 0.08935546875, "learning_rate": 3.743130827444487e-06, "loss": 0.0008923392742872238, "memory(GiB)": 38.05, "reward": 0.5610204339027405, "reward_std": 0.06055477634072304, "rewards/VisualizationJSONCombinedORM/mean": 0.5610204339027405, "rewards/VisualizationJSONCombinedORM/std": 0.23152108490467072, "step": 1506, "train_speed(iter/s)": 0.138705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/mean_length": 224.8125, "completions/min_length": 200.0, "epoch": 1.2464846980976012, "grad_norm": 0.1697482466697693, "kl": 0.05975341796875, "learning_rate": 3.7361451996501997e-06, "loss": 0.0005976259708404541, "memory(GiB)": 38.05, "reward": 0.49469253420829773, "reward_std": 0.08932249993085861, "rewards/VisualizationJSONCombinedORM/mean": 0.49469253420829773, "rewards/VisualizationJSONCombinedORM/std": 0.21839648485183716, "step": 1507, "train_speed(iter/s)": 0.13852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/mean_length": 238.3125, "completions/min_length": 194.0, "epoch": 1.2473118279569892, "grad_norm": 0.21914443373680115, "kl": 0.068359375, "learning_rate": 3.7291622062398523e-06, "loss": 0.000682394951581955, "memory(GiB)": 38.05, "reward": 0.401627779006958, "reward_std": 0.047870732843875885, "rewards/VisualizationJSONCombinedORM/mean": 0.401627779006958, "rewards/VisualizationJSONCombinedORM/std": 0.1289246529340744, "step": 1508, "train_speed(iter/s)": 0.138282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/mean_length": 220.75, "completions/min_length": 197.0, "epoch": 1.2481389578163773, "grad_norm": 0.21510404348373413, "kl": 0.0733642578125, "learning_rate": 3.722181861768824e-06, "loss": 0.0007345341145992279, "memory(GiB)": 38.05, "reward": 0.3965959846973419, "reward_std": 0.08170989155769348, "rewards/VisualizationJSONCombinedORM/mean": 0.3965959846973419, "rewards/VisualizationJSONCombinedORM/std": 0.11855700612068176, "step": 1509, "train_speed(iter/s)": 0.138121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/mean_length": 235.375, "completions/min_length": 204.0, "epoch": 1.2489660876757651, "grad_norm": 0.14680416882038116, "kl": 0.05352783203125, "learning_rate": 3.7152041807869744e-06, "loss": 0.0005357712507247925, "memory(GiB)": 38.05, "reward": 0.5082008242607117, "reward_std": 0.07270935922861099, "rewards/VisualizationJSONCombinedORM/mean": 0.5082008242607117, "rewards/VisualizationJSONCombinedORM/std": 0.17609147727489471, "step": 1510, "train_speed(iter/s)": 0.137891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 235.375, "completions/min_length": 214.0, "epoch": 1.249793217535153, "grad_norm": 0.15654800832271576, "kl": 0.0972900390625, "learning_rate": 3.7082291778386077e-06, "loss": 0.0009699556976556778, "memory(GiB)": 38.05, "reward": 0.637959361076355, "reward_std": 0.09323175996541977, "rewards/VisualizationJSONCombinedORM/mean": 0.637959361076355, "rewards/VisualizationJSONCombinedORM/std": 0.1697029322385788, "step": 1511, "train_speed(iter/s)": 0.137702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/mean_length": 235.25, "completions/min_length": 187.0, "epoch": 1.250620347394541, "grad_norm": 0.1837310642004013, "kl": 0.062255859375, "learning_rate": 3.7012568674624473e-06, "loss": 0.0006218738853931427, "memory(GiB)": 38.05, "reward": 0.703896164894104, "reward_std": 0.051350854337215424, "rewards/VisualizationJSONCombinedORM/mean": 0.703896164894104, "rewards/VisualizationJSONCombinedORM/std": 0.14264324307441711, "step": 1512, "train_speed(iter/s)": 0.137523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/mean_length": 233.9375, "completions/min_length": 188.0, "epoch": 1.2514474772539288, "grad_norm": 0.20265743136405945, "kl": 0.07763671875, "learning_rate": 3.6942872641916034e-06, "loss": 0.0007770583033561707, "memory(GiB)": 38.05, "reward": 0.4930011034011841, "reward_std": 0.06914617866277695, "rewards/VisualizationJSONCombinedORM/mean": 0.4930011034011841, "rewards/VisualizationJSONCombinedORM/std": 0.18972325325012207, "step": 1513, "train_speed(iter/s)": 0.137377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 247.5, "completions/min_length": 199.0, "epoch": 1.2522746071133168, "grad_norm": 0.18425703048706055, "kl": 0.07861328125, "learning_rate": 3.6873203825535473e-06, "loss": 0.0007871240377426147, "memory(GiB)": 38.05, "reward": 0.5078756809234619, "reward_std": 0.07309404015541077, "rewards/VisualizationJSONCombinedORM/mean": 0.5078756809234619, "rewards/VisualizationJSONCombinedORM/std": 0.217641681432724, "step": 1514, "train_speed(iter/s)": 0.137164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 227.625, "completions/min_length": 196.0, "epoch": 1.2531017369727047, "grad_norm": 0.19254820048809052, "kl": 0.0606689453125, "learning_rate": 3.6803562370700745e-06, "loss": 0.0006073415279388428, "memory(GiB)": 38.05, "reward": 0.5469245910644531, "reward_std": 0.09647861123085022, "rewards/VisualizationJSONCombinedORM/mean": 0.5469245910644531, "rewards/VisualizationJSONCombinedORM/std": 0.18871454894542694, "step": 1515, "train_speed(iter/s)": 0.136957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 248.3125, "completions/min_length": 185.0, "epoch": 1.2539288668320927, "grad_norm": 0.168950155377388, "kl": 0.042236328125, "learning_rate": 3.673394842257275e-06, "loss": 0.0004234425723552704, "memory(GiB)": 38.05, "reward": 0.697216808795929, "reward_std": 0.07753563672304153, "rewards/VisualizationJSONCombinedORM/mean": 0.697216808795929, "rewards/VisualizationJSONCombinedORM/std": 0.09952481091022491, "step": 1516, "train_speed(iter/s)": 0.136746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/mean_length": 213.6875, "completions/min_length": 190.0, "epoch": 1.2547559966914805, "grad_norm": 0.2669867277145386, "kl": 0.0543212890625, "learning_rate": 3.6664362126255087e-06, "loss": 0.0005432441830635071, "memory(GiB)": 38.05, "reward": 0.6308022737503052, "reward_std": 0.15583398938179016, "rewards/VisualizationJSONCombinedORM/mean": 0.6308022737503052, "rewards/VisualizationJSONCombinedORM/std": 0.1512131690979004, "step": 1517, "train_speed(iter/s)": 0.136528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 245.75, "completions/min_length": 210.0, "epoch": 1.2555831265508686, "grad_norm": 0.1813296526670456, "kl": 0.08251953125, "learning_rate": 3.659480362679371e-06, "loss": 0.0008254051208496094, "memory(GiB)": 38.05, "reward": 0.3959074020385742, "reward_std": 0.03261104226112366, "rewards/VisualizationJSONCombinedORM/mean": 0.3959074020385742, "rewards/VisualizationJSONCombinedORM/std": 0.044309817254543304, "step": 1518, "train_speed(iter/s)": 0.136309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/mean_length": 230.9375, "completions/min_length": 193.0, "epoch": 1.2564102564102564, "grad_norm": 0.19462373852729797, "kl": 0.0802001953125, "learning_rate": 3.652527306917663e-06, "loss": 0.0008017197251319885, "memory(GiB)": 38.05, "reward": 0.4208086133003235, "reward_std": 0.11223220825195312, "rewards/VisualizationJSONCombinedORM/mean": 0.4208086133003235, "rewards/VisualizationJSONCombinedORM/std": 0.1316501945257187, "step": 1519, "train_speed(iter/s)": 0.136058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/mean_length": 246.9375, "completions/min_length": 217.0, "epoch": 1.2572373862696442, "grad_norm": 0.18114501237869263, "kl": 0.08197021484375, "learning_rate": 3.6455770598333633e-06, "loss": 0.0008197834249585867, "memory(GiB)": 38.05, "reward": 0.6698660850524902, "reward_std": 0.08090952038764954, "rewards/VisualizationJSONCombinedORM/mean": 0.6698660850524902, "rewards/VisualizationJSONCombinedORM/std": 0.08582800626754761, "step": 1520, "train_speed(iter/s)": 0.135927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/mean_length": 230.125, "completions/min_length": 205.0, "epoch": 1.2580645161290323, "grad_norm": 0.1264146864414215, "kl": 0.03778076171875, "learning_rate": 3.638629635913592e-06, "loss": 0.00037799030542373657, "memory(GiB)": 38.05, "reward": 0.718921959400177, "reward_std": 0.10565553605556488, "rewards/VisualizationJSONCombinedORM/mean": 0.718921959400177, "rewards/VisualizationJSONCombinedORM/std": 0.12518414855003357, "step": 1521, "train_speed(iter/s)": 0.135816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 229.4375, "completions/min_length": 194.0, "epoch": 1.2588916459884203, "grad_norm": 0.19913312792778015, "kl": 0.071533203125, "learning_rate": 3.6316850496395863e-06, "loss": 0.0007163956761360168, "memory(GiB)": 38.05, "reward": 0.5919985771179199, "reward_std": 0.08389703929424286, "rewards/VisualizationJSONCombinedORM/mean": 0.5919985771179199, "rewards/VisualizationJSONCombinedORM/std": 0.20062711834907532, "step": 1522, "train_speed(iter/s)": 0.135605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 263.75, "completions/min_length": 227.0, "epoch": 1.2597187758478081, "grad_norm": 0.21920792758464813, "kl": 0.05352783203125, "learning_rate": 3.6247433154866707e-06, "loss": 0.0005342084914445877, "memory(GiB)": 38.05, "reward": 0.5083657503128052, "reward_std": 0.07964207231998444, "rewards/VisualizationJSONCombinedORM/mean": 0.5083657503128052, "rewards/VisualizationJSONCombinedORM/std": 0.1840757131576538, "step": 1523, "train_speed(iter/s)": 0.135463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 240.75, "completions/min_length": 183.0, "epoch": 1.260545905707196, "grad_norm": 0.16225101053714752, "kl": 0.0657958984375, "learning_rate": 3.6178044479242256e-06, "loss": 0.0006572157144546509, "memory(GiB)": 38.05, "reward": 0.27804601192474365, "reward_std": 0.020304720848798752, "rewards/VisualizationJSONCombinedORM/mean": 0.27804601192474365, "rewards/VisualizationJSONCombinedORM/std": 0.10197319090366364, "step": 1524, "train_speed(iter/s)": 0.13528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/mean_length": 217.0, "completions/min_length": 182.0, "epoch": 1.261373035566584, "grad_norm": 0.2050008773803711, "kl": 0.0877685546875, "learning_rate": 3.6108684614156487e-06, "loss": 0.000878460705280304, "memory(GiB)": 38.05, "reward": 0.4213084876537323, "reward_std": 0.07275277376174927, "rewards/VisualizationJSONCombinedORM/mean": 0.4213084876537323, "rewards/VisualizationJSONCombinedORM/std": 0.07246450334787369, "step": 1525, "train_speed(iter/s)": 0.135073 }, { "epoch": 1.261373035566584, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 273.3333333333333, "eval_completions/mean_length": 235.80208333333334, "eval_completions/min_length": 205.66666666666666, "eval_kl": 0.07415771484375, "eval_loss": 0.0007435579900629818, "eval_reward": 0.4453504861642917, "eval_reward_std": 0.0674787536651517, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4453504861642917, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06747875618748367, "eval_runtime": 255.8536, "eval_samples_per_second": 0.094, "eval_steps_per_second": 0.012, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 228.125, "completions/min_length": 205.0, "epoch": 1.2622001654259718, "grad_norm": 0.18903549015522003, "kl": 0.04791259765625, "learning_rate": 3.603935370418342e-06, "loss": 0.0004793591797351837, "memory(GiB)": 38.05, "reward": 0.7081068754196167, "reward_std": 0.09257794916629791, "rewards/VisualizationJSONCombinedORM/mean": 0.7081068754196167, "rewards/VisualizationJSONCombinedORM/std": 0.0914873406291008, "step": 1526, "train_speed(iter/s)": 0.131871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/mean_length": 233.5625, "completions/min_length": 203.0, "epoch": 1.2630272952853598, "grad_norm": 0.18784156441688538, "kl": 0.1195068359375, "learning_rate": 3.5970051893836667e-06, "loss": 0.0011946549639105797, "memory(GiB)": 38.05, "reward": 0.638974130153656, "reward_std": 0.0853767916560173, "rewards/VisualizationJSONCombinedORM/mean": 0.638974130153656, "rewards/VisualizationJSONCombinedORM/std": 0.10182946920394897, "step": 1527, "train_speed(iter/s)": 0.13171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 237.0, "completions/min_length": 201.0, "epoch": 1.2638544251447477, "grad_norm": 0.22096703946590424, "kl": 0.0804443359375, "learning_rate": 3.59007793275692e-06, "loss": 0.0008050203323364258, "memory(GiB)": 38.05, "reward": 0.3033612072467804, "reward_std": 0.04527409002184868, "rewards/VisualizationJSONCombinedORM/mean": 0.3033612072467804, "rewards/VisualizationJSONCombinedORM/std": 0.0743410512804985, "step": 1528, "train_speed(iter/s)": 0.131504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 235.9375, "completions/min_length": 206.0, "epoch": 1.2646815550041357, "grad_norm": 0.2245538979768753, "kl": 0.0760498046875, "learning_rate": 3.5831536149772993e-06, "loss": 0.0007606223225593567, "memory(GiB)": 38.05, "reward": 0.4110117256641388, "reward_std": 0.0666038766503334, "rewards/VisualizationJSONCombinedORM/mean": 0.4110117256641388, "rewards/VisualizationJSONCombinedORM/std": 0.16459745168685913, "step": 1529, "train_speed(iter/s)": 0.131368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 246.4375, "completions/min_length": 184.0, "epoch": 1.2655086848635235, "grad_norm": 0.18361124396324158, "kl": 0.05645751953125, "learning_rate": 3.5762322504778846e-06, "loss": 0.0005646552890539169, "memory(GiB)": 38.05, "reward": 0.45036444067955017, "reward_std": 0.053054243326187134, "rewards/VisualizationJSONCombinedORM/mean": 0.45036444067955017, "rewards/VisualizationJSONCombinedORM/std": 0.05835072696208954, "step": 1530, "train_speed(iter/s)": 0.131181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/mean_length": 223.5625, "completions/min_length": 188.0, "epoch": 1.2663358147229116, "grad_norm": 0.21798479557037354, "kl": 0.07159423828125, "learning_rate": 3.5693138536855927e-06, "loss": 0.000715993344783783, "memory(GiB)": 38.05, "reward": 0.4670136868953705, "reward_std": 0.09603134542703629, "rewards/VisualizationJSONCombinedORM/mean": 0.4670136868953705, "rewards/VisualizationJSONCombinedORM/std": 0.11477963626384735, "step": 1531, "train_speed(iter/s)": 0.131058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 229.3125, "completions/min_length": 182.0, "epoch": 1.2671629445822994, "grad_norm": 0.155789315700531, "kl": 0.05096435546875, "learning_rate": 3.5623984390211597e-06, "loss": 0.0005091801285743713, "memory(GiB)": 38.05, "reward": 0.6610591411590576, "reward_std": 0.1034197136759758, "rewards/VisualizationJSONCombinedORM/mean": 0.6610591411590576, "rewards/VisualizationJSONCombinedORM/std": 0.10592283308506012, "step": 1532, "train_speed(iter/s)": 0.130914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 238.4375, "completions/min_length": 184.0, "epoch": 1.2679900744416872, "grad_norm": 0.15878625214099884, "kl": 0.0596923828125, "learning_rate": 3.5554860208991015e-06, "loss": 0.0005970783531665802, "memory(GiB)": 38.05, "reward": 0.6432841420173645, "reward_std": 0.07080718874931335, "rewards/VisualizationJSONCombinedORM/mean": 0.6432841420173645, "rewards/VisualizationJSONCombinedORM/std": 0.13387422263622284, "step": 1533, "train_speed(iter/s)": 0.130733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 222.3125, "completions/min_length": 182.0, "epoch": 1.2688172043010753, "grad_norm": 0.18490153551101685, "kl": 0.085205078125, "learning_rate": 3.5485766137276894e-06, "loss": 0.0008512362837791443, "memory(GiB)": 38.05, "reward": 0.7076157331466675, "reward_std": 0.03603249043226242, "rewards/VisualizationJSONCombinedORM/mean": 0.7076157331466675, "rewards/VisualizationJSONCombinedORM/std": 0.035272713750600815, "step": 1534, "train_speed(iter/s)": 0.130523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 245.3125, "completions/min_length": 209.0, "epoch": 1.2696443341604633, "grad_norm": 0.18932518362998962, "kl": 0.06817626953125, "learning_rate": 3.5416702319089185e-06, "loss": 0.0006806263700127602, "memory(GiB)": 38.05, "reward": 0.5098048448562622, "reward_std": 0.09666180610656738, "rewards/VisualizationJSONCombinedORM/mean": 0.5098048448562622, "rewards/VisualizationJSONCombinedORM/std": 0.13414259254932404, "step": 1535, "train_speed(iter/s)": 0.130355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/mean_length": 229.0, "completions/min_length": 184.0, "epoch": 1.2704714640198511, "grad_norm": 0.13648371398448944, "kl": 0.03826904296875, "learning_rate": 3.5347668898384805e-06, "loss": 0.0003828108310699463, "memory(GiB)": 38.05, "reward": 0.6291036605834961, "reward_std": 0.09087613224983215, "rewards/VisualizationJSONCombinedORM/mean": 0.6291036605834961, "rewards/VisualizationJSONCombinedORM/std": 0.15208104252815247, "step": 1536, "train_speed(iter/s)": 0.130157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/mean_length": 227.4375, "completions/min_length": 193.0, "epoch": 1.271298593879239, "grad_norm": 0.22285130620002747, "kl": 0.07098388671875, "learning_rate": 3.5278666019057294e-06, "loss": 0.0007105879485607147, "memory(GiB)": 38.05, "reward": 0.48248928785324097, "reward_std": 0.12316592037677765, "rewards/VisualizationJSONCombinedORM/mean": 0.48248928785324097, "rewards/VisualizationJSONCombinedORM/std": 0.15066464245319366, "step": 1537, "train_speed(iter/s)": 0.130029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/mean_length": 238.75, "completions/min_length": 190.0, "epoch": 1.272125723738627, "grad_norm": 0.255278617143631, "kl": 0.050537109375, "learning_rate": 3.5209693824936486e-06, "loss": 0.0005056317895650864, "memory(GiB)": 38.05, "reward": 0.5835089683532715, "reward_std": 0.11971978098154068, "rewards/VisualizationJSONCombinedORM/mean": 0.5835089683532715, "rewards/VisualizationJSONCombinedORM/std": 0.13201183080673218, "step": 1538, "train_speed(iter/s)": 0.129851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 254.5, "completions/min_length": 205.0, "epoch": 1.272952853598015, "grad_norm": 0.18732251226902008, "kl": 0.0740966796875, "learning_rate": 3.514075245978833e-06, "loss": 0.0007418617606163025, "memory(GiB)": 38.05, "reward": 0.5046420097351074, "reward_std": 0.06641025841236115, "rewards/VisualizationJSONCombinedORM/mean": 0.5046420097351074, "rewards/VisualizationJSONCombinedORM/std": 0.09057389199733734, "step": 1539, "train_speed(iter/s)": 0.129632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 249.8125, "completions/min_length": 217.0, "epoch": 1.2737799834574028, "grad_norm": 0.18588054180145264, "kl": 0.05584716796875, "learning_rate": 3.5071842067314453e-06, "loss": 0.0005589351058006287, "memory(GiB)": 38.05, "reward": 0.5165489315986633, "reward_std": 0.10854898393154144, "rewards/VisualizationJSONCombinedORM/mean": 0.5165489315986633, "rewards/VisualizationJSONCombinedORM/std": 0.12310308963060379, "step": 1540, "train_speed(iter/s)": 0.129475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/mean_length": 219.375, "completions/min_length": 198.0, "epoch": 1.2746071133167907, "grad_norm": 0.2257649302482605, "kl": 0.06817626953125, "learning_rate": 3.5002962791151994e-06, "loss": 0.0006835311651229858, "memory(GiB)": 38.05, "reward": 0.6636538505554199, "reward_std": 0.09708484262228012, "rewards/VisualizationJSONCombinedORM/mean": 0.6636538505554199, "rewards/VisualizationJSONCombinedORM/std": 0.09542101621627808, "step": 1541, "train_speed(iter/s)": 0.129301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/mean_length": 227.0, "completions/min_length": 184.0, "epoch": 1.2754342431761787, "grad_norm": 0.19326263666152954, "kl": 0.055908203125, "learning_rate": 3.4934114774873153e-06, "loss": 0.0005592107772827148, "memory(GiB)": 38.05, "reward": 0.46817460656166077, "reward_std": 0.0642094612121582, "rewards/VisualizationJSONCombinedORM/mean": 0.46817460656166077, "rewards/VisualizationJSONCombinedORM/std": 0.0651574656367302, "step": 1542, "train_speed(iter/s)": 0.129137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 245.4375, "completions/min_length": 214.0, "epoch": 1.2762613730355665, "grad_norm": 0.20000971853733063, "kl": 0.0662841796875, "learning_rate": 3.486529816198501e-06, "loss": 0.0006608515977859497, "memory(GiB)": 38.05, "reward": 0.6406911611557007, "reward_std": 0.09292763471603394, "rewards/VisualizationJSONCombinedORM/mean": 0.6406911611557007, "rewards/VisualizationJSONCombinedORM/std": 0.1159181296825409, "step": 1543, "train_speed(iter/s)": 0.128971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 241.5625, "completions/min_length": 198.0, "epoch": 1.2770885028949546, "grad_norm": 0.17566019296646118, "kl": 0.0517578125, "learning_rate": 3.4796513095929178e-06, "loss": 0.0005178302526473999, "memory(GiB)": 38.05, "reward": 0.7180539965629578, "reward_std": 0.09748479723930359, "rewards/VisualizationJSONCombinedORM/mean": 0.7180539965629578, "rewards/VisualizationJSONCombinedORM/std": 0.0961553230881691, "step": 1544, "train_speed(iter/s)": 0.128797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 255.125, "completions/min_length": 213.0, "epoch": 1.2779156327543424, "grad_norm": 0.20113150775432587, "kl": 0.08154296875, "learning_rate": 3.4727759720081553e-06, "loss": 0.0008141808211803436, "memory(GiB)": 38.05, "reward": 0.4028090536594391, "reward_std": 0.09001868218183517, "rewards/VisualizationJSONCombinedORM/mean": 0.4028090536594391, "rewards/VisualizationJSONCombinedORM/std": 0.2238118052482605, "step": 1545, "train_speed(iter/s)": 0.128554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 225.125, "completions/min_length": 198.0, "epoch": 1.2787427626137304, "grad_norm": 0.18415731191635132, "kl": 0.0474853515625, "learning_rate": 3.4659038177751918e-06, "loss": 0.0004745796322822571, "memory(GiB)": 38.05, "reward": 0.40769171714782715, "reward_std": 0.05159665644168854, "rewards/VisualizationJSONCombinedORM/mean": 0.40769171714782715, "rewards/VisualizationJSONCombinedORM/std": 0.25638577342033386, "step": 1546, "train_speed(iter/s)": 0.128418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/mean_length": 225.6875, "completions/min_length": 204.0, "epoch": 1.2795698924731183, "grad_norm": 0.1994381546974182, "kl": 0.04364013671875, "learning_rate": 3.4590348612183723e-06, "loss": 0.0004362724721431732, "memory(GiB)": 38.05, "reward": 0.49400991201400757, "reward_std": 0.09142234921455383, "rewards/VisualizationJSONCombinedORM/mean": 0.49400991201400757, "rewards/VisualizationJSONCombinedORM/std": 0.1315440982580185, "step": 1547, "train_speed(iter/s)": 0.128254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 246.5, "completions/min_length": 200.0, "epoch": 1.2803970223325063, "grad_norm": 0.16841459274291992, "kl": 0.10693359375, "learning_rate": 3.4521691166553777e-06, "loss": 0.0010702461004257202, "memory(GiB)": 38.05, "reward": 0.4411141872406006, "reward_std": 0.08341469615697861, "rewards/VisualizationJSONCombinedORM/mean": 0.4411141872406006, "rewards/VisualizationJSONCombinedORM/std": 0.20360028743743896, "step": 1548, "train_speed(iter/s)": 0.128108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 223.25, "completions/min_length": 191.0, "epoch": 1.2812241521918941, "grad_norm": 0.1932833194732666, "kl": 0.082275390625, "learning_rate": 3.4453065983971952e-06, "loss": 0.0008227676153182983, "memory(GiB)": 38.05, "reward": 0.6604098081588745, "reward_std": 0.10258319973945618, "rewards/VisualizationJSONCombinedORM/mean": 0.6604098081588745, "rewards/VisualizationJSONCombinedORM/std": 0.12654395401477814, "step": 1549, "train_speed(iter/s)": 0.127903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 264.75, "completions/min_length": 219.0, "epoch": 1.282051282051282, "grad_norm": 0.15286290645599365, "kl": 0.0887451171875, "learning_rate": 3.438447320748082e-06, "loss": 0.0008883979171514511, "memory(GiB)": 38.05, "reward": 0.6489535570144653, "reward_std": 0.10258186608552933, "rewards/VisualizationJSONCombinedORM/mean": 0.6489535570144653, "rewards/VisualizationJSONCombinedORM/std": 0.1031995415687561, "step": 1550, "train_speed(iter/s)": 0.127715 }, { "epoch": 1.282051282051282, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 274.4166666666667, "eval_completions/mean_length": 236.80729166666666, "eval_completions/min_length": 207.25, "eval_kl": 0.056884765625, "eval_loss": 0.0005725957453250885, "eval_reward": 0.4447250614563624, "eval_reward_std": 0.06523396109696478, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4447250614563624, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06523396563716233, "eval_runtime": 256.4632, "eval_samples_per_second": 0.094, "eval_steps_per_second": 0.012, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 236.6875, "completions/min_length": 190.0, "epoch": 1.28287841191067, "grad_norm": 0.21416929364204407, "kl": 0.05792236328125, "learning_rate": 3.4315912980055433e-06, "loss": 0.0005789683200418949, "memory(GiB)": 38.05, "reward": 0.33207613229751587, "reward_std": 0.035318709909915924, "rewards/VisualizationJSONCombinedORM/mean": 0.33207613229751587, "rewards/VisualizationJSONCombinedORM/std": 0.0993456020951271, "step": 1551, "train_speed(iter/s)": 0.124922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 220.1875, "completions/min_length": 179.0, "epoch": 1.283705541770058, "grad_norm": 0.21679851412773132, "kl": 0.06707763671875, "learning_rate": 3.424738544460302e-06, "loss": 0.0006692409515380859, "memory(GiB)": 38.05, "reward": 0.3813064694404602, "reward_std": 0.06642486155033112, "rewards/VisualizationJSONCombinedORM/mean": 0.3813064694404602, "rewards/VisualizationJSONCombinedORM/std": 0.07829002290964127, "step": 1552, "train_speed(iter/s)": 0.124739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 238.875, "completions/min_length": 210.0, "epoch": 1.2845326716294458, "grad_norm": 0.16305924952030182, "kl": 0.0543212890625, "learning_rate": 3.4178890743962635e-06, "loss": 0.0005436241626739502, "memory(GiB)": 38.05, "reward": 0.5338683128356934, "reward_std": 0.07447407394647598, "rewards/VisualizationJSONCombinedORM/mean": 0.5338683128356934, "rewards/VisualizationJSONCombinedORM/std": 0.12509536743164062, "step": 1553, "train_speed(iter/s)": 0.124565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 246.5, "completions/min_length": 214.0, "epoch": 1.2853598014888337, "grad_norm": 0.22176305949687958, "kl": 0.05670166015625, "learning_rate": 3.4110429020904924e-06, "loss": 0.0005658082664012909, "memory(GiB)": 38.05, "reward": 0.3547549247741699, "reward_std": 0.051265791058540344, "rewards/VisualizationJSONCombinedORM/mean": 0.3547549247741699, "rewards/VisualizationJSONCombinedORM/std": 0.0895884782075882, "step": 1554, "train_speed(iter/s)": 0.124418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 250.25, "completions/min_length": 218.0, "epoch": 1.2861869313482217, "grad_norm": 0.23835329711437225, "kl": 0.05926513671875, "learning_rate": 3.404200041813175e-06, "loss": 0.0005919784307479858, "memory(GiB)": 38.05, "reward": 0.3831283450126648, "reward_std": 0.07051113247871399, "rewards/VisualizationJSONCombinedORM/mean": 0.3831283450126648, "rewards/VisualizationJSONCombinedORM/std": 0.08013920485973358, "step": 1555, "train_speed(iter/s)": 0.124218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 248.5, "completions/min_length": 193.0, "epoch": 1.2870140612076095, "grad_norm": 0.21674180030822754, "kl": 0.0911865234375, "learning_rate": 3.3973605078275955e-06, "loss": 0.0009125005453824997, "memory(GiB)": 38.05, "reward": 0.7414808869361877, "reward_std": 0.10884062945842743, "rewards/VisualizationJSONCombinedORM/mean": 0.7414808869361877, "rewards/VisualizationJSONCombinedORM/std": 0.11321558058261871, "step": 1556, "train_speed(iter/s)": 0.124058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 256.9375, "completions/min_length": 192.0, "epoch": 1.2878411910669976, "grad_norm": 0.1905432492494583, "kl": 0.06005859375, "learning_rate": 3.3905243143901085e-06, "loss": 0.0006009265780448914, "memory(GiB)": 38.05, "reward": 0.5307589769363403, "reward_std": 0.08315201848745346, "rewards/VisualizationJSONCombinedORM/mean": 0.5307589769363403, "rewards/VisualizationJSONCombinedORM/std": 0.14178435504436493, "step": 1557, "train_speed(iter/s)": 0.123853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/mean_length": 222.3125, "completions/min_length": 187.0, "epoch": 1.2886683209263854, "grad_norm": 0.18666543066501617, "kl": 0.0811767578125, "learning_rate": 3.3836914757501023e-06, "loss": 0.00081302085891366, "memory(GiB)": 38.05, "reward": 0.4144420921802521, "reward_std": 0.05095238983631134, "rewards/VisualizationJSONCombinedORM/mean": 0.4144420921802521, "rewards/VisualizationJSONCombinedORM/std": 0.10551252216100693, "step": 1558, "train_speed(iter/s)": 0.123704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/mean_length": 246.6875, "completions/min_length": 204.0, "epoch": 1.2894954507857734, "grad_norm": 0.16405034065246582, "kl": 0.06207275390625, "learning_rate": 3.37686200614997e-06, "loss": 0.0006204694509506226, "memory(GiB)": 38.05, "reward": 0.5235767364501953, "reward_std": 0.0748400092124939, "rewards/VisualizationJSONCombinedORM/mean": 0.5235767364501953, "rewards/VisualizationJSONCombinedORM/std": 0.09214357286691666, "step": 1559, "train_speed(iter/s)": 0.123556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/mean_length": 242.25, "completions/min_length": 198.0, "epoch": 1.2903225806451613, "grad_norm": 0.16390551626682281, "kl": 0.076416015625, "learning_rate": 3.3700359198250854e-06, "loss": 0.0007651969790458679, "memory(GiB)": 38.05, "reward": 0.452932208776474, "reward_std": 0.06699810177087784, "rewards/VisualizationJSONCombinedORM/mean": 0.452932208776474, "rewards/VisualizationJSONCombinedORM/std": 0.09530685842037201, "step": 1560, "train_speed(iter/s)": 0.123382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 252.75, "completions/min_length": 195.0, "epoch": 1.2911497105045493, "grad_norm": 0.18182218074798584, "kl": 0.05609130859375, "learning_rate": 3.3632132310037728e-06, "loss": 0.0005605071783065796, "memory(GiB)": 38.05, "reward": 0.5884054899215698, "reward_std": 0.0693311095237732, "rewards/VisualizationJSONCombinedORM/mean": 0.5884054899215698, "rewards/VisualizationJSONCombinedORM/std": 0.14114513993263245, "step": 1561, "train_speed(iter/s)": 0.123248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/mean_length": 245.25, "completions/min_length": 201.0, "epoch": 1.2919768403639371, "grad_norm": 0.18563483655452728, "kl": 0.1094970703125, "learning_rate": 3.356393953907271e-06, "loss": 0.0010926425457000732, "memory(GiB)": 38.05, "reward": 0.5524342656135559, "reward_std": 0.11108715832233429, "rewards/VisualizationJSONCombinedORM/mean": 0.5524342656135559, "rewards/VisualizationJSONCombinedORM/std": 0.17107801139354706, "step": 1562, "train_speed(iter/s)": 0.123125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/mean_length": 217.5, "completions/min_length": 195.0, "epoch": 1.292803970223325, "grad_norm": 0.17879348993301392, "kl": 0.0693359375, "learning_rate": 3.3495781027497043e-06, "loss": 0.000694863498210907, "memory(GiB)": 38.05, "reward": 0.6843901872634888, "reward_std": 0.06349492818117142, "rewards/VisualizationJSONCombinedORM/mean": 0.6843901872634888, "rewards/VisualizationJSONCombinedORM/std": 0.10114534944295883, "step": 1563, "train_speed(iter/s)": 0.123004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/mean_length": 226.1875, "completions/min_length": 192.0, "epoch": 1.293631100082713, "grad_norm": 0.208063542842865, "kl": 0.052490234375, "learning_rate": 3.342765691738064e-06, "loss": 0.0005250722169876099, "memory(GiB)": 38.05, "reward": 0.4886697232723236, "reward_std": 0.07262299954891205, "rewards/VisualizationJSONCombinedORM/mean": 0.4886697232723236, "rewards/VisualizationJSONCombinedORM/std": 0.07796458899974823, "step": 1564, "train_speed(iter/s)": 0.122889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/mean_length": 242.8125, "completions/min_length": 204.0, "epoch": 1.294458229942101, "grad_norm": 0.22973091900348663, "kl": 0.1070556640625, "learning_rate": 3.3359567350721657e-06, "loss": 0.0010727979242801666, "memory(GiB)": 38.05, "reward": 0.3570113182067871, "reward_std": 0.06823644042015076, "rewards/VisualizationJSONCombinedORM/mean": 0.3570113182067871, "rewards/VisualizationJSONCombinedORM/std": 0.12408173084259033, "step": 1565, "train_speed(iter/s)": 0.122752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/mean_length": 246.875, "completions/min_length": 209.0, "epoch": 1.2952853598014888, "grad_norm": 0.17424403131008148, "kl": 0.09271240234375, "learning_rate": 3.3291512469446253e-06, "loss": 0.0009277760982513428, "memory(GiB)": 38.05, "reward": 0.6706947088241577, "reward_std": 0.08859014511108398, "rewards/VisualizationJSONCombinedORM/mean": 0.6706947088241577, "rewards/VisualizationJSONCombinedORM/std": 0.12457627058029175, "step": 1566, "train_speed(iter/s)": 0.122569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/mean_length": 222.9375, "completions/min_length": 180.0, "epoch": 1.2961124896608767, "grad_norm": 0.18862134218215942, "kl": 0.067138671875, "learning_rate": 3.322349241540827e-06, "loss": 0.0006704479455947876, "memory(GiB)": 38.05, "reward": 0.2945661246776581, "reward_std": 0.03376452624797821, "rewards/VisualizationJSONCombinedORM/mean": 0.2945661246776581, "rewards/VisualizationJSONCombinedORM/std": 0.05039959400892258, "step": 1567, "train_speed(iter/s)": 0.122403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 241.375, "completions/min_length": 206.0, "epoch": 1.2969396195202647, "grad_norm": 0.16076481342315674, "kl": 0.074951171875, "learning_rate": 3.3155507330389004e-06, "loss": 0.0007487507537007332, "memory(GiB)": 38.05, "reward": 0.348196804523468, "reward_std": 0.03334871307015419, "rewards/VisualizationJSONCombinedORM/mean": 0.348196804523468, "rewards/VisualizationJSONCombinedORM/std": 0.07481665909290314, "step": 1568, "train_speed(iter/s)": 0.122252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/mean_length": 237.1875, "completions/min_length": 202.0, "epoch": 1.2977667493796525, "grad_norm": 0.1823417842388153, "kl": 0.111572265625, "learning_rate": 3.308755735609682e-06, "loss": 0.0011148899793624878, "memory(GiB)": 38.05, "reward": 0.5042260885238647, "reward_std": 0.06191014498472214, "rewards/VisualizationJSONCombinedORM/mean": 0.5042260885238647, "rewards/VisualizationJSONCombinedORM/std": 0.27211812138557434, "step": 1569, "train_speed(iter/s)": 0.122107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/mean_length": 238.375, "completions/min_length": 202.0, "epoch": 1.2985938792390406, "grad_norm": 0.1690954566001892, "kl": 0.07342529296875, "learning_rate": 3.301964263416693e-06, "loss": 0.0007326751947402954, "memory(GiB)": 38.05, "reward": 0.7517420053482056, "reward_std": 0.07483146339654922, "rewards/VisualizationJSONCombinedORM/mean": 0.7517420053482056, "rewards/VisualizationJSONCombinedORM/std": 0.07315095514059067, "step": 1570, "train_speed(iter/s)": 0.121926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 253.75, "completions/min_length": 213.0, "epoch": 1.2994210090984284, "grad_norm": 0.16558575630187988, "kl": 0.0880126953125, "learning_rate": 3.295176330616105e-06, "loss": 0.0008785352110862732, "memory(GiB)": 38.05, "reward": 0.3879125416278839, "reward_std": 0.07130933552980423, "rewards/VisualizationJSONCombinedORM/mean": 0.3879125416278839, "rewards/VisualizationJSONCombinedORM/std": 0.1136530265212059, "step": 1571, "train_speed(iter/s)": 0.121781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 257.9375, "completions/min_length": 231.0, "epoch": 1.3002481389578164, "grad_norm": 0.163213312625885, "kl": 0.0469970703125, "learning_rate": 3.2883919513567096e-06, "loss": 0.00046911463141441345, "memory(GiB)": 38.05, "reward": 0.4527122378349304, "reward_std": 0.06716637313365936, "rewards/VisualizationJSONCombinedORM/mean": 0.4527122378349304, "rewards/VisualizationJSONCombinedORM/std": 0.1954510509967804, "step": 1572, "train_speed(iter/s)": 0.121639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 235.75, "completions/min_length": 196.0, "epoch": 1.3010752688172043, "grad_norm": 0.1553584486246109, "kl": 0.0633544921875, "learning_rate": 3.281611139779894e-06, "loss": 0.0006331577897071838, "memory(GiB)": 38.05, "reward": 0.630575954914093, "reward_std": 0.07576338946819305, "rewards/VisualizationJSONCombinedORM/mean": 0.630575954914093, "rewards/VisualizationJSONCombinedORM/std": 0.14871883392333984, "step": 1573, "train_speed(iter/s)": 0.121497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 253.4375, "completions/min_length": 185.0, "epoch": 1.3019023986765923, "grad_norm": 0.18682928383350372, "kl": 0.0675048828125, "learning_rate": 3.2748339100196105e-06, "loss": 0.0006759054958820343, "memory(GiB)": 38.05, "reward": 0.5144533514976501, "reward_std": 0.0746038630604744, "rewards/VisualizationJSONCombinedORM/mean": 0.5144533514976501, "rewards/VisualizationJSONCombinedORM/std": 0.09672776609659195, "step": 1574, "train_speed(iter/s)": 0.121333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/mean_length": 241.5, "completions/min_length": 211.0, "epoch": 1.3027295285359801, "grad_norm": 0.18236137926578522, "kl": 0.07537841796875, "learning_rate": 3.268060276202344e-06, "loss": 0.0007534697651863098, "memory(GiB)": 38.05, "reward": 0.5230777263641357, "reward_std": 0.05739534646272659, "rewards/VisualizationJSONCombinedORM/mean": 0.5230777263641357, "rewards/VisualizationJSONCombinedORM/std": 0.20010748505592346, "step": 1575, "train_speed(iter/s)": 0.121158 }, { "epoch": 1.3027295285359801, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 286.375, "eval_completions/mean_length": 243.40104166666666, "eval_completions/min_length": 208.625, "eval_kl": 0.07313028971354167, "eval_loss": 0.0007357212598435581, "eval_reward": 0.4936448968946934, "eval_reward_std": 0.06632996210828424, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4936448968946934, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06632996218589444, "eval_runtime": 263.4309, "eval_samples_per_second": 0.091, "eval_steps_per_second": 0.011, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 270.5, "completions/min_length": 231.0, "epoch": 1.303556658395368, "grad_norm": 0.20665745437145233, "kl": 0.0621337890625, "learning_rate": 3.2612902524470803e-06, "loss": 0.0006198037881404161, "memory(GiB)": 38.05, "reward": 0.35585588216781616, "reward_std": 0.0464973971247673, "rewards/VisualizationJSONCombinedORM/mean": 0.35585588216781616, "rewards/VisualizationJSONCombinedORM/std": 0.0789548009634018, "step": 1576, "train_speed(iter/s)": 0.1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 232.125, "completions/min_length": 202.0, "epoch": 1.304383788254756, "grad_norm": 0.20214389264583588, "kl": 0.07952880859375, "learning_rate": 3.254523852865286e-06, "loss": 0.0007954202592372894, "memory(GiB)": 38.05, "reward": 0.6383330821990967, "reward_std": 0.09641572088003159, "rewards/VisualizationJSONCombinedORM/mean": 0.6383330821990967, "rewards/VisualizationJSONCombinedORM/std": 0.12978795170783997, "step": 1577, "train_speed(iter/s)": 0.118506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 247.75, "completions/min_length": 198.0, "epoch": 1.305210918114144, "grad_norm": 0.20386433601379395, "kl": 0.069091796875, "learning_rate": 3.2477610915608705e-06, "loss": 0.0006909891963005066, "memory(GiB)": 38.05, "reward": 0.6491484642028809, "reward_std": 0.1265871524810791, "rewards/VisualizationJSONCombinedORM/mean": 0.6491484642028809, "rewards/VisualizationJSONCombinedORM/std": 0.12251269072294235, "step": 1578, "train_speed(iter/s)": 0.118381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 256.75, "completions/min_length": 208.0, "epoch": 1.3060380479735318, "grad_norm": 0.18487469851970673, "kl": 0.0767822265625, "learning_rate": 3.241001982630163e-06, "loss": 0.0007688701152801514, "memory(GiB)": 38.05, "reward": 0.4824323058128357, "reward_std": 0.0764995664358139, "rewards/VisualizationJSONCombinedORM/mean": 0.4824323058128357, "rewards/VisualizationJSONCombinedORM/std": 0.1370820552110672, "step": 1579, "train_speed(iter/s)": 0.118259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/mean_length": 225.3125, "completions/min_length": 205.0, "epoch": 1.3068651778329197, "grad_norm": 0.21258139610290527, "kl": 0.095947265625, "learning_rate": 3.2342465401618715e-06, "loss": 0.0009581670165061951, "memory(GiB)": 38.05, "reward": 0.48742157220840454, "reward_std": 0.0800001323223114, "rewards/VisualizationJSONCombinedORM/mean": 0.48742157220840454, "rewards/VisualizationJSONCombinedORM/std": 0.09454334527254105, "step": 1580, "train_speed(iter/s)": 0.118134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/mean_length": 231.0625, "completions/min_length": 194.0, "epoch": 1.3076923076923077, "grad_norm": 0.14875394105911255, "kl": 0.07000732421875, "learning_rate": 3.2274947782370713e-06, "loss": 0.0007013268768787384, "memory(GiB)": 38.05, "reward": 0.6467646956443787, "reward_std": 0.0436505563557148, "rewards/VisualizationJSONCombinedORM/mean": 0.6467646956443787, "rewards/VisualizationJSONCombinedORM/std": 0.15095199644565582, "step": 1581, "train_speed(iter/s)": 0.11801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/mean_length": 223.8125, "completions/min_length": 186.0, "epoch": 1.3085194375516958, "grad_norm": 0.19139885902404785, "kl": 0.0477294921875, "learning_rate": 3.220746710929159e-06, "loss": 0.00047675520181655884, "memory(GiB)": 38.05, "reward": 0.423250675201416, "reward_std": 0.0578666552901268, "rewards/VisualizationJSONCombinedORM/mean": 0.423250675201416, "rewards/VisualizationJSONCombinedORM/std": 0.07040131837129593, "step": 1582, "train_speed(iter/s)": 0.117914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/mean_length": 239.5, "completions/min_length": 221.0, "epoch": 1.3093465674110836, "grad_norm": 0.15708822011947632, "kl": 0.109619140625, "learning_rate": 3.2140023523038355e-06, "loss": 0.0010952912271022797, "memory(GiB)": 38.05, "reward": 0.37903910875320435, "reward_std": 0.039842985570430756, "rewards/VisualizationJSONCombinedORM/mean": 0.37903910875320435, "rewards/VisualizationJSONCombinedORM/std": 0.07744867354631424, "step": 1583, "train_speed(iter/s)": 0.11778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 256.375, "completions/min_length": 200.0, "epoch": 1.3101736972704714, "grad_norm": 0.1588800847530365, "kl": 0.079345703125, "learning_rate": 3.207261716419067e-06, "loss": 0.0007927119731903076, "memory(GiB)": 38.05, "reward": 0.616835355758667, "reward_std": 0.0961313396692276, "rewards/VisualizationJSONCombinedORM/mean": 0.616835355758667, "rewards/VisualizationJSONCombinedORM/std": 0.14384499192237854, "step": 1584, "train_speed(iter/s)": 0.117649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 234.6875, "completions/min_length": 206.0, "epoch": 1.3110008271298594, "grad_norm": 0.18917478621006012, "kl": 0.075439453125, "learning_rate": 3.2005248173250593e-06, "loss": 0.0007539279758930206, "memory(GiB)": 38.05, "reward": 0.5671482682228088, "reward_std": 0.08727344125509262, "rewards/VisualizationJSONCombinedORM/mean": 0.5671482682228088, "rewards/VisualizationJSONCombinedORM/std": 0.12634147703647614, "step": 1585, "train_speed(iter/s)": 0.117507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 244.125, "completions/min_length": 209.0, "epoch": 1.3118279569892473, "grad_norm": 0.16822925209999084, "kl": 0.03558349609375, "learning_rate": 3.1937916690642356e-06, "loss": 0.00035579875111579895, "memory(GiB)": 38.05, "reward": 0.5954075455665588, "reward_std": 0.1355540156364441, "rewards/VisualizationJSONCombinedORM/mean": 0.5954075455665588, "rewards/VisualizationJSONCombinedORM/std": 0.13181130588054657, "step": 1586, "train_speed(iter/s)": 0.117385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 246.0625, "completions/min_length": 199.0, "epoch": 1.3126550868486353, "grad_norm": 0.1818128377199173, "kl": 0.0672607421875, "learning_rate": 3.1870622856711934e-06, "loss": 0.0006733760237693787, "memory(GiB)": 38.05, "reward": 0.2868620753288269, "reward_std": 0.03951801359653473, "rewards/VisualizationJSONCombinedORM/mean": 0.2868620753288269, "rewards/VisualizationJSONCombinedORM/std": 0.04672883450984955, "step": 1587, "train_speed(iter/s)": 0.117253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 267.6875, "completions/min_length": 227.0, "epoch": 1.3134822167080231, "grad_norm": 0.16835272312164307, "kl": 0.076416015625, "learning_rate": 3.180336681172691e-06, "loss": 0.000765431672334671, "memory(GiB)": 38.05, "reward": 0.6485527753829956, "reward_std": 0.07334728538990021, "rewards/VisualizationJSONCombinedORM/mean": 0.6485527753829956, "rewards/VisualizationJSONCombinedORM/std": 0.10090892016887665, "step": 1588, "train_speed(iter/s)": 0.117119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/mean_length": 232.4375, "completions/min_length": 205.0, "epoch": 1.3143093465674112, "grad_norm": 0.1524852216243744, "kl": 0.06793212890625, "learning_rate": 3.1736148695875986e-06, "loss": 0.0006784424185752869, "memory(GiB)": 38.05, "reward": 0.6734684705734253, "reward_std": 0.04794685170054436, "rewards/VisualizationJSONCombinedORM/mean": 0.6734684705734253, "rewards/VisualizationJSONCombinedORM/std": 0.11602270603179932, "step": 1589, "train_speed(iter/s)": 0.116942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 255.375, "completions/min_length": 196.0, "epoch": 1.315136476426799, "grad_norm": 0.18453744053840637, "kl": 0.1092529296875, "learning_rate": 3.1668968649268905e-06, "loss": 0.0010909456759691238, "memory(GiB)": 38.05, "reward": 0.6487931609153748, "reward_std": 0.07684458792209625, "rewards/VisualizationJSONCombinedORM/mean": 0.6487931609153748, "rewards/VisualizationJSONCombinedORM/std": 0.1351574808359146, "step": 1590, "train_speed(iter/s)": 0.116786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 247.9375, "completions/min_length": 191.0, "epoch": 1.315963606286187, "grad_norm": 0.18932247161865234, "kl": 0.106689453125, "learning_rate": 3.160182681193601e-06, "loss": 0.0010656192898750305, "memory(GiB)": 38.05, "reward": 0.5737528204917908, "reward_std": 0.08480185270309448, "rewards/VisualizationJSONCombinedORM/mean": 0.5737528204917908, "rewards/VisualizationJSONCombinedORM/std": 0.1271483600139618, "step": 1591, "train_speed(iter/s)": 0.116661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 228.3125, "completions/min_length": 200.0, "epoch": 1.3167907361455748, "grad_norm": 0.1782209873199463, "kl": 0.04803466796875, "learning_rate": 3.153472332382803e-06, "loss": 0.0004801210016012192, "memory(GiB)": 38.05, "reward": 0.4768499433994293, "reward_std": 0.04492100328207016, "rewards/VisualizationJSONCombinedORM/mean": 0.4768499433994293, "rewards/VisualizationJSONCombinedORM/std": 0.10541258007287979, "step": 1592, "train_speed(iter/s)": 0.116518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 253.9375, "completions/min_length": 210.0, "epoch": 1.3176178660049627, "grad_norm": 0.16578470170497894, "kl": 0.092041015625, "learning_rate": 3.146765832481572e-06, "loss": 0.0009183436632156372, "memory(GiB)": 38.05, "reward": 0.5531710386276245, "reward_std": 0.05929875373840332, "rewards/VisualizationJSONCombinedORM/mean": 0.5531710386276245, "rewards/VisualizationJSONCombinedORM/std": 0.21070443093776703, "step": 1593, "train_speed(iter/s)": 0.11636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 244.3125, "completions/min_length": 208.0, "epoch": 1.3184449958643507, "grad_norm": 0.17981502413749695, "kl": 0.111328125, "learning_rate": 3.1400631954689626e-06, "loss": 0.0011142492294311523, "memory(GiB)": 38.05, "reward": 0.6088607311248779, "reward_std": 0.10345827043056488, "rewards/VisualizationJSONCombinedORM/mean": 0.6088607311248779, "rewards/VisualizationJSONCombinedORM/std": 0.1451590359210968, "step": 1594, "train_speed(iter/s)": 0.116245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 250.125, "completions/min_length": 203.0, "epoch": 1.3192721257237388, "grad_norm": 0.1763368844985962, "kl": 0.071533203125, "learning_rate": 3.13336443531598e-06, "loss": 0.0007174164056777954, "memory(GiB)": 38.05, "reward": 0.5420613288879395, "reward_std": 0.0781409814953804, "rewards/VisualizationJSONCombinedORM/mean": 0.5420613288879395, "rewards/VisualizationJSONCombinedORM/std": 0.12872010469436646, "step": 1595, "train_speed(iter/s)": 0.116128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 256.8125, "completions/min_length": 219.0, "epoch": 1.3200992555831266, "grad_norm": 0.17821568250656128, "kl": 0.08349609375, "learning_rate": 3.1266695659855462e-06, "loss": 0.0008355379104614258, "memory(GiB)": 38.05, "reward": 0.41912415623664856, "reward_std": 0.046828094869852066, "rewards/VisualizationJSONCombinedORM/mean": 0.41912415623664856, "rewards/VisualizationJSONCombinedORM/std": 0.05780654773116112, "step": 1596, "train_speed(iter/s)": 0.115994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 236.5625, "completions/min_length": 206.0, "epoch": 1.3209263854425144, "grad_norm": 0.14168444275856018, "kl": 0.0474853515625, "learning_rate": 3.1199786014324717e-06, "loss": 0.0004748106002807617, "memory(GiB)": 38.05, "reward": 0.5904122591018677, "reward_std": 0.09171310067176819, "rewards/VisualizationJSONCombinedORM/mean": 0.5904122591018677, "rewards/VisualizationJSONCombinedORM/std": 0.11582520604133606, "step": 1597, "train_speed(iter/s)": 0.115837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 232.0625, "completions/min_length": 190.0, "epoch": 1.3217535153019024, "grad_norm": 0.17674201726913452, "kl": 0.07525634765625, "learning_rate": 3.1132915556034283e-06, "loss": 0.0007527098059654236, "memory(GiB)": 38.05, "reward": 0.558808445930481, "reward_std": 0.08486449718475342, "rewards/VisualizationJSONCombinedORM/mean": 0.558808445930481, "rewards/VisualizationJSONCombinedORM/std": 0.08628911525011063, "step": 1598, "train_speed(iter/s)": 0.115721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 236.125, "completions/min_length": 182.0, "epoch": 1.3225806451612903, "grad_norm": 0.1718893051147461, "kl": 0.0445556640625, "learning_rate": 3.106608442436924e-06, "loss": 0.00044565461575984955, "memory(GiB)": 38.05, "reward": 0.523095428943634, "reward_std": 0.08654507249593735, "rewards/VisualizationJSONCombinedORM/mean": 0.523095428943634, "rewards/VisualizationJSONCombinedORM/std": 0.2063715010881424, "step": 1599, "train_speed(iter/s)": 0.115606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 246.8125, "completions/min_length": 203.0, "epoch": 1.3234077750206783, "grad_norm": 0.16956909000873566, "kl": 0.0850830078125, "learning_rate": 3.099929275863266e-06, "loss": 0.0008518975228071213, "memory(GiB)": 38.05, "reward": 0.5207576155662537, "reward_std": 0.06616600602865219, "rewards/VisualizationJSONCombinedORM/mean": 0.5207576155662537, "rewards/VisualizationJSONCombinedORM/std": 0.06829629093408585, "step": 1600, "train_speed(iter/s)": 0.115457 }, { "epoch": 1.3234077750206783, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 288.7083333333333, "eval_completions/mean_length": 247.39583333333334, "eval_completions/min_length": 212.5, "eval_kl": 0.06764729817708333, "eval_loss": 0.0006847220356576145, "eval_reward": 0.4807128037015597, "eval_reward_std": 0.0733627094887197, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4807128037015597, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07336271228268743, "eval_runtime": 264.5502, "eval_samples_per_second": 0.091, "eval_steps_per_second": 0.011, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/mean_length": 243.375, "completions/min_length": 206.0, "epoch": 1.3242349048800661, "grad_norm": 0.16858816146850586, "kl": 0.0518798828125, "learning_rate": 3.093254069804532e-06, "loss": 0.000519387423992157, "memory(GiB)": 38.05, "reward": 0.5826805233955383, "reward_std": 0.057262614369392395, "rewards/VisualizationJSONCombinedORM/mean": 0.5826805233955383, "rewards/VisualizationJSONCombinedORM/std": 0.19152964651584625, "step": 1601, "train_speed(iter/s)": 0.113191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 251.75, "completions/min_length": 193.0, "epoch": 1.3250620347394542, "grad_norm": 0.1696806401014328, "kl": 0.08160400390625, "learning_rate": 3.0865828381745515e-06, "loss": 0.0008144080638885498, "memory(GiB)": 38.05, "reward": 0.5544485449790955, "reward_std": 0.07876691967248917, "rewards/VisualizationJSONCombinedORM/mean": 0.5544485449790955, "rewards/VisualizationJSONCombinedORM/std": 0.14482158422470093, "step": 1602, "train_speed(iter/s)": 0.113097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 234.0, "completions/min_length": 197.0, "epoch": 1.325889164598842, "grad_norm": 0.28394538164138794, "kl": 0.07000732421875, "learning_rate": 3.079915594878865e-06, "loss": 0.000700049102306366, "memory(GiB)": 38.05, "reward": 0.3838692307472229, "reward_std": 0.1086440160870552, "rewards/VisualizationJSONCombinedORM/mean": 0.3838692307472229, "rewards/VisualizationJSONCombinedORM/std": 0.1916549652814865, "step": 1603, "train_speed(iter/s)": 0.112968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 255.6875, "completions/min_length": 220.0, "epoch": 1.32671629445823, "grad_norm": 0.16803216934204102, "kl": 0.058837890625, "learning_rate": 3.0732523538146997e-06, "loss": 0.0005884356796741486, "memory(GiB)": 38.05, "reward": 0.667320728302002, "reward_std": 0.09704912453889847, "rewards/VisualizationJSONCombinedORM/mean": 0.667320728302002, "rewards/VisualizationJSONCombinedORM/std": 0.09883953630924225, "step": 1604, "train_speed(iter/s)": 0.112886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 255.8125, "completions/min_length": 220.0, "epoch": 1.3275434243176178, "grad_norm": 0.17072415351867676, "kl": 0.07012939453125, "learning_rate": 3.066593128870944e-06, "loss": 0.0007009971886873245, "memory(GiB)": 38.05, "reward": 0.5645149946212769, "reward_std": 0.11220531165599823, "rewards/VisualizationJSONCombinedORM/mean": 0.5645149946212769, "rewards/VisualizationJSONCombinedORM/std": 0.17601966857910156, "step": 1605, "train_speed(iter/s)": 0.112794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 244.3125, "completions/min_length": 182.0, "epoch": 1.3283705541770057, "grad_norm": 0.18695446848869324, "kl": 0.0411376953125, "learning_rate": 3.05993793392811e-06, "loss": 0.00041148625314235687, "memory(GiB)": 38.05, "reward": 0.5825743675231934, "reward_std": 0.07212749868631363, "rewards/VisualizationJSONCombinedORM/mean": 0.5825743675231934, "rewards/VisualizationJSONCombinedORM/std": 0.07251151651144028, "step": 1606, "train_speed(iter/s)": 0.112651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 228.375, "completions/min_length": 195.0, "epoch": 1.3291976840363937, "grad_norm": 0.21888506412506104, "kl": 0.0999755859375, "learning_rate": 3.0532867828583135e-06, "loss": 0.0009981319308280945, "memory(GiB)": 38.05, "reward": 0.7397104501724243, "reward_std": 0.07738160341978073, "rewards/VisualizationJSONCombinedORM/mean": 0.7397104501724243, "rewards/VisualizationJSONCombinedORM/std": 0.1251344233751297, "step": 1607, "train_speed(iter/s)": 0.112553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 257.375, "completions/min_length": 207.0, "epoch": 1.3300248138957818, "grad_norm": 0.1859138309955597, "kl": 0.06695556640625, "learning_rate": 3.0466396895252405e-06, "loss": 0.0006699636578559875, "memory(GiB)": 38.05, "reward": 0.23155413568019867, "reward_std": 0.034786783158779144, "rewards/VisualizationJSONCombinedORM/mean": 0.23155413568019867, "rewards/VisualizationJSONCombinedORM/std": 0.05568023398518562, "step": 1608, "train_speed(iter/s)": 0.112399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 241.5625, "completions/min_length": 188.0, "epoch": 1.3308519437551696, "grad_norm": 0.19667957723140717, "kl": 0.06158447265625, "learning_rate": 3.0399966677841187e-06, "loss": 0.0006146803498268127, "memory(GiB)": 38.05, "reward": 0.45342695713043213, "reward_std": 0.10415220260620117, "rewards/VisualizationJSONCombinedORM/mean": 0.45342695713043213, "rewards/VisualizationJSONCombinedORM/std": 0.14896820485591888, "step": 1609, "train_speed(iter/s)": 0.112258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 275.125, "completions/min_length": 248.0, "epoch": 1.3316790736145574, "grad_norm": 0.16312870383262634, "kl": 0.05419921875, "learning_rate": 3.0333577314816883e-06, "loss": 0.000541195273399353, "memory(GiB)": 38.05, "reward": 0.6683343648910522, "reward_std": 0.09635169804096222, "rewards/VisualizationJSONCombinedORM/mean": 0.6683343648910522, "rewards/VisualizationJSONCombinedORM/std": 0.09527557343244553, "step": 1610, "train_speed(iter/s)": 0.112117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 255.3125, "completions/min_length": 197.0, "epoch": 1.3325062034739454, "grad_norm": 0.17759323120117188, "kl": 0.068115234375, "learning_rate": 3.0267228944561726e-06, "loss": 0.0006810314953327179, "memory(GiB)": 38.05, "reward": 0.49327147006988525, "reward_std": 0.06549887359142303, "rewards/VisualizationJSONCombinedORM/mean": 0.49327147006988525, "rewards/VisualizationJSONCombinedORM/std": 0.0765685886144638, "step": 1611, "train_speed(iter/s)": 0.111995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/mean_length": 240.5, "completions/min_length": 190.0, "epoch": 1.3333333333333333, "grad_norm": 0.21441423892974854, "kl": 0.1058349609375, "learning_rate": 3.0200921705372555e-06, "loss": 0.0010598860681056976, "memory(GiB)": 38.05, "reward": 0.4945950210094452, "reward_std": 0.096310555934906, "rewards/VisualizationJSONCombinedORM/mean": 0.4945950210094452, "rewards/VisualizationJSONCombinedORM/std": 0.16527892649173737, "step": 1612, "train_speed(iter/s)": 0.111843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 247.0, "completions/min_length": 202.0, "epoch": 1.3341604631927213, "grad_norm": 0.19406519830226898, "kl": 0.06170654296875, "learning_rate": 3.013465573546044e-06, "loss": 0.0006170086562633514, "memory(GiB)": 38.05, "reward": 0.5006802082061768, "reward_std": 0.06944534927606583, "rewards/VisualizationJSONCombinedORM/mean": 0.5006802082061768, "rewards/VisualizationJSONCombinedORM/std": 0.19717548787593842, "step": 1613, "train_speed(iter/s)": 0.11173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 246.3125, "completions/min_length": 211.0, "epoch": 1.3349875930521091, "grad_norm": 0.18277384340763092, "kl": 0.076904296875, "learning_rate": 3.0068431172950387e-06, "loss": 0.0007692538201808929, "memory(GiB)": 38.05, "reward": 0.5774955749511719, "reward_std": 0.09509144723415375, "rewards/VisualizationJSONCombinedORM/mean": 0.5774955749511719, "rewards/VisualizationJSONCombinedORM/std": 0.20985423028469086, "step": 1614, "train_speed(iter/s)": 0.111612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 251.9375, "completions/min_length": 202.0, "epoch": 1.3358147229114972, "grad_norm": 0.1484091877937317, "kl": 0.07830810546875, "learning_rate": 3.0002248155881183e-06, "loss": 0.0007818657904863358, "memory(GiB)": 38.05, "reward": 0.45130181312561035, "reward_std": 0.0703185424208641, "rewards/VisualizationJSONCombinedORM/mean": 0.45130181312561035, "rewards/VisualizationJSONCombinedORM/std": 0.24242304265499115, "step": 1615, "train_speed(iter/s)": 0.111459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 245.3125, "completions/min_length": 201.0, "epoch": 1.336641852770885, "grad_norm": 0.19613327085971832, "kl": 0.0509033203125, "learning_rate": 2.9936106822204937e-06, "loss": 0.0005086995661258698, "memory(GiB)": 38.05, "reward": 0.6360093951225281, "reward_std": 0.07094554603099823, "rewards/VisualizationJSONCombinedORM/mean": 0.6360093951225281, "rewards/VisualizationJSONCombinedORM/std": 0.13990932703018188, "step": 1616, "train_speed(iter/s)": 0.111332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/mean_length": 250.6875, "completions/min_length": 220.0, "epoch": 1.337468982630273, "grad_norm": 0.144490048289299, "kl": 0.0867919921875, "learning_rate": 2.987000730978696e-06, "loss": 0.0008668303489685059, "memory(GiB)": 38.05, "reward": 0.5345752835273743, "reward_std": 0.05866535007953644, "rewards/VisualizationJSONCombinedORM/mean": 0.5345752835273743, "rewards/VisualizationJSONCombinedORM/std": 0.1935245394706726, "step": 1617, "train_speed(iter/s)": 0.111178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 237.125, "completions/min_length": 211.0, "epoch": 1.3382961124896608, "grad_norm": 0.1287996768951416, "kl": 0.04339599609375, "learning_rate": 2.980394975640526e-06, "loss": 0.00043389201164245605, "memory(GiB)": 38.05, "reward": 0.4286675453186035, "reward_std": 0.0356551818549633, "rewards/VisualizationJSONCombinedORM/mean": 0.4286675453186035, "rewards/VisualizationJSONCombinedORM/std": 0.06002507358789444, "step": 1618, "train_speed(iter/s)": 0.111046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 255.0625, "completions/min_length": 213.0, "epoch": 1.3391232423490487, "grad_norm": 0.1693802773952484, "kl": 0.056884765625, "learning_rate": 2.9737934299750514e-06, "loss": 0.0005688369274139404, "memory(GiB)": 38.05, "reward": 0.4985997676849365, "reward_std": 0.06629883497953415, "rewards/VisualizationJSONCombinedORM/mean": 0.4985997676849365, "rewards/VisualizationJSONCombinedORM/std": 0.21419209241867065, "step": 1619, "train_speed(iter/s)": 0.11093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 249.375, "completions/min_length": 199.0, "epoch": 1.3399503722084367, "grad_norm": 0.21165187656879425, "kl": 0.05096435546875, "learning_rate": 2.9671961077425583e-06, "loss": 0.0005090832710266113, "memory(GiB)": 38.05, "reward": 0.5376046895980835, "reward_std": 0.10675196349620819, "rewards/VisualizationJSONCombinedORM/mean": 0.5376046895980835, "rewards/VisualizationJSONCombinedORM/std": 0.1332472860813141, "step": 1620, "train_speed(iter/s)": 0.11079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 260.8125, "completions/min_length": 220.0, "epoch": 1.3407775020678248, "grad_norm": 0.15096113085746765, "kl": 0.05010986328125, "learning_rate": 2.9606030226945325e-06, "loss": 0.0005007348954677582, "memory(GiB)": 38.05, "reward": 0.5511350631713867, "reward_std": 0.0631609559059143, "rewards/VisualizationJSONCombinedORM/mean": 0.5511350631713867, "rewards/VisualizationJSONCombinedORM/std": 0.182336688041687, "step": 1621, "train_speed(iter/s)": 0.110678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 243.75, "completions/min_length": 190.0, "epoch": 1.3416046319272126, "grad_norm": 0.18482746183872223, "kl": 0.06298828125, "learning_rate": 2.954014188573626e-06, "loss": 0.0006307819858193398, "memory(GiB)": 38.05, "reward": 0.5162565112113953, "reward_std": 0.07856222987174988, "rewards/VisualizationJSONCombinedORM/mean": 0.5162565112113953, "rewards/VisualizationJSONCombinedORM/std": 0.20211541652679443, "step": 1622, "train_speed(iter/s)": 0.110573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/mean_length": 227.1875, "completions/min_length": 196.0, "epoch": 1.3424317617866004, "grad_norm": 0.1817978024482727, "kl": 0.0574951171875, "learning_rate": 2.94742961911363e-06, "loss": 0.0005745626986026764, "memory(GiB)": 38.05, "reward": 0.5893456935882568, "reward_std": 0.09724428504705429, "rewards/VisualizationJSONCombinedORM/mean": 0.5893456935882568, "rewards/VisualizationJSONCombinedORM/std": 0.11577918380498886, "step": 1623, "train_speed(iter/s)": 0.110482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 248.5, "completions/min_length": 188.0, "epoch": 1.3432588916459884, "grad_norm": 0.20257000625133514, "kl": 0.06219482421875, "learning_rate": 2.940849328039447e-06, "loss": 0.0006224289536476135, "memory(GiB)": 38.05, "reward": 0.4900771677494049, "reward_std": 0.08783215284347534, "rewards/VisualizationJSONCombinedORM/mean": 0.4900771677494049, "rewards/VisualizationJSONCombinedORM/std": 0.13535946607589722, "step": 1624, "train_speed(iter/s)": 0.11034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/mean_length": 249.8125, "completions/min_length": 217.0, "epoch": 1.3440860215053765, "grad_norm": 0.16795477271080017, "kl": 0.04656982421875, "learning_rate": 2.9342733290670623e-06, "loss": 0.0004660412669181824, "memory(GiB)": 38.05, "reward": 0.4912702739238739, "reward_std": 0.07545971125364304, "rewards/VisualizationJSONCombinedORM/mean": 0.4912702739238739, "rewards/VisualizationJSONCombinedORM/std": 0.1003313809633255, "step": 1625, "train_speed(iter/s)": 0.110237 }, { "epoch": 1.3440860215053765, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 304.5416666666667, "eval_completions/mean_length": 251.97916666666666, "eval_completions/min_length": 217.70833333333334, "eval_kl": 0.06476847330729167, "eval_loss": 0.0006505958735942841, "eval_reward": 0.4684463695933421, "eval_reward_std": 0.06974403878363471, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4684463695933421, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06974404029703389, "eval_runtime": 274.1486, "eval_samples_per_second": 0.088, "eval_steps_per_second": 0.011, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/mean_length": 236.6875, "completions/min_length": 207.0, "epoch": 1.3449131513647643, "grad_norm": 0.18332546949386597, "kl": 0.0797119140625, "learning_rate": 2.9277016359035165e-06, "loss": 0.0007972642779350281, "memory(GiB)": 38.05, "reward": 0.48584651947021484, "reward_std": 0.07532595843076706, "rewards/VisualizationJSONCombinedORM/mean": 0.48584651947021484, "rewards/VisualizationJSONCombinedORM/std": 0.11934435367584229, "step": 1626, "train_speed(iter/s)": 0.108109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/mean_length": 241.875, "completions/min_length": 196.0, "epoch": 1.3457402812241521, "grad_norm": 0.22097454965114594, "kl": 0.0662841796875, "learning_rate": 2.9211342622468676e-06, "loss": 0.000662313774228096, "memory(GiB)": 38.05, "reward": 0.45106473565101624, "reward_std": 0.0922563225030899, "rewards/VisualizationJSONCombinedORM/mean": 0.45106473565101624, "rewards/VisualizationJSONCombinedORM/std": 0.16316334903240204, "step": 1627, "train_speed(iter/s)": 0.107998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 240.0625, "completions/min_length": 188.0, "epoch": 1.3465674110835402, "grad_norm": 0.17679743468761444, "kl": 0.033843994140625, "learning_rate": 2.914571221786179e-06, "loss": 0.0003379993140697479, "memory(GiB)": 38.05, "reward": 0.30114927887916565, "reward_std": 0.043735940009355545, "rewards/VisualizationJSONCombinedORM/mean": 0.30114927887916565, "rewards/VisualizationJSONCombinedORM/std": 0.06830105930566788, "step": 1628, "train_speed(iter/s)": 0.107927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 266.25, "completions/min_length": 209.0, "epoch": 1.347394540942928, "grad_norm": 0.19164133071899414, "kl": 0.08740234375, "learning_rate": 2.9080125282014766e-06, "loss": 0.0008742660284042358, "memory(GiB)": 38.05, "reward": 0.6222869753837585, "reward_std": 0.1176290512084961, "rewards/VisualizationJSONCombinedORM/mean": 0.6222869753837585, "rewards/VisualizationJSONCombinedORM/std": 0.11517507582902908, "step": 1629, "train_speed(iter/s)": 0.107824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 245.875, "completions/min_length": 211.0, "epoch": 1.348221670802316, "grad_norm": 0.16118085384368896, "kl": 0.0704345703125, "learning_rate": 2.9014581951637295e-06, "loss": 0.0007041655480861664, "memory(GiB)": 38.05, "reward": 0.580898106098175, "reward_std": 0.0940248966217041, "rewards/VisualizationJSONCombinedORM/mean": 0.580898106098175, "rewards/VisualizationJSONCombinedORM/std": 0.09304104000329971, "step": 1630, "train_speed(iter/s)": 0.107741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 241.5625, "completions/min_length": 189.0, "epoch": 1.3490488006617039, "grad_norm": 0.20354829728603363, "kl": 0.0670166015625, "learning_rate": 2.894908236334811e-06, "loss": 0.0006694793701171875, "memory(GiB)": 38.05, "reward": 0.6346975564956665, "reward_std": 0.11033710837364197, "rewards/VisualizationJSONCombinedORM/mean": 0.6346975564956665, "rewards/VisualizationJSONCombinedORM/std": 0.12187713384628296, "step": 1631, "train_speed(iter/s)": 0.107647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 255.5, "completions/min_length": 206.0, "epoch": 1.349875930521092, "grad_norm": 0.1847873032093048, "kl": 0.04547119140625, "learning_rate": 2.8883626653674867e-06, "loss": 0.0004544109106063843, "memory(GiB)": 38.05, "reward": 0.6133891344070435, "reward_std": 0.0834241509437561, "rewards/VisualizationJSONCombinedORM/mean": 0.6133891344070435, "rewards/VisualizationJSONCombinedORM/std": 0.11562386155128479, "step": 1632, "train_speed(iter/s)": 0.107582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/mean_length": 241.625, "completions/min_length": 193.0, "epoch": 1.3507030603804797, "grad_norm": 0.19288033246994019, "kl": 0.05706787109375, "learning_rate": 2.881821495905366e-06, "loss": 0.0005695521831512451, "memory(GiB)": 38.05, "reward": 0.6679561138153076, "reward_std": 0.07502289116382599, "rewards/VisualizationJSONCombinedORM/mean": 0.6679561138153076, "rewards/VisualizationJSONCombinedORM/std": 0.0774046927690506, "step": 1633, "train_speed(iter/s)": 0.107508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 266.0, "completions/min_length": 210.0, "epoch": 1.3515301902398678, "grad_norm": 0.1681412160396576, "kl": 0.04913330078125, "learning_rate": 2.8752847415828923e-06, "loss": 0.0004912838339805603, "memory(GiB)": 38.05, "reward": 0.5686613321304321, "reward_std": 0.08518218994140625, "rewards/VisualizationJSONCombinedORM/mean": 0.5686613321304321, "rewards/VisualizationJSONCombinedORM/std": 0.10238344967365265, "step": 1634, "train_speed(iter/s)": 0.107399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/mean_length": 236.5625, "completions/min_length": 194.0, "epoch": 1.3523573200992556, "grad_norm": 0.16643063724040985, "kl": 0.084228515625, "learning_rate": 2.868752416025297e-06, "loss": 0.0008428730070590973, "memory(GiB)": 38.05, "reward": 0.5996022820472717, "reward_std": 0.0955120399594307, "rewards/VisualizationJSONCombinedORM/mean": 0.5996022820472717, "rewards/VisualizationJSONCombinedORM/std": 0.13974671065807343, "step": 1635, "train_speed(iter/s)": 0.107328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 249.9375, "completions/min_length": 199.0, "epoch": 1.3531844499586434, "grad_norm": 0.2790754437446594, "kl": 0.07037353515625, "learning_rate": 2.862224532848591e-06, "loss": 0.0007029995322227478, "memory(GiB)": 38.05, "reward": 0.5625907182693481, "reward_std": 0.04695357754826546, "rewards/VisualizationJSONCombinedORM/mean": 0.5625907182693481, "rewards/VisualizationJSONCombinedORM/std": 0.2046327143907547, "step": 1636, "train_speed(iter/s)": 0.107197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 235.875, "completions/min_length": 205.0, "epoch": 1.3540115798180314, "grad_norm": 0.18541975319385529, "kl": 0.053466796875, "learning_rate": 2.855701105659515e-06, "loss": 0.0005343370139598846, "memory(GiB)": 38.05, "reward": 0.418915718793869, "reward_std": 0.04572390019893646, "rewards/VisualizationJSONCombinedORM/mean": 0.418915718793869, "rewards/VisualizationJSONCombinedORM/std": 0.22867552936077118, "step": 1637, "train_speed(iter/s)": 0.107099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 254.0, "completions/min_length": 199.0, "epoch": 1.3548387096774195, "grad_norm": 0.17650510370731354, "kl": 0.0389404296875, "learning_rate": 2.8491821480555283e-06, "loss": 0.00038907676935195923, "memory(GiB)": 38.05, "reward": 0.5103219151496887, "reward_std": 0.06563379615545273, "rewards/VisualizationJSONCombinedORM/mean": 0.5103219151496887, "rewards/VisualizationJSONCombinedORM/std": 0.26907649636268616, "step": 1638, "train_speed(iter/s)": 0.106988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 266.875, "completions/min_length": 218.0, "epoch": 1.3556658395368073, "grad_norm": 0.18470436334609985, "kl": 0.06317138671875, "learning_rate": 2.842667673624774e-06, "loss": 0.0006319154053926468, "memory(GiB)": 38.05, "reward": 0.5421907901763916, "reward_std": 0.10331504791975021, "rewards/VisualizationJSONCombinedORM/mean": 0.5421907901763916, "rewards/VisualizationJSONCombinedORM/std": 0.11236891150474548, "step": 1639, "train_speed(iter/s)": 0.106865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 253.75, "completions/min_length": 192.0, "epoch": 1.3564929693961951, "grad_norm": 0.19389064610004425, "kl": 0.04559326171875, "learning_rate": 2.836157695946047e-06, "loss": 0.0004557594656944275, "memory(GiB)": 38.05, "reward": 0.5693649649620056, "reward_std": 0.07719851285219193, "rewards/VisualizationJSONCombinedORM/mean": 0.5693649649620056, "rewards/VisualizationJSONCombinedORM/std": 0.22278805077075958, "step": 1640, "train_speed(iter/s)": 0.106773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/mean_length": 243.1875, "completions/min_length": 199.0, "epoch": 1.3573200992555832, "grad_norm": 0.15977099537849426, "kl": 0.0447998046875, "learning_rate": 2.8296522285887672e-06, "loss": 0.0004469510167837143, "memory(GiB)": 38.05, "reward": 0.5650753974914551, "reward_std": 0.10756848752498627, "rewards/VisualizationJSONCombinedORM/mean": 0.5650753974914551, "rewards/VisualizationJSONCombinedORM/std": 0.12089966982603073, "step": 1641, "train_speed(iter/s)": 0.106659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 248.1875, "completions/min_length": 209.0, "epoch": 1.358147229114971, "grad_norm": 0.15791389346122742, "kl": 0.0328369140625, "learning_rate": 2.8231512851129596e-06, "loss": 0.00032793357968330383, "memory(GiB)": 38.05, "reward": 0.5030616521835327, "reward_std": 0.09372857213020325, "rewards/VisualizationJSONCombinedORM/mean": 0.5030616521835327, "rewards/VisualizationJSONCombinedORM/std": 0.17255885899066925, "step": 1642, "train_speed(iter/s)": 0.106569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 279.5625, "completions/min_length": 209.0, "epoch": 1.358974358974359, "grad_norm": 0.16944730281829834, "kl": 0.0736083984375, "learning_rate": 2.8166548790692182e-06, "loss": 0.0007369667291641235, "memory(GiB)": 38.05, "reward": 0.5454185009002686, "reward_std": 0.08427213132381439, "rewards/VisualizationJSONCombinedORM/mean": 0.5454185009002686, "rewards/VisualizationJSONCombinedORM/std": 0.1526363044977188, "step": 1643, "train_speed(iter/s)": 0.106467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/mean_length": 228.125, "completions/min_length": 194.0, "epoch": 1.3598014888337469, "grad_norm": 0.16947151720523834, "kl": 0.03680419921875, "learning_rate": 2.810163023998673e-06, "loss": 0.0003680214285850525, "memory(GiB)": 38.05, "reward": 0.6564892530441284, "reward_std": 0.10389302670955658, "rewards/VisualizationJSONCombinedORM/mean": 0.6564892530441284, "rewards/VisualizationJSONCombinedORM/std": 0.1307172328233719, "step": 1644, "train_speed(iter/s)": 0.106383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 256.375, "completions/min_length": 216.0, "epoch": 1.360628618693135, "grad_norm": 0.23638658225536346, "kl": 0.0657958984375, "learning_rate": 2.803675733432977e-06, "loss": 0.0006594285368919373, "memory(GiB)": 38.05, "reward": 0.46892431378364563, "reward_std": 0.07188291847705841, "rewards/VisualizationJSONCombinedORM/mean": 0.46892431378364563, "rewards/VisualizationJSONCombinedORM/std": 0.16810591518878937, "step": 1645, "train_speed(iter/s)": 0.10626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 279.4375, "completions/min_length": 225.0, "epoch": 1.3614557485525227, "grad_norm": 0.16806569695472717, "kl": 0.0419921875, "learning_rate": 2.79719302089426e-06, "loss": 0.00041968002915382385, "memory(GiB)": 38.05, "reward": 0.46612560749053955, "reward_std": 0.04524446278810501, "rewards/VisualizationJSONCombinedORM/mean": 0.46612560749053955, "rewards/VisualizationJSONCombinedORM/std": 0.07710256427526474, "step": 1646, "train_speed(iter/s)": 0.10617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 239.1875, "completions/min_length": 189.0, "epoch": 1.3622828784119108, "grad_norm": 0.16897349059581757, "kl": 0.03173828125, "learning_rate": 2.790714899895117e-06, "loss": 0.00031658634543418884, "memory(GiB)": 38.05, "reward": 0.6288399696350098, "reward_std": 0.06481404602527618, "rewards/VisualizationJSONCombinedORM/mean": 0.6288399696350098, "rewards/VisualizationJSONCombinedORM/std": 0.09820538014173508, "step": 1647, "train_speed(iter/s)": 0.106079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 250.875, "completions/min_length": 215.0, "epoch": 1.3631100082712986, "grad_norm": 0.18363742530345917, "kl": 0.0877685546875, "learning_rate": 2.784241383938566e-06, "loss": 0.0008776187896728516, "memory(GiB)": 38.05, "reward": 0.6817740797996521, "reward_std": 0.10755383968353271, "rewards/VisualizationJSONCombinedORM/mean": 0.6817740797996521, "rewards/VisualizationJSONCombinedORM/std": 0.14068768918514252, "step": 1648, "train_speed(iter/s)": 0.10599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 264.3125, "completions/min_length": 195.0, "epoch": 1.3639371381306864, "grad_norm": 0.2074039876461029, "kl": 0.054931640625, "learning_rate": 2.7777724865180333e-06, "loss": 0.0005486160516738892, "memory(GiB)": 38.05, "reward": 0.4883231520652771, "reward_std": 0.09253043681383133, "rewards/VisualizationJSONCombinedORM/mean": 0.4883231520652771, "rewards/VisualizationJSONCombinedORM/std": 0.10677295178174973, "step": 1649, "train_speed(iter/s)": 0.105852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 225.75, "completions/min_length": 188.0, "epoch": 1.3647642679900744, "grad_norm": 0.1788516640663147, "kl": 0.03826904296875, "learning_rate": 2.771308221117309e-06, "loss": 0.0003830716013908386, "memory(GiB)": 38.05, "reward": 0.5970059633255005, "reward_std": 0.04914833605289459, "rewards/VisualizationJSONCombinedORM/mean": 0.5970059633255005, "rewards/VisualizationJSONCombinedORM/std": 0.12482953816652298, "step": 1650, "train_speed(iter/s)": 0.105783 }, { "epoch": 1.3647642679900744, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 295.75, "eval_completions/mean_length": 254.01041666666666, "eval_completions/min_length": 219.29166666666666, "eval_kl": 0.0541839599609375, "eval_loss": 0.000542718917131424, "eval_reward": 0.44543549480537575, "eval_reward_std": 0.06325963038640718, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44543549480537575, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06325963523704559, "eval_runtime": 269.3895, "eval_samples_per_second": 0.089, "eval_steps_per_second": 0.011, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 263.25, "completions/min_length": 215.0, "epoch": 1.3655913978494625, "grad_norm": 0.18056021630764008, "kl": 0.028167724609375, "learning_rate": 2.764848601210539e-06, "loss": 0.0002814180916175246, "memory(GiB)": 38.05, "reward": 0.5232436656951904, "reward_std": 0.024399075657129288, "rewards/VisualizationJSONCombinedORM/mean": 0.5232436656951904, "rewards/VisualizationJSONCombinedORM/std": 0.2798193693161011, "step": 1651, "train_speed(iter/s)": 0.103893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 258.1875, "completions/min_length": 194.0, "epoch": 1.3664185277088503, "grad_norm": 0.21933940052986145, "kl": 0.04925537109375, "learning_rate": 2.7583936402621753e-06, "loss": 0.0004926994442939758, "memory(GiB)": 38.05, "reward": 0.46676671504974365, "reward_std": 0.057226624339818954, "rewards/VisualizationJSONCombinedORM/mean": 0.46676671504974365, "rewards/VisualizationJSONCombinedORM/std": 0.2013067603111267, "step": 1652, "train_speed(iter/s)": 0.103789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 245.625, "completions/min_length": 195.0, "epoch": 1.3672456575682381, "grad_norm": 0.17059041559696198, "kl": 0.06451416015625, "learning_rate": 2.7519433517269665e-06, "loss": 0.0006454363465309143, "memory(GiB)": 38.05, "reward": 0.4610966444015503, "reward_std": 0.05369490012526512, "rewards/VisualizationJSONCombinedORM/mean": 0.4610966444015503, "rewards/VisualizationJSONCombinedORM/std": 0.23406873643398285, "step": 1653, "train_speed(iter/s)": 0.103711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 243.25, "completions/min_length": 202.0, "epoch": 1.3680727874276262, "grad_norm": 0.18453145027160645, "kl": 0.06640625, "learning_rate": 2.745497749049922e-06, "loss": 0.0006636828184127808, "memory(GiB)": 38.05, "reward": 0.462049663066864, "reward_std": 0.07247306406497955, "rewards/VisualizationJSONCombinedORM/mean": 0.462049663066864, "rewards/VisualizationJSONCombinedORM/std": 0.07682740688323975, "step": 1654, "train_speed(iter/s)": 0.103592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 257.6875, "completions/min_length": 205.0, "epoch": 1.368899917287014, "grad_norm": 0.24535539746284485, "kl": 0.06903076171875, "learning_rate": 2.7390568456662768e-06, "loss": 0.0006897673010826111, "memory(GiB)": 38.05, "reward": 0.4293286204338074, "reward_std": 0.08107756078243256, "rewards/VisualizationJSONCombinedORM/mean": 0.4293286204338074, "rewards/VisualizationJSONCombinedORM/std": 0.1448734551668167, "step": 1655, "train_speed(iter/s)": 0.103498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/mean_length": 235.0625, "completions/min_length": 209.0, "epoch": 1.369727047146402, "grad_norm": 0.17133617401123047, "kl": 0.04473876953125, "learning_rate": 2.7326206550014793e-06, "loss": 0.00044690072536468506, "memory(GiB)": 38.05, "reward": 0.554588794708252, "reward_std": 0.06639190018177032, "rewards/VisualizationJSONCombinedORM/mean": 0.554588794708252, "rewards/VisualizationJSONCombinedORM/std": 0.1810472160577774, "step": 1656, "train_speed(iter/s)": 0.103385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 249.5, "completions/min_length": 211.0, "epoch": 1.3705541770057899, "grad_norm": 0.2031472623348236, "kl": 0.06988525390625, "learning_rate": 2.726189190471148e-06, "loss": 0.0006981659680604935, "memory(GiB)": 38.05, "reward": 0.5793418884277344, "reward_std": 0.07132059335708618, "rewards/VisualizationJSONCombinedORM/mean": 0.5793418884277344, "rewards/VisualizationJSONCombinedORM/std": 0.10376588255167007, "step": 1657, "train_speed(iter/s)": 0.10329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/mean_length": 241.5, "completions/min_length": 191.0, "epoch": 1.371381306865178, "grad_norm": 0.21763072907924652, "kl": 0.05963134765625, "learning_rate": 2.719762465481055e-06, "loss": 0.0005970709025859833, "memory(GiB)": 38.05, "reward": 0.6023529767990112, "reward_std": 0.12925156950950623, "rewards/VisualizationJSONCombinedORM/mean": 0.6023529767990112, "rewards/VisualizationJSONCombinedORM/std": 0.18495365977287292, "step": 1658, "train_speed(iter/s)": 0.103195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 254.25, "completions/min_length": 196.0, "epoch": 1.3722084367245657, "grad_norm": 0.2379138171672821, "kl": 0.0643310546875, "learning_rate": 2.713340493427089e-06, "loss": 0.0006431713700294495, "memory(GiB)": 38.05, "reward": 0.4343317747116089, "reward_std": 0.08183589577674866, "rewards/VisualizationJSONCombinedORM/mean": 0.4343317747116089, "rewards/VisualizationJSONCombinedORM/std": 0.2512662410736084, "step": 1659, "train_speed(iter/s)": 0.1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 257.8125, "completions/min_length": 209.0, "epoch": 1.3730355665839538, "grad_norm": 0.19901372492313385, "kl": 0.06475830078125, "learning_rate": 2.7069232876952368e-06, "loss": 0.0006475094705820084, "memory(GiB)": 38.05, "reward": 0.35254210233688354, "reward_std": 0.06687179952859879, "rewards/VisualizationJSONCombinedORM/mean": 0.35254210233688354, "rewards/VisualizationJSONCombinedORM/std": 0.14505010843276978, "step": 1660, "train_speed(iter/s)": 0.103036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 271.125, "completions/min_length": 220.0, "epoch": 1.3738626964433416, "grad_norm": 0.16792383790016174, "kl": 0.1444091796875, "learning_rate": 2.700510861661544e-06, "loss": 0.0014466457068920135, "memory(GiB)": 38.05, "reward": 0.2555428445339203, "reward_std": 0.02978304587304592, "rewards/VisualizationJSONCombinedORM/mean": 0.2555428445339203, "rewards/VisualizationJSONCombinedORM/std": 0.060866039246320724, "step": 1661, "train_speed(iter/s)": 0.102958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 245.8125, "completions/min_length": 206.0, "epoch": 1.3746898263027294, "grad_norm": 0.16873447597026825, "kl": 0.04534912109375, "learning_rate": 2.694103228692099e-06, "loss": 0.00045287609100341797, "memory(GiB)": 38.05, "reward": 0.5109449028968811, "reward_std": 0.09523842483758926, "rewards/VisualizationJSONCombinedORM/mean": 0.5109449028968811, "rewards/VisualizationJSONCombinedORM/std": 0.14796082675457, "step": 1662, "train_speed(iter/s)": 0.102843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/mean_length": 247.0, "completions/min_length": 203.0, "epoch": 1.3755169561621174, "grad_norm": 0.17437437176704407, "kl": 0.05291748046875, "learning_rate": 2.6877004021429975e-06, "loss": 0.000529952347278595, "memory(GiB)": 38.05, "reward": 0.3552190661430359, "reward_std": 0.06036804988980293, "rewards/VisualizationJSONCombinedORM/mean": 0.3552190661430359, "rewards/VisualizationJSONCombinedORM/std": 0.08655516058206558, "step": 1663, "train_speed(iter/s)": 0.102797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 250.375, "completions/min_length": 196.0, "epoch": 1.3763440860215055, "grad_norm": 0.16859330236911774, "kl": 0.0517578125, "learning_rate": 2.6813023953603168e-06, "loss": 0.0005160942673683167, "memory(GiB)": 38.05, "reward": 0.5358294248580933, "reward_std": 0.11993975937366486, "rewards/VisualizationJSONCombinedORM/mean": 0.5358294248580933, "rewards/VisualizationJSONCombinedORM/std": 0.20954474806785583, "step": 1664, "train_speed(iter/s)": 0.102707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 251.125, "completions/min_length": 213.0, "epoch": 1.3771712158808933, "grad_norm": 0.14910386502742767, "kl": 0.06201171875, "learning_rate": 2.6749092216800848e-06, "loss": 0.0006188824772834778, "memory(GiB)": 38.05, "reward": 0.6642983555793762, "reward_std": 0.08829553425312042, "rewards/VisualizationJSONCombinedORM/mean": 0.6642983555793762, "rewards/VisualizationJSONCombinedORM/std": 0.11355067044496536, "step": 1665, "train_speed(iter/s)": 0.102616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/mean_length": 246.875, "completions/min_length": 205.0, "epoch": 1.3779983457402811, "grad_norm": 0.21489188075065613, "kl": 0.05865478515625, "learning_rate": 2.668520894428259e-06, "loss": 0.0005867704749107361, "memory(GiB)": 38.05, "reward": 0.7599257230758667, "reward_std": 0.0830940455198288, "rewards/VisualizationJSONCombinedORM/mean": 0.7599257230758667, "rewards/VisualizationJSONCombinedORM/std": 0.10355664789676666, "step": 1666, "train_speed(iter/s)": 0.102539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 233.5, "completions/min_length": 196.0, "epoch": 1.3788254755996692, "grad_norm": 0.2114320546388626, "kl": 0.0487060546875, "learning_rate": 2.662137426920698e-06, "loss": 0.00048675015568733215, "memory(GiB)": 38.05, "reward": 0.5919928550720215, "reward_std": 0.08718175441026688, "rewards/VisualizationJSONCombinedORM/mean": 0.5919928550720215, "rewards/VisualizationJSONCombinedORM/std": 0.09723390638828278, "step": 1667, "train_speed(iter/s)": 0.102467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 234.0625, "completions/min_length": 206.0, "epoch": 1.379652605459057, "grad_norm": 0.17279323935508728, "kl": 0.0367431640625, "learning_rate": 2.6557588324631223e-06, "loss": 0.0003675529733300209, "memory(GiB)": 38.05, "reward": 0.4733373820781708, "reward_std": 0.05544648319482803, "rewards/VisualizationJSONCombinedORM/mean": 0.4733373820781708, "rewards/VisualizationJSONCombinedORM/std": 0.06973434239625931, "step": 1668, "train_speed(iter/s)": 0.102397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 253.1875, "completions/min_length": 205.0, "epoch": 1.380479735318445, "grad_norm": 0.1618042290210724, "kl": 0.0704345703125, "learning_rate": 2.649385124351099e-06, "loss": 0.0007054135203361511, "memory(GiB)": 38.05, "reward": 0.4977903366088867, "reward_std": 0.0745631754398346, "rewards/VisualizationJSONCombinedORM/mean": 0.4977903366088867, "rewards/VisualizationJSONCombinedORM/std": 0.1275864690542221, "step": 1669, "train_speed(iter/s)": 0.10229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 293.1875, "completions/min_length": 232.0, "epoch": 1.3813068651778329, "grad_norm": 0.18062376976013184, "kl": 0.07757568359375, "learning_rate": 2.6430163158700116e-06, "loss": 0.0007761754095554352, "memory(GiB)": 38.05, "reward": 0.4301992654800415, "reward_std": 0.09641636162996292, "rewards/VisualizationJSONCombinedORM/mean": 0.4301992654800415, "rewards/VisualizationJSONCombinedORM/std": 0.09844200313091278, "step": 1670, "train_speed(iter/s)": 0.102191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 265.75, "completions/min_length": 221.0, "epoch": 1.382133995037221, "grad_norm": 0.20000387728214264, "kl": 0.0738525390625, "learning_rate": 2.636652420295033e-06, "loss": 0.0007395893335342407, "memory(GiB)": 38.05, "reward": 0.4511653780937195, "reward_std": 0.07382825762033463, "rewards/VisualizationJSONCombinedORM/mean": 0.4511653780937195, "rewards/VisualizationJSONCombinedORM/std": 0.08905086666345596, "step": 1671, "train_speed(iter/s)": 0.102105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 243.1875, "completions/min_length": 199.0, "epoch": 1.3829611248966087, "grad_norm": 0.191661074757576, "kl": 0.0950927734375, "learning_rate": 2.630293450891086e-06, "loss": 0.0009512454271316528, "memory(GiB)": 38.05, "reward": 0.6448999643325806, "reward_std": 0.11888882517814636, "rewards/VisualizationJSONCombinedORM/mean": 0.6448999643325806, "rewards/VisualizationJSONCombinedORM/std": 0.13810871541500092, "step": 1672, "train_speed(iter/s)": 0.102025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 247.0625, "completions/min_length": 193.0, "epoch": 1.3837882547559968, "grad_norm": 0.1876605749130249, "kl": 0.0833740234375, "learning_rate": 2.623939420912838e-06, "loss": 0.0008333176374435425, "memory(GiB)": 38.05, "reward": 0.41146790981292725, "reward_std": 0.029172461479902267, "rewards/VisualizationJSONCombinedORM/mean": 0.41146790981292725, "rewards/VisualizationJSONCombinedORM/std": 0.1076069325208664, "step": 1673, "train_speed(iter/s)": 0.101935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 247.125, "completions/min_length": 192.0, "epoch": 1.3846153846153846, "grad_norm": 0.17145974934101105, "kl": 0.05584716796875, "learning_rate": 2.617590343604648e-06, "loss": 0.0005582496523857117, "memory(GiB)": 38.05, "reward": 0.39734503626823425, "reward_std": 0.034897834062576294, "rewards/VisualizationJSONCombinedORM/mean": 0.39734503626823425, "rewards/VisualizationJSONCombinedORM/std": 0.03683364763855934, "step": 1674, "train_speed(iter/s)": 0.101858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/mean_length": 238.5625, "completions/min_length": 207.0, "epoch": 1.3854425144747724, "grad_norm": 0.19625656306743622, "kl": 0.091064453125, "learning_rate": 2.611246232200562e-06, "loss": 0.000909675844013691, "memory(GiB)": 38.05, "reward": 0.5664513111114502, "reward_std": 0.05174646154046059, "rewards/VisualizationJSONCombinedORM/mean": 0.5664513111114502, "rewards/VisualizationJSONCombinedORM/std": 0.2012309432029724, "step": 1675, "train_speed(iter/s)": 0.101784 }, { "epoch": 1.3854425144747724, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 288.125, "eval_completions/mean_length": 246.71354166666666, "eval_completions/min_length": 210.95833333333334, "eval_kl": 0.06609090169270833, "eval_loss": 0.0006635785102844238, "eval_reward": 0.4900277455647786, "eval_reward_std": 0.06490277561048667, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4900277455647786, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06490277801640332, "eval_runtime": 264.1674, "eval_samples_per_second": 0.091, "eval_steps_per_second": 0.011, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 271.6875, "completions/min_length": 189.0, "epoch": 1.3862696443341604, "grad_norm": 0.17968693375587463, "kl": 0.07379150390625, "learning_rate": 2.6049070999242708e-06, "loss": 0.0007377490401268005, "memory(GiB)": 38.05, "reward": 0.5749363899230957, "reward_std": 0.09188289940357208, "rewards/VisualizationJSONCombinedORM/mean": 0.5749363899230957, "rewards/VisualizationJSONCombinedORM/std": 0.13813380897045135, "step": 1676, "train_speed(iter/s)": 0.100084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/mean_length": 229.6875, "completions/min_length": 191.0, "epoch": 1.3870967741935485, "grad_norm": 0.20657013356685638, "kl": 0.038818359375, "learning_rate": 2.598572959989086e-06, "loss": 0.0003880634903907776, "memory(GiB)": 38.05, "reward": 0.6328732967376709, "reward_std": 0.106429323554039, "rewards/VisualizationJSONCombinedORM/mean": 0.6328732967376709, "rewards/VisualizationJSONCombinedORM/std": 0.10611221194267273, "step": 1677, "train_speed(iter/s)": 0.100014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 258.3125, "completions/min_length": 228.0, "epoch": 1.3879239040529363, "grad_norm": 0.2666057050228119, "kl": 0.0772705078125, "learning_rate": 2.5922438255979125e-06, "loss": 0.000775415450334549, "memory(GiB)": 38.05, "reward": 0.5734019875526428, "reward_std": 0.09790597856044769, "rewards/VisualizationJSONCombinedORM/mean": 0.5734019875526428, "rewards/VisualizationJSONCombinedORM/std": 0.11479556560516357, "step": 1678, "train_speed(iter/s)": 0.099952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 252.8125, "completions/min_length": 206.0, "epoch": 1.3887510339123241, "grad_norm": 0.1734464466571808, "kl": 0.0447998046875, "learning_rate": 2.5859197099432245e-06, "loss": 0.00044796615839004517, "memory(GiB)": 38.05, "reward": 0.6507681012153625, "reward_std": 0.06504210084676743, "rewards/VisualizationJSONCombinedORM/mean": 0.6507681012153625, "rewards/VisualizationJSONCombinedORM/std": 0.0999109074473381, "step": 1679, "train_speed(iter/s)": 0.099865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 280.1875, "completions/min_length": 216.0, "epoch": 1.3895781637717122, "grad_norm": 0.17855803668498993, "kl": 0.07568359375, "learning_rate": 2.5796006262070337e-06, "loss": 0.0007548555731773376, "memory(GiB)": 38.05, "reward": 0.48222842812538147, "reward_std": 0.06686222553253174, "rewards/VisualizationJSONCombinedORM/mean": 0.48222842812538147, "rewards/VisualizationJSONCombinedORM/std": 0.22021807730197906, "step": 1680, "train_speed(iter/s)": 0.099797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 248.875, "completions/min_length": 197.0, "epoch": 1.3904052936311002, "grad_norm": 0.23547987639904022, "kl": 0.061279296875, "learning_rate": 2.5732865875608643e-06, "loss": 0.0006142705678939819, "memory(GiB)": 38.05, "reward": 0.4046678841114044, "reward_std": 0.05773841217160225, "rewards/VisualizationJSONCombinedORM/mean": 0.4046678841114044, "rewards/VisualizationJSONCombinedORM/std": 0.07855735719203949, "step": 1681, "train_speed(iter/s)": 0.099686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 226.125, "completions/min_length": 199.0, "epoch": 1.391232423490488, "grad_norm": 0.15790945291519165, "kl": 0.06414794921875, "learning_rate": 2.5669776071657194e-06, "loss": 0.0006398409605026245, "memory(GiB)": 38.05, "reward": 0.4824570417404175, "reward_std": 0.06023857742547989, "rewards/VisualizationJSONCombinedORM/mean": 0.4824570417404175, "rewards/VisualizationJSONCombinedORM/std": 0.06510259956121445, "step": 1682, "train_speed(iter/s)": 0.099607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 254.875, "completions/min_length": 227.0, "epoch": 1.3920595533498759, "grad_norm": 0.15477816760540009, "kl": 0.0792236328125, "learning_rate": 2.560673698172066e-06, "loss": 0.0007927194237709045, "memory(GiB)": 38.05, "reward": 0.42341333627700806, "reward_std": 0.056466519832611084, "rewards/VisualizationJSONCombinedORM/mean": 0.42341333627700806, "rewards/VisualizationJSONCombinedORM/std": 0.1608443409204483, "step": 1683, "train_speed(iter/s)": 0.099554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 235.6875, "completions/min_length": 196.0, "epoch": 1.392886683209264, "grad_norm": 0.17481037974357605, "kl": 0.1092529296875, "learning_rate": 2.5543748737197953e-06, "loss": 0.0010940171778202057, "memory(GiB)": 38.05, "reward": 0.5019548535346985, "reward_std": 0.046584051102399826, "rewards/VisualizationJSONCombinedORM/mean": 0.5019548535346985, "rewards/VisualizationJSONCombinedORM/std": 0.26642706990242004, "step": 1684, "train_speed(iter/s)": 0.099488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 258.75, "completions/min_length": 240.0, "epoch": 1.3937138130686517, "grad_norm": 0.17901167273521423, "kl": 0.072021484375, "learning_rate": 2.5480811469382074e-06, "loss": 0.0007182210683822632, "memory(GiB)": 38.05, "reward": 0.5761889815330505, "reward_std": 0.09323142468929291, "rewards/VisualizationJSONCombinedORM/mean": 0.5761889815330505, "rewards/VisualizationJSONCombinedORM/std": 0.10712048411369324, "step": 1685, "train_speed(iter/s)": 0.099411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/mean_length": 234.5625, "completions/min_length": 187.0, "epoch": 1.3945409429280398, "grad_norm": 0.17982007563114166, "kl": 0.0821533203125, "learning_rate": 2.5417925309459623e-06, "loss": 0.0008215075358748436, "memory(GiB)": 38.05, "reward": 0.6115870475769043, "reward_std": 0.1016334742307663, "rewards/VisualizationJSONCombinedORM/mean": 0.6115870475769043, "rewards/VisualizationJSONCombinedORM/std": 0.09868323802947998, "step": 1686, "train_speed(iter/s)": 0.099327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 246.9375, "completions/min_length": 200.0, "epoch": 1.3953680727874276, "grad_norm": 0.17011621594429016, "kl": 0.0694580078125, "learning_rate": 2.5355090388510806e-06, "loss": 0.0006942972540855408, "memory(GiB)": 38.05, "reward": 0.6019582152366638, "reward_std": 0.08198636770248413, "rewards/VisualizationJSONCombinedORM/mean": 0.6019582152366638, "rewards/VisualizationJSONCombinedORM/std": 0.08727256953716278, "step": 1687, "train_speed(iter/s)": 0.099235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 237.8125, "completions/min_length": 190.0, "epoch": 1.3961952026468156, "grad_norm": 0.16820816695690155, "kl": 0.07305908203125, "learning_rate": 2.529230683750897e-06, "loss": 0.0007295571267604828, "memory(GiB)": 38.05, "reward": 0.6089550852775574, "reward_std": 0.06075243651866913, "rewards/VisualizationJSONCombinedORM/mean": 0.6089550852775574, "rewards/VisualizationJSONCombinedORM/std": 0.1450989544391632, "step": 1688, "train_speed(iter/s)": 0.099145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 253.125, "completions/min_length": 207.0, "epoch": 1.3970223325062034, "grad_norm": 0.1828160434961319, "kl": 0.1439208984375, "learning_rate": 2.522957478732043e-06, "loss": 0.0014358209446072578, "memory(GiB)": 38.05, "reward": 0.6427037715911865, "reward_std": 0.12332317233085632, "rewards/VisualizationJSONCombinedORM/mean": 0.6427037715911865, "rewards/VisualizationJSONCombinedORM/std": 0.1281658560037613, "step": 1689, "train_speed(iter/s)": 0.099038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/mean_length": 232.0625, "completions/min_length": 205.0, "epoch": 1.3978494623655915, "grad_norm": 0.20370501279830933, "kl": 0.08184814453125, "learning_rate": 2.51668943687041e-06, "loss": 0.0008185431361198425, "memory(GiB)": 38.05, "reward": 0.42032134532928467, "reward_std": 0.0630793645977974, "rewards/VisualizationJSONCombinedORM/mean": 0.42032134532928467, "rewards/VisualizationJSONCombinedORM/std": 0.20225030183792114, "step": 1690, "train_speed(iter/s)": 0.098982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 241.6875, "completions/min_length": 207.0, "epoch": 1.3986765922249793, "grad_norm": 0.19048453867435455, "kl": 0.06634521484375, "learning_rate": 2.5104265712311266e-06, "loss": 0.0006627514958381653, "memory(GiB)": 38.05, "reward": 0.40557482838630676, "reward_std": 0.05669781193137169, "rewards/VisualizationJSONCombinedORM/mean": 0.40557482838630676, "rewards/VisualizationJSONCombinedORM/std": 0.10320647060871124, "step": 1691, "train_speed(iter/s)": 0.098895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 266.4375, "completions/min_length": 223.0, "epoch": 1.3995037220843671, "grad_norm": 0.21530450880527496, "kl": 0.0611572265625, "learning_rate": 2.5041688948685367e-06, "loss": 0.0006121471524238586, "memory(GiB)": 38.05, "reward": 0.6606772541999817, "reward_std": 0.10671497881412506, "rewards/VisualizationJSONCombinedORM/mean": 0.6606772541999817, "rewards/VisualizationJSONCombinedORM/std": 0.10377373546361923, "step": 1692, "train_speed(iter/s)": 0.098821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 246.75, "completions/min_length": 210.0, "epoch": 1.4003308519437552, "grad_norm": 0.22512876987457275, "kl": 0.0579833984375, "learning_rate": 2.497916420826165e-06, "loss": 0.0005793347954750061, "memory(GiB)": 38.05, "reward": 0.6405295729637146, "reward_std": 0.11460460722446442, "rewards/VisualizationJSONCombinedORM/mean": 0.6405295729637146, "rewards/VisualizationJSONCombinedORM/std": 0.17505241930484772, "step": 1693, "train_speed(iter/s)": 0.09875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 252.1875, "completions/min_length": 190.0, "epoch": 1.4011579818031432, "grad_norm": 0.17617857456207275, "kl": 0.10369873046875, "learning_rate": 2.4916691621366984e-06, "loss": 0.0010362900793552399, "memory(GiB)": 38.05, "reward": 0.551203727722168, "reward_std": 0.08066040277481079, "rewards/VisualizationJSONCombinedORM/mean": 0.551203727722168, "rewards/VisualizationJSONCombinedORM/std": 0.0916314348578453, "step": 1694, "train_speed(iter/s)": 0.098677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/mean_length": 223.375, "completions/min_length": 195.0, "epoch": 1.401985111662531, "grad_norm": 0.16523468494415283, "kl": 0.049560546875, "learning_rate": 2.485427131821939e-06, "loss": 0.0004963874816894531, "memory(GiB)": 38.05, "reward": 0.5665196180343628, "reward_std": 0.0790814757347107, "rewards/VisualizationJSONCombinedORM/mean": 0.5665196180343628, "rewards/VisualizationJSONCombinedORM/std": 0.16947734355926514, "step": 1695, "train_speed(iter/s)": 0.098606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 238.375, "completions/min_length": 195.0, "epoch": 1.4028122415219189, "grad_norm": 0.18171143531799316, "kl": 0.09375, "learning_rate": 2.479190342892804e-06, "loss": 0.0009371563792228699, "memory(GiB)": 38.05, "reward": 0.5607275366783142, "reward_std": 0.054403096437454224, "rewards/VisualizationJSONCombinedORM/mean": 0.5607275366783142, "rewards/VisualizationJSONCombinedORM/std": 0.17767976224422455, "step": 1696, "train_speed(iter/s)": 0.098519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/mean_length": 237.3125, "completions/min_length": 217.0, "epoch": 1.403639371381307, "grad_norm": 0.16064773499965668, "kl": 0.05377197265625, "learning_rate": 2.4729588083492795e-06, "loss": 0.0005371421575546265, "memory(GiB)": 38.05, "reward": 0.6935431957244873, "reward_std": 0.13437144458293915, "rewards/VisualizationJSONCombinedORM/mean": 0.6935431957244873, "rewards/VisualizationJSONCombinedORM/std": 0.13233059644699097, "step": 1697, "train_speed(iter/s)": 0.098455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/mean_length": 225.1875, "completions/min_length": 203.0, "epoch": 1.4044665012406947, "grad_norm": 0.15450438857078552, "kl": 0.0621337890625, "learning_rate": 2.466732541180404e-06, "loss": 0.0006216689944267273, "memory(GiB)": 38.05, "reward": 0.48212558031082153, "reward_std": 0.05972491577267647, "rewards/VisualizationJSONCombinedORM/mean": 0.48212558031082153, "rewards/VisualizationJSONCombinedORM/std": 0.061319783329963684, "step": 1698, "train_speed(iter/s)": 0.098378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 236.1875, "completions/min_length": 190.0, "epoch": 1.4052936311000828, "grad_norm": 0.13838230073451996, "kl": 0.072265625, "learning_rate": 2.4605115543642307e-06, "loss": 0.0007228069007396698, "memory(GiB)": 38.05, "reward": 0.40255656838417053, "reward_std": 0.06281161308288574, "rewards/VisualizationJSONCombinedORM/mean": 0.40255656838417053, "rewards/VisualizationJSONCombinedORM/std": 0.0737900361418724, "step": 1699, "train_speed(iter/s)": 0.0983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/mean_length": 246.125, "completions/min_length": 204.0, "epoch": 1.4061207609594706, "grad_norm": 0.27226778864860535, "kl": 0.06634521484375, "learning_rate": 2.4542958608678075e-06, "loss": 0.0006634797900915146, "memory(GiB)": 38.05, "reward": 0.47403013706207275, "reward_std": 0.08330580592155457, "rewards/VisualizationJSONCombinedORM/mean": 0.47403013706207275, "rewards/VisualizationJSONCombinedORM/std": 0.2374376356601715, "step": 1700, "train_speed(iter/s)": 0.098224 }, { "epoch": 1.4061207609594706, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 290.5416666666667, "eval_completions/mean_length": 245.84895833333334, "eval_completions/min_length": 214.375, "eval_kl": 0.08404541015625, "eval_loss": 0.0008499361574649811, "eval_reward": 0.48822543770074844, "eval_reward_std": 0.07736410984459023, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.48822543770074844, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07736411162962516, "eval_runtime": 265.2853, "eval_samples_per_second": 0.09, "eval_steps_per_second": 0.011, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/mean_length": 234.4375, "completions/min_length": 203.0, "epoch": 1.4069478908188586, "grad_norm": 0.1902731955051422, "kl": 0.0655517578125, "learning_rate": 2.4480854736471523e-06, "loss": 0.0006559304893016815, "memory(GiB)": 38.05, "reward": 0.6349145174026489, "reward_std": 0.06414473056793213, "rewards/VisualizationJSONCombinedORM/mean": 0.6349145174026489, "rewards/VisualizationJSONCombinedORM/std": 0.15693001449108124, "step": 1701, "train_speed(iter/s)": 0.096664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 245.6875, "completions/min_length": 207.0, "epoch": 1.4077750206782464, "grad_norm": 0.17764076590538025, "kl": 0.06787109375, "learning_rate": 2.4418804056472228e-06, "loss": 0.0006786063313484192, "memory(GiB)": 38.05, "reward": 0.4833843410015106, "reward_std": 0.06829570233821869, "rewards/VisualizationJSONCombinedORM/mean": 0.4833843410015106, "rewards/VisualizationJSONCombinedORM/std": 0.06897593289613724, "step": 1702, "train_speed(iter/s)": 0.096567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 265.0, "completions/min_length": 226.0, "epoch": 1.4086021505376345, "grad_norm": 0.1952662169933319, "kl": 0.0849609375, "learning_rate": 2.435680669801885e-06, "loss": 0.0008502379059791565, "memory(GiB)": 38.05, "reward": 0.5411779284477234, "reward_std": 0.08385826647281647, "rewards/VisualizationJSONCombinedORM/mean": 0.5411779284477234, "rewards/VisualizationJSONCombinedORM/std": 0.1623935103416443, "step": 1703, "train_speed(iter/s)": 0.096482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 254.875, "completions/min_length": 225.0, "epoch": 1.4094292803970223, "grad_norm": 0.1879546046257019, "kl": 0.096923828125, "learning_rate": 2.429486279033892e-06, "loss": 0.00096902996301651, "memory(GiB)": 38.05, "reward": 0.612080454826355, "reward_std": 0.09366379678249359, "rewards/VisualizationJSONCombinedORM/mean": 0.612080454826355, "rewards/VisualizationJSONCombinedORM/std": 0.14821083843708038, "step": 1704, "train_speed(iter/s)": 0.096403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/mean_length": 251.4375, "completions/min_length": 197.0, "epoch": 1.4102564102564101, "grad_norm": 0.18609710037708282, "kl": 0.087890625, "learning_rate": 2.423297246254857e-06, "loss": 0.0008770152926445007, "memory(GiB)": 38.05, "reward": 0.4742201864719391, "reward_std": 0.07669629156589508, "rewards/VisualizationJSONCombinedORM/mean": 0.4742201864719391, "rewards/VisualizationJSONCombinedORM/std": 0.1837160885334015, "step": 1705, "train_speed(iter/s)": 0.096343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 248.4375, "completions/min_length": 190.0, "epoch": 1.4110835401157982, "grad_norm": 0.19089825451374054, "kl": 0.08154296875, "learning_rate": 2.4171135843652256e-06, "loss": 0.0008152946829795837, "memory(GiB)": 38.05, "reward": 0.5243187546730042, "reward_std": 0.1069956123828888, "rewards/VisualizationJSONCombinedORM/mean": 0.5243187546730042, "rewards/VisualizationJSONCombinedORM/std": 0.11143678426742554, "step": 1706, "train_speed(iter/s)": 0.09628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 267.5625, "completions/min_length": 206.0, "epoch": 1.4119106699751862, "grad_norm": 0.18366318941116333, "kl": 0.07196044921875, "learning_rate": 2.410935306254247e-06, "loss": 0.0007200315594673157, "memory(GiB)": 38.05, "reward": 0.6265835165977478, "reward_std": 0.1053728312253952, "rewards/VisualizationJSONCombinedORM/mean": 0.6265835165977478, "rewards/VisualizationJSONCombinedORM/std": 0.10727526992559433, "step": 1707, "train_speed(iter/s)": 0.096208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 241.9375, "completions/min_length": 200.0, "epoch": 1.412737799834574, "grad_norm": 0.21153251826763153, "kl": 0.06671142578125, "learning_rate": 2.4047624247999484e-06, "loss": 0.0006661973893642426, "memory(GiB)": 38.05, "reward": 0.5189058780670166, "reward_std": 0.09970605373382568, "rewards/VisualizationJSONCombinedORM/mean": 0.5189058780670166, "rewards/VisualizationJSONCombinedORM/std": 0.1372365951538086, "step": 1708, "train_speed(iter/s)": 0.096107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 259.3125, "completions/min_length": 194.0, "epoch": 1.4135649296939619, "grad_norm": 0.16732144355773926, "kl": 0.1033935546875, "learning_rate": 2.398594952869106e-06, "loss": 0.0010357312858104706, "memory(GiB)": 38.05, "reward": 0.4993736147880554, "reward_std": 0.04803970828652382, "rewards/VisualizationJSONCombinedORM/mean": 0.4993736147880554, "rewards/VisualizationJSONCombinedORM/std": 0.2556758522987366, "step": 1709, "train_speed(iter/s)": 0.096033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 235.5625, "completions/min_length": 207.0, "epoch": 1.41439205955335, "grad_norm": 0.1922534555196762, "kl": 0.0589599609375, "learning_rate": 2.3924329033172246e-06, "loss": 0.0005901940166950226, "memory(GiB)": 38.05, "reward": 0.6980793476104736, "reward_std": 0.12214665114879608, "rewards/VisualizationJSONCombinedORM/mean": 0.6980793476104736, "rewards/VisualizationJSONCombinedORM/std": 0.16197997331619263, "step": 1710, "train_speed(iter/s)": 0.095957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 259.25, "completions/min_length": 213.0, "epoch": 1.4152191894127377, "grad_norm": 0.2294125258922577, "kl": 0.0919189453125, "learning_rate": 2.386276288988506e-06, "loss": 0.0009191446006298065, "memory(GiB)": 38.05, "reward": 0.5142511129379272, "reward_std": 0.08081350475549698, "rewards/VisualizationJSONCombinedORM/mean": 0.5142511129379272, "rewards/VisualizationJSONCombinedORM/std": 0.10282278060913086, "step": 1711, "train_speed(iter/s)": 0.095852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 234.125, "completions/min_length": 197.0, "epoch": 1.4160463192721258, "grad_norm": 0.17481321096420288, "kl": 0.04534912109375, "learning_rate": 2.38012512271582e-06, "loss": 0.0004532933235168457, "memory(GiB)": 38.05, "reward": 0.49115854501724243, "reward_std": 0.06964201480150223, "rewards/VisualizationJSONCombinedORM/mean": 0.49115854501724243, "rewards/VisualizationJSONCombinedORM/std": 0.1671074628829956, "step": 1712, "train_speed(iter/s)": 0.095787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 235.0625, "completions/min_length": 208.0, "epoch": 1.4168734491315136, "grad_norm": 0.19192937016487122, "kl": 0.06256103515625, "learning_rate": 2.3739794173206806e-06, "loss": 0.0006251484155654907, "memory(GiB)": 38.05, "reward": 0.4050447642803192, "reward_std": 0.0817679911851883, "rewards/VisualizationJSONCombinedORM/mean": 0.4050447642803192, "rewards/VisualizationJSONCombinedORM/std": 0.1084175631403923, "step": 1713, "train_speed(iter/s)": 0.095733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 256.625, "completions/min_length": 207.0, "epoch": 1.4177005789909016, "grad_norm": 0.23593653738498688, "kl": 0.087646484375, "learning_rate": 2.3678391856132203e-06, "loss": 0.0008753389120101929, "memory(GiB)": 38.05, "reward": 0.5841360688209534, "reward_std": 0.08311061561107635, "rewards/VisualizationJSONCombinedORM/mean": 0.5841360688209534, "rewards/VisualizationJSONCombinedORM/std": 0.13626424968242645, "step": 1714, "train_speed(iter/s)": 0.095657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 246.75, "completions/min_length": 213.0, "epoch": 1.4185277088502894, "grad_norm": 0.18218167126178741, "kl": 0.04534912109375, "learning_rate": 2.3617044403921667e-06, "loss": 0.0004531443119049072, "memory(GiB)": 38.05, "reward": 0.4585976004600525, "reward_std": 0.08142246305942535, "rewards/VisualizationJSONCombinedORM/mean": 0.4585976004600525, "rewards/VisualizationJSONCombinedORM/std": 0.21631479263305664, "step": 1715, "train_speed(iter/s)": 0.095619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 263.5625, "completions/min_length": 209.0, "epoch": 1.4193548387096775, "grad_norm": 0.1776731163263321, "kl": 0.04327392578125, "learning_rate": 2.3555751944448036e-06, "loss": 0.00043210387229919434, "memory(GiB)": 38.05, "reward": 0.6441746950149536, "reward_std": 0.07341741025447845, "rewards/VisualizationJSONCombinedORM/mean": 0.6441746950149536, "rewards/VisualizationJSONCombinedORM/std": 0.09394550323486328, "step": 1716, "train_speed(iter/s)": 0.095563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 271.125, "completions/min_length": 221.0, "epoch": 1.4201819685690653, "grad_norm": 0.15134724974632263, "kl": 0.05181884765625, "learning_rate": 2.3494514605469535e-06, "loss": 0.0005179047584533691, "memory(GiB)": 38.05, "reward": 0.6487385034561157, "reward_std": 0.08420510590076447, "rewards/VisualizationJSONCombinedORM/mean": 0.6487385034561157, "rewards/VisualizationJSONCombinedORM/std": 0.09550508856773376, "step": 1717, "train_speed(iter/s)": 0.095474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 234.25, "completions/min_length": 199.0, "epoch": 1.4210090984284531, "grad_norm": 0.16336461901664734, "kl": 0.1253662109375, "learning_rate": 2.343333251462954e-06, "loss": 0.0012556463479995728, "memory(GiB)": 38.05, "reward": 0.7069574594497681, "reward_std": 0.09260024130344391, "rewards/VisualizationJSONCombinedORM/mean": 0.7069574594497681, "rewards/VisualizationJSONCombinedORM/std": 0.09517063945531845, "step": 1718, "train_speed(iter/s)": 0.095406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 243.6875, "completions/min_length": 186.0, "epoch": 1.4218362282878412, "grad_norm": 0.2611885666847229, "kl": 0.0791015625, "learning_rate": 2.337220579945626e-06, "loss": 0.0007904320955276489, "memory(GiB)": 38.05, "reward": 0.48541054129600525, "reward_std": 0.0769965797662735, "rewards/VisualizationJSONCombinedORM/mean": 0.48541054129600525, "rewards/VisualizationJSONCombinedORM/std": 0.304320752620697, "step": 1719, "train_speed(iter/s)": 0.095337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 248.4375, "completions/min_length": 188.0, "epoch": 1.4226633581472292, "grad_norm": 0.19184859097003937, "kl": 0.042236328125, "learning_rate": 2.3311134587362426e-06, "loss": 0.00042267516255378723, "memory(GiB)": 38.05, "reward": 0.3623327910900116, "reward_std": 0.04148200899362564, "rewards/VisualizationJSONCombinedORM/mean": 0.3623327910900116, "rewards/VisualizationJSONCombinedORM/std": 0.10599017888307571, "step": 1720, "train_speed(iter/s)": 0.095274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 265.4375, "completions/min_length": 210.0, "epoch": 1.423490488006617, "grad_norm": 0.22630015015602112, "kl": 0.072509765625, "learning_rate": 2.325011900564515e-06, "loss": 0.0007265843451023102, "memory(GiB)": 38.05, "reward": 0.5224531292915344, "reward_std": 0.05630069226026535, "rewards/VisualizationJSONCombinedORM/mean": 0.5224531292915344, "rewards/VisualizationJSONCombinedORM/std": 0.2223750799894333, "step": 1721, "train_speed(iter/s)": 0.095192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/mean_length": 236.375, "completions/min_length": 201.0, "epoch": 1.4243176178660049, "grad_norm": 0.18949942290782928, "kl": 0.0845947265625, "learning_rate": 2.3189159181485517e-06, "loss": 0.0008468180894851685, "memory(GiB)": 38.05, "reward": 0.47208911180496216, "reward_std": 0.06209621578454971, "rewards/VisualizationJSONCombinedORM/mean": 0.47208911180496216, "rewards/VisualizationJSONCombinedORM/std": 0.19946438074111938, "step": 1722, "train_speed(iter/s)": 0.095144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 263.6875, "completions/min_length": 200.0, "epoch": 1.425144747725393, "grad_norm": 0.1939249336719513, "kl": 0.0567626953125, "learning_rate": 2.3128255241948434e-06, "loss": 0.0005678310990333557, "memory(GiB)": 38.05, "reward": 0.4067949652671814, "reward_std": 0.038748521357774734, "rewards/VisualizationJSONCombinedORM/mean": 0.4067949652671814, "rewards/VisualizationJSONCombinedORM/std": 0.1483125239610672, "step": 1723, "train_speed(iter/s)": 0.095086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 237.8125, "completions/min_length": 191.0, "epoch": 1.425971877584781, "grad_norm": 0.256219744682312, "kl": 0.0660400390625, "learning_rate": 2.306740731398234e-06, "loss": 0.0006617829203605652, "memory(GiB)": 38.05, "reward": 0.5491739511489868, "reward_std": 0.09458106756210327, "rewards/VisualizationJSONCombinedORM/mean": 0.5491739511489868, "rewards/VisualizationJSONCombinedORM/std": 0.09802288562059402, "step": 1724, "train_speed(iter/s)": 0.095023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/mean_length": 250.3125, "completions/min_length": 217.0, "epoch": 1.4267990074441688, "grad_norm": 0.16163113713264465, "kl": 0.12408447265625, "learning_rate": 2.300661552441888e-06, "loss": 0.0012390632182359695, "memory(GiB)": 38.05, "reward": 0.3611384630203247, "reward_std": 0.04975789040327072, "rewards/VisualizationJSONCombinedORM/mean": 0.3611384630203247, "rewards/VisualizationJSONCombinedORM/std": 0.10333042591810226, "step": 1725, "train_speed(iter/s)": 0.094957 }, { "epoch": 1.4267990074441688, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 287.0, "eval_completions/mean_length": 246.34895833333334, "eval_completions/min_length": 214.66666666666666, "eval_kl": 0.058756510416666664, "eval_loss": 0.0005897085065953434, "eval_reward": 0.4507008617122968, "eval_reward_std": 0.07316911011002958, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4507008617122968, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07316911274877687, "eval_runtime": 264.2091, "eval_samples_per_second": 0.091, "eval_steps_per_second": 0.011, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/mean_length": 236.0, "completions/min_length": 206.0, "epoch": 1.4276261373035566, "grad_norm": 0.16964906454086304, "kl": 0.03802490234375, "learning_rate": 2.2945879999972676e-06, "loss": 0.0003817155957221985, "memory(GiB)": 38.05, "reward": 0.5338233709335327, "reward_std": 0.04957310110330582, "rewards/VisualizationJSONCombinedORM/mean": 0.5338233709335327, "rewards/VisualizationJSONCombinedORM/std": 0.19204039871692657, "step": 1726, "train_speed(iter/s)": 0.093544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 255.25, "completions/min_length": 193.0, "epoch": 1.4284532671629446, "grad_norm": 0.23366360366344452, "kl": 0.0594482421875, "learning_rate": 2.288520086724112e-06, "loss": 0.00059523805975914, "memory(GiB)": 38.05, "reward": 0.43866267800331116, "reward_std": 0.06502803415060043, "rewards/VisualizationJSONCombinedORM/mean": 0.43866267800331116, "rewards/VisualizationJSONCombinedORM/std": 0.1676756739616394, "step": 1727, "train_speed(iter/s)": 0.093462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 255.0625, "completions/min_length": 224.0, "epoch": 1.4292803970223324, "grad_norm": 0.17944073677062988, "kl": 0.0914306640625, "learning_rate": 2.2824578252704042e-06, "loss": 0.0009139403700828552, "memory(GiB)": 38.05, "reward": 0.5852118730545044, "reward_std": 0.09730522334575653, "rewards/VisualizationJSONCombinedORM/mean": 0.5852118730545044, "rewards/VisualizationJSONCombinedORM/std": 0.10455738008022308, "step": 1728, "train_speed(iter/s)": 0.093377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 240.0, "completions/min_length": 210.0, "epoch": 1.4301075268817205, "grad_norm": 0.17275458574295044, "kl": 0.03411865234375, "learning_rate": 2.276401228272344e-06, "loss": 0.00034186244010925293, "memory(GiB)": 38.05, "reward": 0.49320128560066223, "reward_std": 0.06421677768230438, "rewards/VisualizationJSONCombinedORM/mean": 0.49320128560066223, "rewards/VisualizationJSONCombinedORM/std": 0.2611158490180969, "step": 1729, "train_speed(iter/s)": 0.093322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 250.125, "completions/min_length": 207.0, "epoch": 1.4309346567411083, "grad_norm": 0.23636896908283234, "kl": 0.0372314453125, "learning_rate": 2.2703503083543288e-06, "loss": 0.00037235021591186523, "memory(GiB)": 38.05, "reward": 0.46810799837112427, "reward_std": 0.07168708741664886, "rewards/VisualizationJSONCombinedORM/mean": 0.46810799837112427, "rewards/VisualizationJSONCombinedORM/std": 0.25716984272003174, "step": 1730, "train_speed(iter/s)": 0.093257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 245.125, "completions/min_length": 194.0, "epoch": 1.4317617866004964, "grad_norm": 0.16860409080982208, "kl": 0.04351806640625, "learning_rate": 2.264305078128916e-06, "loss": 0.0004350673407316208, "memory(GiB)": 38.05, "reward": 0.5029296875, "reward_std": 0.05136634781956673, "rewards/VisualizationJSONCombinedORM/mean": 0.5029296875, "rewards/VisualizationJSONCombinedORM/std": 0.12050236761569977, "step": 1731, "train_speed(iter/s)": 0.093189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 241.5, "completions/min_length": 199.0, "epoch": 1.4325889164598842, "grad_norm": 0.16484704613685608, "kl": 0.0946044921875, "learning_rate": 2.258265550196812e-06, "loss": 0.0009448975324630737, "memory(GiB)": 38.05, "reward": 0.4487890601158142, "reward_std": 0.07740520685911179, "rewards/VisualizationJSONCombinedORM/mean": 0.4487890601158142, "rewards/VisualizationJSONCombinedORM/std": 0.12187893688678741, "step": 1732, "train_speed(iter/s)": 0.093112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 259.0625, "completions/min_length": 209.0, "epoch": 1.4334160463192722, "grad_norm": 0.1689784675836563, "kl": 0.0494384765625, "learning_rate": 2.252231737146828e-06, "loss": 0.0004941988736391068, "memory(GiB)": 38.05, "reward": 0.6346081495285034, "reward_std": 0.08577923476696014, "rewards/VisualizationJSONCombinedORM/mean": 0.6346081495285034, "rewards/VisualizationJSONCombinedORM/std": 0.1325279325246811, "step": 1733, "train_speed(iter/s)": 0.093045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 254.6875, "completions/min_length": 216.0, "epoch": 1.43424317617866, "grad_norm": 0.16717112064361572, "kl": 0.058349609375, "learning_rate": 2.2462036515558726e-06, "loss": 0.000583253800868988, "memory(GiB)": 38.05, "reward": 0.39039602875709534, "reward_std": 0.014516245573759079, "rewards/VisualizationJSONCombinedORM/mean": 0.39039602875709534, "rewards/VisualizationJSONCombinedORM/std": 0.06276363134384155, "step": 1734, "train_speed(iter/s)": 0.092975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 262.75, "completions/min_length": 220.0, "epoch": 1.4350703060380479, "grad_norm": 0.17952710390090942, "kl": 0.06829833984375, "learning_rate": 2.2401813059889067e-06, "loss": 0.0006813574582338333, "memory(GiB)": 38.05, "reward": 0.40877217054367065, "reward_std": 0.05963084101676941, "rewards/VisualizationJSONCombinedORM/mean": 0.40877217054367065, "rewards/VisualizationJSONCombinedORM/std": 0.06938137114048004, "step": 1735, "train_speed(iter/s)": 0.092895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 261.875, "completions/min_length": 206.0, "epoch": 1.435897435897436, "grad_norm": 0.15537220239639282, "kl": 0.0740966796875, "learning_rate": 2.234164712998935e-06, "loss": 0.0007396657019853592, "memory(GiB)": 38.05, "reward": 0.5228443145751953, "reward_std": 0.0359862744808197, "rewards/VisualizationJSONCombinedORM/mean": 0.5228443145751953, "rewards/VisualizationJSONCombinedORM/std": 0.1976841241121292, "step": 1736, "train_speed(iter/s)": 0.092818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 252.5, "completions/min_length": 229.0, "epoch": 1.436724565756824, "grad_norm": 0.13060279190540314, "kl": 0.039306640625, "learning_rate": 2.2281538851269635e-06, "loss": 0.00039276108145713806, "memory(GiB)": 38.05, "reward": 0.551544725894928, "reward_std": 0.060999054461717606, "rewards/VisualizationJSONCombinedORM/mean": 0.551544725894928, "rewards/VisualizationJSONCombinedORM/std": 0.23567794263362885, "step": 1737, "train_speed(iter/s)": 0.09277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 251.3125, "completions/min_length": 194.0, "epoch": 1.4375516956162118, "grad_norm": 0.5964269042015076, "kl": 0.04937744140625, "learning_rate": 2.2221488349019903e-06, "loss": 0.0004932470619678497, "memory(GiB)": 38.05, "reward": 0.5988490581512451, "reward_std": 0.09214122593402863, "rewards/VisualizationJSONCombinedORM/mean": 0.5988490581512451, "rewards/VisualizationJSONCombinedORM/std": 0.19711239635944366, "step": 1738, "train_speed(iter/s)": 0.092708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/mean_length": 234.8125, "completions/min_length": 211.0, "epoch": 1.4383788254755996, "grad_norm": 0.1821955442428589, "kl": 0.057861328125, "learning_rate": 2.2161495748409617e-06, "loss": 0.0005783811211585999, "memory(GiB)": 38.05, "reward": 0.7006762027740479, "reward_std": 0.13148784637451172, "rewards/VisualizationJSONCombinedORM/mean": 0.7006762027740479, "rewards/VisualizationJSONCombinedORM/std": 0.13078224658966064, "step": 1739, "train_speed(iter/s)": 0.092628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 253.1875, "completions/min_length": 201.0, "epoch": 1.4392059553349876, "grad_norm": 0.20554110407829285, "kl": 0.060791015625, "learning_rate": 2.2101561174487606e-06, "loss": 0.000607714056968689, "memory(GiB)": 38.05, "reward": 0.34639349579811096, "reward_std": 0.04209046810865402, "rewards/VisualizationJSONCombinedORM/mean": 0.34639349579811096, "rewards/VisualizationJSONCombinedORM/std": 0.17182281613349915, "step": 1740, "train_speed(iter/s)": 0.092541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 261.0625, "completions/min_length": 225.0, "epoch": 1.4400330851943755, "grad_norm": 0.17839470505714417, "kl": 0.08062744140625, "learning_rate": 2.2041684752181763e-06, "loss": 0.0008045677095651627, "memory(GiB)": 38.05, "reward": 0.3750186264514923, "reward_std": 0.05940868332982063, "rewards/VisualizationJSONCombinedORM/mean": 0.3750186264514923, "rewards/VisualizationJSONCombinedORM/std": 0.0679989904165268, "step": 1741, "train_speed(iter/s)": 0.092492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 248.0, "completions/min_length": 209.0, "epoch": 1.4408602150537635, "grad_norm": 0.23407408595085144, "kl": 0.04412841796875, "learning_rate": 2.1981866606298684e-06, "loss": 0.0004409179091453552, "memory(GiB)": 38.05, "reward": 0.6110429763793945, "reward_std": 0.11200286448001862, "rewards/VisualizationJSONCombinedORM/mean": 0.6110429763793945, "rewards/VisualizationJSONCombinedORM/std": 0.13689365983009338, "step": 1742, "train_speed(iter/s)": 0.092442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 251.125, "completions/min_length": 214.0, "epoch": 1.4416873449131513, "grad_norm": 0.17669148743152618, "kl": 0.1148681640625, "learning_rate": 2.19221068615236e-06, "loss": 0.0011488422751426697, "memory(GiB)": 38.05, "reward": 0.6539263725280762, "reward_std": 0.14509545266628265, "rewards/VisualizationJSONCombinedORM/mean": 0.6539263725280762, "rewards/VisualizationJSONCombinedORM/std": 0.1563284546136856, "step": 1743, "train_speed(iter/s)": 0.092354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 244.9375, "completions/min_length": 203.0, "epoch": 1.4425144747725394, "grad_norm": 0.2053314596414566, "kl": 0.06207275390625, "learning_rate": 2.186240564241992e-06, "loss": 0.0006194394081830978, "memory(GiB)": 38.05, "reward": 0.5286207795143127, "reward_std": 0.1203216165304184, "rewards/VisualizationJSONCombinedORM/mean": 0.5286207795143127, "rewards/VisualizationJSONCombinedORM/std": 0.1477683186531067, "step": 1744, "train_speed(iter/s)": 0.092291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 221.3125, "completions/min_length": 167.0, "epoch": 1.4433416046319272, "grad_norm": 0.18715527653694153, "kl": 0.05157470703125, "learning_rate": 2.1802763073429126e-06, "loss": 0.0005157478153705597, "memory(GiB)": 38.05, "reward": 0.4414294362068176, "reward_std": 0.0480278879404068, "rewards/VisualizationJSONCombinedORM/mean": 0.4414294362068176, "rewards/VisualizationJSONCombinedORM/std": 0.0732240304350853, "step": 1745, "train_speed(iter/s)": 0.092234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/mean_length": 237.0, "completions/min_length": 188.0, "epoch": 1.4441687344913152, "grad_norm": 0.17387038469314575, "kl": 0.0762939453125, "learning_rate": 2.174317927887041e-06, "loss": 0.00076247937977314, "memory(GiB)": 38.05, "reward": 0.4942909777164459, "reward_std": 0.07285857200622559, "rewards/VisualizationJSONCombinedORM/mean": 0.4942909777164459, "rewards/VisualizationJSONCombinedORM/std": 0.2414817214012146, "step": 1746, "train_speed(iter/s)": 0.092186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 291.75, "completions/min_length": 221.0, "epoch": 1.444995864350703, "grad_norm": 0.18176770210266113, "kl": 0.134033203125, "learning_rate": 2.1683654382940484e-06, "loss": 0.0013427883386611938, "memory(GiB)": 38.05, "reward": 0.7173507809638977, "reward_std": 0.1364223062992096, "rewards/VisualizationJSONCombinedORM/mean": 0.7173507809638977, "rewards/VisualizationJSONCombinedORM/std": 0.13401548564434052, "step": 1747, "train_speed(iter/s)": 0.092091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 259.1875, "completions/min_length": 183.0, "epoch": 1.4458229942100909, "grad_norm": 0.18580415844917297, "kl": 0.055419921875, "learning_rate": 2.162418850971325e-06, "loss": 0.0005545131862163544, "memory(GiB)": 38.05, "reward": 0.6354774236679077, "reward_std": 0.06600581109523773, "rewards/VisualizationJSONCombinedORM/mean": 0.6354774236679077, "rewards/VisualizationJSONCombinedORM/std": 0.12946313619613647, "step": 1748, "train_speed(iter/s)": 0.092029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 251.0, "completions/min_length": 216.0, "epoch": 1.446650124069479, "grad_norm": 0.1751297116279602, "kl": 0.04925537109375, "learning_rate": 2.1564781783139645e-06, "loss": 0.000492386519908905, "memory(GiB)": 38.05, "reward": 0.4006626605987549, "reward_std": 0.03837113827466965, "rewards/VisualizationJSONCombinedORM/mean": 0.4006626605987549, "rewards/VisualizationJSONCombinedORM/std": 0.07020058482885361, "step": 1749, "train_speed(iter/s)": 0.091972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 244.1875, "completions/min_length": 212.0, "epoch": 1.447477253928867, "grad_norm": 0.2514462471008301, "kl": 0.04388427734375, "learning_rate": 2.1505434327047246e-06, "loss": 0.0004386892542243004, "memory(GiB)": 38.05, "reward": 0.5535655617713928, "reward_std": 0.12116320431232452, "rewards/VisualizationJSONCombinedORM/mean": 0.5535655617713928, "rewards/VisualizationJSONCombinedORM/std": 0.13279400765895844, "step": 1750, "train_speed(iter/s)": 0.091912 }, { "epoch": 1.447477253928867, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 306.3333333333333, "eval_completions/mean_length": 254.72395833333334, "eval_completions/min_length": 216.29166666666666, "eval_kl": 0.0701904296875, "eval_loss": 0.0007059040362946689, "eval_reward": 0.46785940291980904, "eval_reward_std": 0.06863242271356285, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46785940291980904, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06863242372249563, "eval_runtime": 275.7662, "eval_samples_per_second": 0.087, "eval_steps_per_second": 0.011, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 254.9375, "completions/min_length": 193.0, "epoch": 1.4483043837882548, "grad_norm": 0.18405365943908691, "kl": 0.05322265625, "learning_rate": 2.1446146265140144e-06, "loss": 0.000532299280166626, "memory(GiB)": 38.05, "reward": 0.5248857736587524, "reward_std": 0.13272017240524292, "rewards/VisualizationJSONCombinedORM/mean": 0.5248857736587524, "rewards/VisualizationJSONCombinedORM/std": 0.24631907045841217, "step": 1751, "train_speed(iter/s)": 0.090538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 255.0625, "completions/min_length": 218.0, "epoch": 1.4491315136476426, "grad_norm": 0.15556935966014862, "kl": 0.041259765625, "learning_rate": 2.138691772099863e-06, "loss": 0.00041349977254867554, "memory(GiB)": 38.05, "reward": 0.6267537474632263, "reward_std": 0.045001547783613205, "rewards/VisualizationJSONCombinedORM/mean": 0.6267537474632263, "rewards/VisualizationJSONCombinedORM/std": 0.06762322783470154, "step": 1752, "train_speed(iter/s)": 0.090475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 237.1875, "completions/min_length": 196.0, "epoch": 1.4499586435070306, "grad_norm": 0.17635977268218994, "kl": 0.079833984375, "learning_rate": 2.13277488180789e-06, "loss": 0.0007984023541212082, "memory(GiB)": 38.05, "reward": 0.4487963318824768, "reward_std": 0.06742729246616364, "rewards/VisualizationJSONCombinedORM/mean": 0.4487963318824768, "rewards/VisualizationJSONCombinedORM/std": 0.08252225816249847, "step": 1753, "train_speed(iter/s)": 0.090406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 255.875, "completions/min_length": 211.0, "epoch": 1.4507857733664185, "grad_norm": 0.18280275166034698, "kl": 0.05487060546875, "learning_rate": 2.1268639679712814e-06, "loss": 0.0005473345518112183, "memory(GiB)": 38.05, "reward": 0.7315454483032227, "reward_std": 0.07196174561977386, "rewards/VisualizationJSONCombinedORM/mean": 0.7315454483032227, "rewards/VisualizationJSONCombinedORM/std": 0.0718836635351181, "step": 1754, "train_speed(iter/s)": 0.090361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 250.5, "completions/min_length": 206.0, "epoch": 1.4516129032258065, "grad_norm": 0.1750737875699997, "kl": 0.1376953125, "learning_rate": 2.1209590429107734e-06, "loss": 0.0013737678527832031, "memory(GiB)": 38.05, "reward": 0.524544358253479, "reward_std": 0.10929505527019501, "rewards/VisualizationJSONCombinedORM/mean": 0.524544358253479, "rewards/VisualizationJSONCombinedORM/std": 0.15864874422550201, "step": 1755, "train_speed(iter/s)": 0.090274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/mean_length": 229.5625, "completions/min_length": 195.0, "epoch": 1.4524400330851943, "grad_norm": 0.25891438126564026, "kl": 0.04266357421875, "learning_rate": 2.115060118934616e-06, "loss": 0.00042685866355895996, "memory(GiB)": 38.05, "reward": 0.38789159059524536, "reward_std": 0.07416893541812897, "rewards/VisualizationJSONCombinedORM/mean": 0.38789159059524536, "rewards/VisualizationJSONCombinedORM/std": 0.12515300512313843, "step": 1756, "train_speed(iter/s)": 0.090215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 248.25, "completions/min_length": 203.0, "epoch": 1.4532671629445824, "grad_norm": 0.17116035521030426, "kl": 0.1142578125, "learning_rate": 2.1091672083385456e-06, "loss": 0.0011435560882091522, "memory(GiB)": 38.05, "reward": 0.5189288258552551, "reward_std": 0.04882625490427017, "rewards/VisualizationJSONCombinedORM/mean": 0.5189288258552551, "rewards/VisualizationJSONCombinedORM/std": 0.23616594076156616, "step": 1757, "train_speed(iter/s)": 0.090145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 238.5625, "completions/min_length": 187.0, "epoch": 1.4540942928039702, "grad_norm": 0.1822146773338318, "kl": 0.046630859375, "learning_rate": 2.1032803234057725e-06, "loss": 0.00046669691801071167, "memory(GiB)": 38.05, "reward": 0.7359293103218079, "reward_std": 0.10603515803813934, "rewards/VisualizationJSONCombinedORM/mean": 0.7359293103218079, "rewards/VisualizationJSONCombinedORM/std": 0.1081879585981369, "step": 1758, "train_speed(iter/s)": 0.090093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 248.625, "completions/min_length": 206.0, "epoch": 1.4549214226633582, "grad_norm": 0.18289433419704437, "kl": 0.0726318359375, "learning_rate": 2.097399476406939e-06, "loss": 0.0007273387163877487, "memory(GiB)": 38.05, "reward": 0.4372941255569458, "reward_std": 0.07845334708690643, "rewards/VisualizationJSONCombinedORM/mean": 0.4372941255569458, "rewards/VisualizationJSONCombinedORM/std": 0.14404301345348358, "step": 1759, "train_speed(iter/s)": 0.090033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 248.4375, "completions/min_length": 205.0, "epoch": 1.455748552522746, "grad_norm": 0.18158270418643951, "kl": 0.05096435546875, "learning_rate": 2.0915246796001077e-06, "loss": 0.0005097463726997375, "memory(GiB)": 38.05, "reward": 0.6978643536567688, "reward_std": 0.09462956339120865, "rewards/VisualizationJSONCombinedORM/mean": 0.6978643536567688, "rewards/VisualizationJSONCombinedORM/std": 0.09436450898647308, "step": 1760, "train_speed(iter/s)": 0.089981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/mean_length": 247.625, "completions/min_length": 200.0, "epoch": 1.4565756823821339, "grad_norm": 0.2036944031715393, "kl": 0.122314453125, "learning_rate": 2.0856559452307305e-06, "loss": 0.001223146915435791, "memory(GiB)": 38.05, "reward": 0.5680193901062012, "reward_std": 0.12453365325927734, "rewards/VisualizationJSONCombinedORM/mean": 0.5680193901062012, "rewards/VisualizationJSONCombinedORM/std": 0.17075569927692413, "step": 1761, "train_speed(iter/s)": 0.089922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 257.5, "completions/min_length": 223.0, "epoch": 1.457402812241522, "grad_norm": 0.14036747813224792, "kl": 0.12127685546875, "learning_rate": 2.0797932855316183e-06, "loss": 0.0012129321694374084, "memory(GiB)": 38.05, "reward": 0.41842013597488403, "reward_std": 0.055108994245529175, "rewards/VisualizationJSONCombinedORM/mean": 0.41842013597488403, "rewards/VisualizationJSONCombinedORM/std": 0.07808361947536469, "step": 1762, "train_speed(iter/s)": 0.08986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 241.25, "completions/min_length": 195.0, "epoch": 1.45822994210091, "grad_norm": 0.15548738837242126, "kl": 0.08837890625, "learning_rate": 2.0739367127229197e-06, "loss": 0.0008842535316944122, "memory(GiB)": 38.05, "reward": 0.4798984229564667, "reward_std": 0.058761440217494965, "rewards/VisualizationJSONCombinedORM/mean": 0.4798984229564667, "rewards/VisualizationJSONCombinedORM/std": 0.11189748346805573, "step": 1763, "train_speed(iter/s)": 0.089812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 250.6875, "completions/min_length": 187.0, "epoch": 1.4590570719602978, "grad_norm": 0.17868778109550476, "kl": 0.076171875, "learning_rate": 2.0680862390121015e-06, "loss": 0.0007632225751876831, "memory(GiB)": 38.05, "reward": 0.48222222924232483, "reward_std": 0.05688336119055748, "rewards/VisualizationJSONCombinedORM/mean": 0.48222222924232483, "rewards/VisualizationJSONCombinedORM/std": 0.20602500438690186, "step": 1764, "train_speed(iter/s)": 0.089753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 257.1875, "completions/min_length": 192.0, "epoch": 1.4598842018196856, "grad_norm": 0.19515442848205566, "kl": 0.06060791015625, "learning_rate": 2.0622418765939156e-06, "loss": 0.0006066262722015381, "memory(GiB)": 38.05, "reward": 0.4742598831653595, "reward_std": 0.042930349707603455, "rewards/VisualizationJSONCombinedORM/mean": 0.4742598831653595, "rewards/VisualizationJSONCombinedORM/std": 0.31784358620643616, "step": 1765, "train_speed(iter/s)": 0.089689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 270.1875, "completions/min_length": 229.0, "epoch": 1.4607113316790736, "grad_norm": 0.3101840615272522, "kl": 0.172607421875, "learning_rate": 2.056403637650371e-06, "loss": 0.0017252042889595032, "memory(GiB)": 38.05, "reward": 0.4307708740234375, "reward_std": 0.05769815295934677, "rewards/VisualizationJSONCombinedORM/mean": 0.4307708740234375, "rewards/VisualizationJSONCombinedORM/std": 0.22518876194953918, "step": 1766, "train_speed(iter/s)": 0.089624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 236.4375, "completions/min_length": 202.0, "epoch": 1.4615384615384617, "grad_norm": 0.21895915269851685, "kl": 0.083984375, "learning_rate": 2.050571534350716e-06, "loss": 0.0008404180407524109, "memory(GiB)": 38.05, "reward": 0.46333563327789307, "reward_std": 0.08378999680280685, "rewards/VisualizationJSONCombinedORM/mean": 0.46333563327789307, "rewards/VisualizationJSONCombinedORM/std": 0.0942709892988205, "step": 1767, "train_speed(iter/s)": 0.089555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/mean_length": 233.9375, "completions/min_length": 192.0, "epoch": 1.4623655913978495, "grad_norm": 0.18576741218566895, "kl": 0.05963134765625, "learning_rate": 2.0447455788514105e-06, "loss": 0.0005972683429718018, "memory(GiB)": 38.05, "reward": 0.5255731344223022, "reward_std": 0.06165917590260506, "rewards/VisualizationJSONCombinedORM/mean": 0.5255731344223022, "rewards/VisualizationJSONCombinedORM/std": 0.3389005661010742, "step": 1768, "train_speed(iter/s)": 0.089505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 252.3125, "completions/min_length": 215.0, "epoch": 1.4631927212572373, "grad_norm": 0.24231983721256256, "kl": 0.07220458984375, "learning_rate": 2.038925783296101e-06, "loss": 0.0007232353091239929, "memory(GiB)": 38.05, "reward": 0.6100842952728271, "reward_std": 0.07945045828819275, "rewards/VisualizationJSONCombinedORM/mean": 0.6100842952728271, "rewards/VisualizationJSONCombinedORM/std": 0.08972759544849396, "step": 1769, "train_speed(iter/s)": 0.089441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 237.3125, "completions/min_length": 190.0, "epoch": 1.4640198511166254, "grad_norm": 0.18443939089775085, "kl": 0.05706787109375, "learning_rate": 2.0331121598155905e-06, "loss": 0.0005700178444385529, "memory(GiB)": 38.05, "reward": 0.7652111649513245, "reward_std": 0.06936325132846832, "rewards/VisualizationJSONCombinedORM/mean": 0.7652111649513245, "rewards/VisualizationJSONCombinedORM/std": 0.07616671919822693, "step": 1770, "train_speed(iter/s)": 0.089379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 263.8125, "completions/min_length": 208.0, "epoch": 1.4648469809760132, "grad_norm": 0.18258613348007202, "kl": 0.0777587890625, "learning_rate": 2.027304720527818e-06, "loss": 0.0007778294384479523, "memory(GiB)": 38.05, "reward": 0.48513591289520264, "reward_std": 0.06654499471187592, "rewards/VisualizationJSONCombinedORM/mean": 0.48513591289520264, "rewards/VisualizationJSONCombinedORM/std": 0.16351936757564545, "step": 1771, "train_speed(iter/s)": 0.089324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 257.625, "completions/min_length": 203.0, "epoch": 1.4656741108354012, "grad_norm": 0.18327206373214722, "kl": 0.104736328125, "learning_rate": 2.0215034775378336e-06, "loss": 0.0010487958788871765, "memory(GiB)": 38.05, "reward": 0.6166377663612366, "reward_std": 0.09687410295009613, "rewards/VisualizationJSONCombinedORM/mean": 0.6166377663612366, "rewards/VisualizationJSONCombinedORM/std": 0.17035290598869324, "step": 1772, "train_speed(iter/s)": 0.089268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 235.4375, "completions/min_length": 198.0, "epoch": 1.466501240694789, "grad_norm": 0.1844603419303894, "kl": 0.0902099609375, "learning_rate": 2.0157084429377717e-06, "loss": 0.0009022057056427002, "memory(GiB)": 38.05, "reward": 0.5354676246643066, "reward_std": 0.09577418863773346, "rewards/VisualizationJSONCombinedORM/mean": 0.5354676246643066, "rewards/VisualizationJSONCombinedORM/std": 0.1644797921180725, "step": 1773, "train_speed(iter/s)": 0.089196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 260.1875, "completions/min_length": 208.0, "epoch": 1.467328370554177, "grad_norm": 0.21229006350040436, "kl": 0.0914306640625, "learning_rate": 2.009919628806826e-06, "loss": 0.000913817435503006, "memory(GiB)": 38.05, "reward": 0.3273817300796509, "reward_std": 0.039614662528038025, "rewards/VisualizationJSONCombinedORM/mean": 0.3273817300796509, "rewards/VisualizationJSONCombinedORM/std": 0.04029469937086105, "step": 1774, "train_speed(iter/s)": 0.089148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/mean_length": 235.8125, "completions/min_length": 201.0, "epoch": 1.468155500413565, "grad_norm": 0.1882331520318985, "kl": 0.1492919921875, "learning_rate": 2.0041370472112236e-06, "loss": 0.0014877207577228546, "memory(GiB)": 38.05, "reward": 0.4680226743221283, "reward_std": 0.06107129901647568, "rewards/VisualizationJSONCombinedORM/mean": 0.4680226743221283, "rewards/VisualizationJSONCombinedORM/std": 0.21015426516532898, "step": 1775, "train_speed(iter/s)": 0.0891 }, { "epoch": 1.468155500413565, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 292.5, "eval_completions/mean_length": 252.53125, "eval_completions/min_length": 217.54166666666666, "eval_kl": 0.08954874674479167, "eval_loss": 0.0009001381695270538, "eval_reward": 0.5033976659178734, "eval_reward_std": 0.058628644716615476, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.5033976659178734, "eval_rewards/VisualizationJSONCombinedORM/std": 0.058628646889701486, "eval_runtime": 266.892, "eval_samples_per_second": 0.09, "eval_steps_per_second": 0.011, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 251.6875, "completions/min_length": 217.0, "epoch": 1.468982630272953, "grad_norm": 0.17254899442195892, "kl": 0.1024169921875, "learning_rate": 1.9983607102041974e-06, "loss": 0.0010231882333755493, "memory(GiB)": 38.05, "reward": 0.6913204193115234, "reward_std": 0.05423276871442795, "rewards/VisualizationJSONCombinedORM/mean": 0.6913204193115234, "rewards/VisualizationJSONCombinedORM/std": 0.06940874457359314, "step": 1776, "train_speed(iter/s)": 0.087885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 266.0625, "completions/min_length": 235.0, "epoch": 1.4698097601323408, "grad_norm": 0.17433054745197296, "kl": 0.0587158203125, "learning_rate": 1.992590629825969e-06, "loss": 0.0005877576768398285, "memory(GiB)": 38.05, "reward": 0.60861736536026, "reward_std": 0.058574430644512177, "rewards/VisualizationJSONCombinedORM/mean": 0.60861736536026, "rewards/VisualizationJSONCombinedORM/std": 0.10984983295202255, "step": 1777, "train_speed(iter/s)": 0.087841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 259.875, "completions/min_length": 198.0, "epoch": 1.4706368899917286, "grad_norm": 0.28317686915397644, "kl": 0.07568359375, "learning_rate": 1.9868268181037186e-06, "loss": 0.0007559321820735931, "memory(GiB)": 38.05, "reward": 0.3940379321575165, "reward_std": 0.07657173275947571, "rewards/VisualizationJSONCombinedORM/mean": 0.3940379321575165, "rewards/VisualizationJSONCombinedORM/std": 0.0784166157245636, "step": 1778, "train_speed(iter/s)": 0.08779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 261.4375, "completions/min_length": 225.0, "epoch": 1.4714640198511166, "grad_norm": 0.17796535789966583, "kl": 0.0584716796875, "learning_rate": 1.981069287051557e-06, "loss": 0.0005861781537532806, "memory(GiB)": 38.05, "reward": 0.5268898010253906, "reward_std": 0.056065402925014496, "rewards/VisualizationJSONCombinedORM/mean": 0.5268898010253906, "rewards/VisualizationJSONCombinedORM/std": 0.20226605236530304, "step": 1779, "train_speed(iter/s)": 0.08773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/mean_length": 241.8125, "completions/min_length": 219.0, "epoch": 1.4722911497105047, "grad_norm": 0.18537715077400208, "kl": 0.08453369140625, "learning_rate": 1.9753180486705013e-06, "loss": 0.0008446648716926575, "memory(GiB)": 38.05, "reward": 0.7295503616333008, "reward_std": 0.06632036715745926, "rewards/VisualizationJSONCombinedORM/mean": 0.7295503616333008, "rewards/VisualizationJSONCombinedORM/std": 0.07909129559993744, "step": 1780, "train_speed(iter/s)": 0.087675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 258.5625, "completions/min_length": 207.0, "epoch": 1.4731182795698925, "grad_norm": 0.14107926189899445, "kl": 0.0533447265625, "learning_rate": 1.9695731149484595e-06, "loss": 0.0005334876477718353, "memory(GiB)": 38.05, "reward": 0.748570442199707, "reward_std": 0.08398297429084778, "rewards/VisualizationJSONCombinedORM/mean": 0.748570442199707, "rewards/VisualizationJSONCombinedORM/std": 0.08117927610874176, "step": 1781, "train_speed(iter/s)": 0.087603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/mean_length": 252.875, "completions/min_length": 203.0, "epoch": 1.4739454094292803, "grad_norm": 0.22837892174720764, "kl": 0.0760498046875, "learning_rate": 1.963834497860192e-06, "loss": 0.0007615238428115845, "memory(GiB)": 38.05, "reward": 0.5843149423599243, "reward_std": 0.12073169648647308, "rewards/VisualizationJSONCombinedORM/mean": 0.5843149423599243, "rewards/VisualizationJSONCombinedORM/std": 0.1284470111131668, "step": 1782, "train_speed(iter/s)": 0.087554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 260.75, "completions/min_length": 204.0, "epoch": 1.4747725392886684, "grad_norm": 0.16311757266521454, "kl": 0.040191650390625, "learning_rate": 1.9581022093672995e-06, "loss": 0.00040203332901000977, "memory(GiB)": 38.05, "reward": 0.5700415968894958, "reward_std": 0.09031186252832413, "rewards/VisualizationJSONCombinedORM/mean": 0.5700415968894958, "rewards/VisualizationJSONCombinedORM/std": 0.0901811346411705, "step": 1783, "train_speed(iter/s)": 0.087503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 239.5625, "completions/min_length": 201.0, "epoch": 1.4755996691480562, "grad_norm": 0.21520410478115082, "kl": 0.04010009765625, "learning_rate": 1.95237626141818e-06, "loss": 0.0004004165530204773, "memory(GiB)": 38.05, "reward": 0.5467172265052795, "reward_std": 0.0858037918806076, "rewards/VisualizationJSONCombinedORM/mean": 0.5467172265052795, "rewards/VisualizationJSONCombinedORM/std": 0.2815353572368622, "step": 1784, "train_speed(iter/s)": 0.08745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 274.8125, "completions/min_length": 207.0, "epoch": 1.4764267990074442, "grad_norm": 0.21124662458896637, "kl": 0.106689453125, "learning_rate": 1.9466566659480257e-06, "loss": 0.0010671019554138184, "memory(GiB)": 38.05, "reward": 0.4789992570877075, "reward_std": 0.09106998145580292, "rewards/VisualizationJSONCombinedORM/mean": 0.4789992570877075, "rewards/VisualizationJSONCombinedORM/std": 0.17552046477794647, "step": 1785, "train_speed(iter/s)": 0.087367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 273.3125, "completions/min_length": 221.0, "epoch": 1.477253928866832, "grad_norm": 0.18331414461135864, "kl": 0.075927734375, "learning_rate": 1.9409434348787824e-06, "loss": 0.0007590055465698242, "memory(GiB)": 38.05, "reward": 0.6338942050933838, "reward_std": 0.06719320267438889, "rewards/VisualizationJSONCombinedORM/mean": 0.6338942050933838, "rewards/VisualizationJSONCombinedORM/std": 0.1958298683166504, "step": 1786, "train_speed(iter/s)": 0.087311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 262.5625, "completions/min_length": 219.0, "epoch": 1.47808105872622, "grad_norm": 0.23865678906440735, "kl": 0.04949951171875, "learning_rate": 1.9352365801191354e-06, "loss": 0.0004945285618305206, "memory(GiB)": 38.05, "reward": 0.44463050365448, "reward_std": 0.05839301645755768, "rewards/VisualizationJSONCombinedORM/mean": 0.44463050365448, "rewards/VisualizationJSONCombinedORM/std": 0.19145673513412476, "step": 1787, "train_speed(iter/s)": 0.087258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 251.8125, "completions/min_length": 223.0, "epoch": 1.478908188585608, "grad_norm": 0.18566134572029114, "kl": 0.0814208984375, "learning_rate": 1.9295361135644724e-06, "loss": 0.0008138604462146759, "memory(GiB)": 38.05, "reward": 0.6446406245231628, "reward_std": 0.07737906277179718, "rewards/VisualizationJSONCombinedORM/mean": 0.6446406245231628, "rewards/VisualizationJSONCombinedORM/std": 0.1443660855293274, "step": 1788, "train_speed(iter/s)": 0.08721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 257.875, "completions/min_length": 199.0, "epoch": 1.479735318444996, "grad_norm": 0.17460320889949799, "kl": 0.143310546875, "learning_rate": 1.9238420470968665e-06, "loss": 0.001437108963727951, "memory(GiB)": 38.05, "reward": 0.5154551267623901, "reward_std": 0.07364316284656525, "rewards/VisualizationJSONCombinedORM/mean": 0.5154551267623901, "rewards/VisualizationJSONCombinedORM/std": 0.25599488615989685, "step": 1789, "train_speed(iter/s)": 0.087143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 253.625, "completions/min_length": 183.0, "epoch": 1.4805624483043838, "grad_norm": 0.18267209827899933, "kl": 0.072021484375, "learning_rate": 1.9181543925850544e-06, "loss": 0.0007204953581094742, "memory(GiB)": 38.05, "reward": 0.23155304789543152, "reward_std": 0.023126985877752304, "rewards/VisualizationJSONCombinedORM/mean": 0.23155304789543152, "rewards/VisualizationJSONCombinedORM/std": 0.0259141493588686, "step": 1790, "train_speed(iter/s)": 0.087064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 248.3125, "completions/min_length": 199.0, "epoch": 1.4813895781637716, "grad_norm": 0.17873691022396088, "kl": 0.0543212890625, "learning_rate": 1.9124731618844056e-06, "loss": 0.0005435273051261902, "memory(GiB)": 38.05, "reward": 0.3176223635673523, "reward_std": 0.041751470416784286, "rewards/VisualizationJSONCombinedORM/mean": 0.3176223635673523, "rewards/VisualizationJSONCombinedORM/std": 0.04104606434702873, "step": 1791, "train_speed(iter/s)": 0.08699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 275.6875, "completions/min_length": 216.0, "epoch": 1.4822167080231596, "grad_norm": 0.19569309055805206, "kl": 0.06170654296875, "learning_rate": 1.9067983668369038e-06, "loss": 0.0006180517375469208, "memory(GiB)": 38.05, "reward": 0.5410326719284058, "reward_std": 0.0635543242096901, "rewards/VisualizationJSONCombinedORM/mean": 0.5410326719284058, "rewards/VisualizationJSONCombinedORM/std": 0.1598234921693802, "step": 1792, "train_speed(iter/s)": 0.086947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 240.5625, "completions/min_length": 189.0, "epoch": 1.4830438378825477, "grad_norm": 0.18172866106033325, "kl": 0.0535888671875, "learning_rate": 1.9011300192711063e-06, "loss": 0.0005360133945941925, "memory(GiB)": 38.05, "reward": 0.44645971059799194, "reward_std": 0.07266572117805481, "rewards/VisualizationJSONCombinedORM/mean": 0.44645971059799194, "rewards/VisualizationJSONCombinedORM/std": 0.08011291176080704, "step": 1793, "train_speed(iter/s)": 0.086912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 254.625, "completions/min_length": 219.0, "epoch": 1.4838709677419355, "grad_norm": 0.18985657393932343, "kl": 0.084228515625, "learning_rate": 1.8954681310021434e-06, "loss": 0.0008412301540374756, "memory(GiB)": 38.05, "reward": 0.6341286897659302, "reward_std": 0.11246734857559204, "rewards/VisualizationJSONCombinedORM/mean": 0.6341286897659302, "rewards/VisualizationJSONCombinedORM/std": 0.1253097951412201, "step": 1794, "train_speed(iter/s)": 0.086843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 276.0625, "completions/min_length": 212.0, "epoch": 1.4846980976013233, "grad_norm": 0.20380619168281555, "kl": 0.0855712890625, "learning_rate": 1.8898127138316775e-06, "loss": 0.0008555129170417786, "memory(GiB)": 38.05, "reward": 0.47359952330589294, "reward_std": 0.08154143393039703, "rewards/VisualizationJSONCombinedORM/mean": 0.47359952330589294, "rewards/VisualizationJSONCombinedORM/std": 0.08367961645126343, "step": 1795, "train_speed(iter/s)": 0.086773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 250.375, "completions/min_length": 204.0, "epoch": 1.4855252274607114, "grad_norm": 0.17253698408603668, "kl": 0.0355224609375, "learning_rate": 1.8841637795478835e-06, "loss": 0.0003550015389919281, "memory(GiB)": 38.05, "reward": 0.5184056162834167, "reward_std": 0.08099842071533203, "rewards/VisualizationJSONCombinedORM/mean": 0.5184056162834167, "rewards/VisualizationJSONCombinedORM/std": 0.08576960861682892, "step": 1796, "train_speed(iter/s)": 0.086716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 260.375, "completions/min_length": 210.0, "epoch": 1.4863523573200992, "grad_norm": 0.18739311397075653, "kl": 0.04656982421875, "learning_rate": 1.8785213399254209e-06, "loss": 0.0004664519801735878, "memory(GiB)": 38.05, "reward": 0.6470023989677429, "reward_std": 0.07378686964511871, "rewards/VisualizationJSONCombinedORM/mean": 0.6470023989677429, "rewards/VisualizationJSONCombinedORM/std": 0.07610158622264862, "step": 1797, "train_speed(iter/s)": 0.086667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 255.1875, "completions/min_length": 203.0, "epoch": 1.4871794871794872, "grad_norm": 0.19947515428066254, "kl": 0.06744384765625, "learning_rate": 1.872885406725412e-06, "loss": 0.0006743972189724445, "memory(GiB)": 38.05, "reward": 0.6473093032836914, "reward_std": 0.07627122104167938, "rewards/VisualizationJSONCombinedORM/mean": 0.6473093032836914, "rewards/VisualizationJSONCombinedORM/std": 0.07709619402885437, "step": 1798, "train_speed(iter/s)": 0.086606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 271.3125, "completions/min_length": 192.0, "epoch": 1.488006617038875, "grad_norm": 0.16585205495357513, "kl": 0.04400634765625, "learning_rate": 1.8672559916954192e-06, "loss": 0.0004397183656692505, "memory(GiB)": 38.05, "reward": 0.40904176235198975, "reward_std": 0.0502186194062233, "rewards/VisualizationJSONCombinedORM/mean": 0.40904176235198975, "rewards/VisualizationJSONCombinedORM/std": 0.05156046524643898, "step": 1799, "train_speed(iter/s)": 0.08655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 261.5625, "completions/min_length": 210.0, "epoch": 1.488833746898263, "grad_norm": 0.21590134501457214, "kl": 0.044189453125, "learning_rate": 1.8616331065694193e-06, "loss": 0.00044180452823638916, "memory(GiB)": 38.05, "reward": 0.7089932560920715, "reward_std": 0.1139322966337204, "rewards/VisualizationJSONCombinedORM/mean": 0.7089932560920715, "rewards/VisualizationJSONCombinedORM/std": 0.11687003821134567, "step": 1800, "train_speed(iter/s)": 0.086489 }, { "epoch": 1.488833746898263, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 297.25, "eval_completions/mean_length": 251.13020833333334, "eval_completions/min_length": 217.20833333333334, "eval_kl": 0.06524149576822917, "eval_loss": 0.0006561900372616947, "eval_reward": 0.4469567524890105, "eval_reward_std": 0.0617431936164697, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4469567524890105, "eval_rewards/VisualizationJSONCombinedORM/std": 0.061743195091063775, "eval_runtime": 270.214, "eval_samples_per_second": 0.089, "eval_steps_per_second": 0.011, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 281.4375, "completions/min_length": 235.0, "epoch": 1.489660876757651, "grad_norm": 0.19496630132198334, "kl": 0.05853271484375, "learning_rate": 1.856016763067775e-06, "loss": 0.0005849972367286682, "memory(GiB)": 38.05, "reward": 0.5678651928901672, "reward_std": 0.0768325924873352, "rewards/VisualizationJSONCombinedORM/mean": 0.5678651928901672, "rewards/VisualizationJSONCombinedORM/std": 0.15381266176700592, "step": 1801, "train_speed(iter/s)": 0.085324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 268.625, "completions/min_length": 230.0, "epoch": 1.490488006617039, "grad_norm": 0.18590128421783447, "kl": 0.1087646484375, "learning_rate": 1.8504069728972124e-06, "loss": 0.001086607575416565, "memory(GiB)": 38.05, "reward": 0.5242615938186646, "reward_std": 0.07830414175987244, "rewards/VisualizationJSONCombinedORM/mean": 0.5242615938186646, "rewards/VisualizationJSONCombinedORM/std": 0.10559765249490738, "step": 1802, "train_speed(iter/s)": 0.085284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 238.625, "completions/min_length": 181.0, "epoch": 1.4913151364764268, "grad_norm": 0.1636808216571808, "kl": 0.095947265625, "learning_rate": 1.844803747750803e-06, "loss": 0.0009595360606908798, "memory(GiB)": 38.05, "reward": 0.6611757278442383, "reward_std": 0.07793384045362473, "rewards/VisualizationJSONCombinedORM/mean": 0.6611757278442383, "rewards/VisualizationJSONCombinedORM/std": 0.07895103096961975, "step": 1803, "train_speed(iter/s)": 0.085221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 248.5625, "completions/min_length": 204.0, "epoch": 1.4921422663358146, "grad_norm": 0.28678786754608154, "kl": 0.2119140625, "learning_rate": 1.8392070993079326e-06, "loss": 0.0021172985434532166, "memory(GiB)": 38.05, "reward": 0.565540075302124, "reward_std": 0.10096503794193268, "rewards/VisualizationJSONCombinedORM/mean": 0.565540075302124, "rewards/VisualizationJSONCombinedORM/std": 0.10148178786039352, "step": 1804, "train_speed(iter/s)": 0.08518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 255.125, "completions/min_length": 212.0, "epoch": 1.4929693961952026, "grad_norm": 0.17315532267093658, "kl": 0.07366943359375, "learning_rate": 1.8336170392342738e-06, "loss": 0.0007370561361312866, "memory(GiB)": 38.05, "reward": 0.48961320519447327, "reward_std": 0.08615052700042725, "rewards/VisualizationJSONCombinedORM/mean": 0.48961320519447327, "rewards/VisualizationJSONCombinedORM/std": 0.23465584218502045, "step": 1805, "train_speed(iter/s)": 0.085142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 255.875, "completions/min_length": 193.0, "epoch": 1.4937965260545907, "grad_norm": 0.21100036799907684, "kl": 0.06396484375, "learning_rate": 1.8280335791817733e-06, "loss": 0.0006387631874531507, "memory(GiB)": 38.05, "reward": 0.38219165802001953, "reward_std": 0.04148568958044052, "rewards/VisualizationJSONCombinedORM/mean": 0.38219165802001953, "rewards/VisualizationJSONCombinedORM/std": 0.11927451193332672, "step": 1806, "train_speed(iter/s)": 0.085097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 281.0625, "completions/min_length": 234.0, "epoch": 1.4946236559139785, "grad_norm": 0.18960405886173248, "kl": 0.07403564453125, "learning_rate": 1.8224567307886142e-06, "loss": 0.0007382705807685852, "memory(GiB)": 38.05, "reward": 0.5095044374465942, "reward_std": 0.06616276502609253, "rewards/VisualizationJSONCombinedORM/mean": 0.5095044374465942, "rewards/VisualizationJSONCombinedORM/std": 0.18221895396709442, "step": 1807, "train_speed(iter/s)": 0.085066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 256.875, "completions/min_length": 213.0, "epoch": 1.4954507857733663, "grad_norm": 0.16648708283901215, "kl": 0.04058837890625, "learning_rate": 1.8168865056792029e-06, "loss": 0.0004058564081788063, "memory(GiB)": 38.05, "reward": 0.39468270540237427, "reward_std": 0.0573466494679451, "rewards/VisualizationJSONCombinedORM/mean": 0.39468270540237427, "rewards/VisualizationJSONCombinedORM/std": 0.12476719170808792, "step": 1808, "train_speed(iter/s)": 0.085009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 245.5, "completions/min_length": 191.0, "epoch": 1.4962779156327544, "grad_norm": 0.21997807919979095, "kl": 0.10400390625, "learning_rate": 1.8113229154641403e-06, "loss": 0.0010378733277320862, "memory(GiB)": 38.05, "reward": 0.430642694234848, "reward_std": 0.04566153883934021, "rewards/VisualizationJSONCombinedORM/mean": 0.430642694234848, "rewards/VisualizationJSONCombinedORM/std": 0.07237089425325394, "step": 1809, "train_speed(iter/s)": 0.084946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 248.1875, "completions/min_length": 203.0, "epoch": 1.4971050454921422, "grad_norm": 0.1662590652704239, "kl": 0.095947265625, "learning_rate": 1.8057659717401948e-06, "loss": 0.0009602438658475876, "memory(GiB)": 38.05, "reward": 0.477378785610199, "reward_std": 0.08275727182626724, "rewards/VisualizationJSONCombinedORM/mean": 0.477378785610199, "rewards/VisualizationJSONCombinedORM/std": 0.15591640770435333, "step": 1810, "train_speed(iter/s)": 0.084897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 271.625, "completions/min_length": 230.0, "epoch": 1.4979321753515302, "grad_norm": 0.1963823139667511, "kl": 0.05780029296875, "learning_rate": 1.8002156860902785e-06, "loss": 0.000577559694647789, "memory(GiB)": 38.05, "reward": 0.3872959315776825, "reward_std": 0.054417725652456284, "rewards/VisualizationJSONCombinedORM/mean": 0.3872959315776825, "rewards/VisualizationJSONCombinedORM/std": 0.10330716520547867, "step": 1811, "train_speed(iter/s)": 0.084853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 246.625, "completions/min_length": 193.0, "epoch": 1.498759305210918, "grad_norm": 0.23606377840042114, "kl": 0.0836181640625, "learning_rate": 1.7946720700834324e-06, "loss": 0.0008373763412237167, "memory(GiB)": 38.05, "reward": 0.46970781683921814, "reward_std": 0.07889935374259949, "rewards/VisualizationJSONCombinedORM/mean": 0.46970781683921814, "rewards/VisualizationJSONCombinedORM/std": 0.1390303522348404, "step": 1812, "train_speed(iter/s)": 0.08481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 254.5625, "completions/min_length": 215.0, "epoch": 1.499586435070306, "grad_norm": 0.1552978754043579, "kl": 0.05474853515625, "learning_rate": 1.7891351352747915e-06, "loss": 0.000546775758266449, "memory(GiB)": 38.05, "reward": 0.3976951539516449, "reward_std": 0.03713616728782654, "rewards/VisualizationJSONCombinedORM/mean": 0.3976951539516449, "rewards/VisualizationJSONCombinedORM/std": 0.07727210968732834, "step": 1813, "train_speed(iter/s)": 0.08477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 249.3125, "completions/min_length": 214.0, "epoch": 1.500413564929694, "grad_norm": 0.1972963958978653, "kl": 0.060791015625, "learning_rate": 1.7836048932055643e-06, "loss": 0.0006079003214836121, "memory(GiB)": 38.05, "reward": 0.6468549966812134, "reward_std": 0.09739820659160614, "rewards/VisualizationJSONCombinedORM/mean": 0.6468549966812134, "rewards/VisualizationJSONCombinedORM/std": 0.1726033091545105, "step": 1814, "train_speed(iter/s)": 0.084716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 255.0, "completions/min_length": 217.0, "epoch": 1.501240694789082, "grad_norm": 0.2020108699798584, "kl": 0.04931640625, "learning_rate": 1.7780813554030068e-06, "loss": 0.0004931464791297913, "memory(GiB)": 38.05, "reward": 0.36391258239746094, "reward_std": 0.04164716228842735, "rewards/VisualizationJSONCombinedORM/mean": 0.36391258239746094, "rewards/VisualizationJSONCombinedORM/std": 0.1307062953710556, "step": 1815, "train_speed(iter/s)": 0.084679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 270.5, "completions/min_length": 226.0, "epoch": 1.5020678246484698, "grad_norm": 0.18174397945404053, "kl": 0.1016845703125, "learning_rate": 1.7725645333804054e-06, "loss": 0.001017604023218155, "memory(GiB)": 38.05, "reward": 0.29178130626678467, "reward_std": 0.04453621804714203, "rewards/VisualizationJSONCombinedORM/mean": 0.29178130626678467, "rewards/VisualizationJSONCombinedORM/std": 0.06798389554023743, "step": 1816, "train_speed(iter/s)": 0.084625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 254.0625, "completions/min_length": 221.0, "epoch": 1.5028949545078576, "grad_norm": 0.15879476070404053, "kl": 0.036407470703125, "learning_rate": 1.7670544386370464e-06, "loss": 0.0003634132444858551, "memory(GiB)": 38.05, "reward": 0.6401207447052002, "reward_std": 0.06123758852481842, "rewards/VisualizationJSONCombinedORM/mean": 0.6401207447052002, "rewards/VisualizationJSONCombinedORM/std": 0.11851874738931656, "step": 1817, "train_speed(iter/s)": 0.084574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 264.75, "completions/min_length": 227.0, "epoch": 1.5037220843672456, "grad_norm": 0.19537222385406494, "kl": 0.064208984375, "learning_rate": 1.7615510826581906e-06, "loss": 0.0006424710154533386, "memory(GiB)": 38.05, "reward": 0.46997934579849243, "reward_std": 0.09867362678050995, "rewards/VisualizationJSONCombinedORM/mean": 0.46997934579849243, "rewards/VisualizationJSONCombinedORM/std": 0.10203411430120468, "step": 1818, "train_speed(iter/s)": 0.084508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 260.1875, "completions/min_length": 211.0, "epoch": 1.5045492142266337, "grad_norm": 0.20839352905750275, "kl": 0.033111572265625, "learning_rate": 1.7560544769150578e-06, "loss": 0.0003307461738586426, "memory(GiB)": 38.05, "reward": 0.5361382365226746, "reward_std": 0.0700320228934288, "rewards/VisualizationJSONCombinedORM/mean": 0.5361382365226746, "rewards/VisualizationJSONCombinedORM/std": 0.07192350924015045, "step": 1819, "train_speed(iter/s)": 0.084464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 229.625, "completions/min_length": 199.0, "epoch": 1.5053763440860215, "grad_norm": 0.16748763620853424, "kl": 0.05126953125, "learning_rate": 1.7505646328647913e-06, "loss": 0.00051155686378479, "memory(GiB)": 38.05, "reward": 0.5481551885604858, "reward_std": 0.059738028794527054, "rewards/VisualizationJSONCombinedORM/mean": 0.5481551885604858, "rewards/VisualizationJSONCombinedORM/std": 0.29100117087364197, "step": 1820, "train_speed(iter/s)": 0.084415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 252.5, "completions/min_length": 204.0, "epoch": 1.5062034739454093, "grad_norm": 0.17989186942577362, "kl": 0.08056640625, "learning_rate": 1.7450815619504474e-06, "loss": 0.0008061155676841736, "memory(GiB)": 38.05, "reward": 0.34232962131500244, "reward_std": 0.049690283834934235, "rewards/VisualizationJSONCombinedORM/mean": 0.34232962131500244, "rewards/VisualizationJSONCombinedORM/std": 0.06360454857349396, "step": 1821, "train_speed(iter/s)": 0.084364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 258.6875, "completions/min_length": 237.0, "epoch": 1.5070306038047974, "grad_norm": 0.18557189404964447, "kl": 0.06951904296875, "learning_rate": 1.7396052756009574e-06, "loss": 0.0006961151957511902, "memory(GiB)": 38.05, "reward": 0.4908212721347809, "reward_std": 0.057382021099328995, "rewards/VisualizationJSONCombinedORM/mean": 0.4908212721347809, "rewards/VisualizationJSONCombinedORM/std": 0.2677179276943207, "step": 1822, "train_speed(iter/s)": 0.084328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 293.9375, "completions/min_length": 209.0, "epoch": 1.5078577336641854, "grad_norm": 0.17211581766605377, "kl": 0.0670166015625, "learning_rate": 1.7341357852311175e-06, "loss": 0.0006708726286888123, "memory(GiB)": 38.05, "reward": 0.5069414377212524, "reward_std": 0.06087270379066467, "rewards/VisualizationJSONCombinedORM/mean": 0.5069414377212524, "rewards/VisualizationJSONCombinedORM/std": 0.07091408222913742, "step": 1823, "train_speed(iter/s)": 0.084292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 242.125, "completions/min_length": 185.0, "epoch": 1.5086848635235732, "grad_norm": 0.14980581402778625, "kl": 0.0443115234375, "learning_rate": 1.7286731022415515e-06, "loss": 0.00044314656406641006, "memory(GiB)": 38.05, "reward": 0.6373772025108337, "reward_std": 0.05655146390199661, "rewards/VisualizationJSONCombinedORM/mean": 0.6373772025108337, "rewards/VisualizationJSONCombinedORM/std": 0.14347916841506958, "step": 1824, "train_speed(iter/s)": 0.084247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 244.3125, "completions/min_length": 192.0, "epoch": 1.509511993382961, "grad_norm": 0.2165466547012329, "kl": 0.1123046875, "learning_rate": 1.7232172380186995e-06, "loss": 0.001119408756494522, "memory(GiB)": 38.05, "reward": 0.5652698278427124, "reward_std": 0.09290362149477005, "rewards/VisualizationJSONCombinedORM/mean": 0.5652698278427124, "rewards/VisualizationJSONCombinedORM/std": 0.14138221740722656, "step": 1825, "train_speed(iter/s)": 0.084202 }, { "epoch": 1.509511993382961, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 303.625, "eval_completions/mean_length": 257.796875, "eval_completions/min_length": 220.91666666666666, "eval_kl": 0.060923258463541664, "eval_loss": 0.0006108556990511715, "eval_reward": 0.45954625060160953, "eval_reward_std": 0.0673544827538232, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45954625060160953, "eval_rewards/VisualizationJSONCombinedORM/std": 0.067354487022385, "eval_runtime": 274.3927, "eval_samples_per_second": 0.087, "eval_steps_per_second": 0.011, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 246.0, "completions/min_length": 218.0, "epoch": 1.510339123242349, "grad_norm": 0.22277961671352386, "kl": 0.05657958984375, "learning_rate": 1.7177682039347875e-06, "loss": 0.0005653649568557739, "memory(GiB)": 38.05, "reward": 0.4185066819190979, "reward_std": 0.053115442395210266, "rewards/VisualizationJSONCombinedORM/mean": 0.4185066819190979, "rewards/VisualizationJSONCombinedORM/std": 0.13171280920505524, "step": 1826, "train_speed(iter/s)": 0.083116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 262.75, "completions/min_length": 223.0, "epoch": 1.5111662531017371, "grad_norm": 0.24815472960472107, "kl": 0.100830078125, "learning_rate": 1.7123260113478014e-06, "loss": 0.001007363200187683, "memory(GiB)": 38.05, "reward": 0.32799816131591797, "reward_std": 0.031206533312797546, "rewards/VisualizationJSONCombinedORM/mean": 0.32799816131591797, "rewards/VisualizationJSONCombinedORM/std": 0.0386769063770771, "step": 1827, "train_speed(iter/s)": 0.083069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 265.5, "completions/min_length": 206.0, "epoch": 1.511993382961125, "grad_norm": 0.1865357756614685, "kl": 0.04833984375, "learning_rate": 1.706890671601471e-06, "loss": 0.0004837680608034134, "memory(GiB)": 38.05, "reward": 0.5764502882957458, "reward_std": 0.08504975587129593, "rewards/VisualizationJSONCombinedORM/mean": 0.5764502882957458, "rewards/VisualizationJSONCombinedORM/std": 0.09046825021505356, "step": 1828, "train_speed(iter/s)": 0.083025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 252.125, "completions/min_length": 188.0, "epoch": 1.5128205128205128, "grad_norm": 0.20291084051132202, "kl": 0.081298828125, "learning_rate": 1.7014621960252376e-06, "loss": 0.0008123070001602173, "memory(GiB)": 38.05, "reward": 0.5808672904968262, "reward_std": 0.09682810306549072, "rewards/VisualizationJSONCombinedORM/mean": 0.5808672904968262, "rewards/VisualizationJSONCombinedORM/std": 0.12829755246639252, "step": 1829, "train_speed(iter/s)": 0.08298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/mean_length": 248.0625, "completions/min_length": 203.0, "epoch": 1.5136476426799006, "grad_norm": 0.19462841749191284, "kl": 0.08349609375, "learning_rate": 1.6960405959342402e-06, "loss": 0.000834805890917778, "memory(GiB)": 38.05, "reward": 0.5495286583900452, "reward_std": 0.06311127543449402, "rewards/VisualizationJSONCombinedORM/mean": 0.5495286583900452, "rewards/VisualizationJSONCombinedORM/std": 0.08606830984354019, "step": 1830, "train_speed(iter/s)": 0.082934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 268.9375, "completions/min_length": 214.0, "epoch": 1.5144747725392886, "grad_norm": 0.22460205852985382, "kl": 0.048583984375, "learning_rate": 1.6906258826292799e-06, "loss": 0.0004867464303970337, "memory(GiB)": 38.05, "reward": 0.4497947692871094, "reward_std": 0.06589536368846893, "rewards/VisualizationJSONCombinedORM/mean": 0.4497947692871094, "rewards/VisualizationJSONCombinedORM/std": 0.10506498068571091, "step": 1831, "train_speed(iter/s)": 0.082901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 261.9375, "completions/min_length": 217.0, "epoch": 1.5153019023986767, "grad_norm": 0.16579769551753998, "kl": 0.1072998046875, "learning_rate": 1.6852180673968093e-06, "loss": 0.001074615865945816, "memory(GiB)": 38.05, "reward": 0.3818854093551636, "reward_std": 0.06863725185394287, "rewards/VisualizationJSONCombinedORM/mean": 0.3818854093551636, "rewards/VisualizationJSONCombinedORM/std": 0.09504609555006027, "step": 1832, "train_speed(iter/s)": 0.082839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 259.875, "completions/min_length": 212.0, "epoch": 1.5161290322580645, "grad_norm": 0.20105446875095367, "kl": 0.0863037109375, "learning_rate": 1.6798171615088977e-06, "loss": 0.0008622929453849792, "memory(GiB)": 38.05, "reward": 0.49666208028793335, "reward_std": 0.07396453619003296, "rewards/VisualizationJSONCombinedORM/mean": 0.49666208028793335, "rewards/VisualizationJSONCombinedORM/std": 0.11937258392572403, "step": 1833, "train_speed(iter/s)": 0.082787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 287.25, "completions/min_length": 228.0, "epoch": 1.5169561621174523, "grad_norm": 0.16544471681118011, "kl": 0.0880126953125, "learning_rate": 1.6744231762232178e-06, "loss": 0.0008795186877250671, "memory(GiB)": 38.05, "reward": 0.6544798612594604, "reward_std": 0.09731448441743851, "rewards/VisualizationJSONCombinedORM/mean": 0.6544798612594604, "rewards/VisualizationJSONCombinedORM/std": 0.09509742259979248, "step": 1834, "train_speed(iter/s)": 0.082729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 264.6875, "completions/min_length": 200.0, "epoch": 1.5177832919768404, "grad_norm": 0.21365417540073395, "kl": 0.039306640625, "learning_rate": 1.66903612278301e-06, "loss": 0.00039261579513549805, "memory(GiB)": 38.05, "reward": 0.662635862827301, "reward_std": 0.09040866047143936, "rewards/VisualizationJSONCombinedORM/mean": 0.662635862827301, "rewards/VisualizationJSONCombinedORM/std": 0.12685436010360718, "step": 1835, "train_speed(iter/s)": 0.082687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 255.4375, "completions/min_length": 206.0, "epoch": 1.5186104218362284, "grad_norm": 0.1990850269794464, "kl": 0.08013916015625, "learning_rate": 1.6636560124170713e-06, "loss": 0.0008013583719730377, "memory(GiB)": 38.05, "reward": 0.48353511095046997, "reward_std": 0.08541665226221085, "rewards/VisualizationJSONCombinedORM/mean": 0.48353511095046997, "rewards/VisualizationJSONCombinedORM/std": 0.13190260529518127, "step": 1836, "train_speed(iter/s)": 0.082634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 254.1875, "completions/min_length": 194.0, "epoch": 1.5194375516956162, "grad_norm": 0.199835404753685, "kl": 0.058837890625, "learning_rate": 1.6582828563397268e-06, "loss": 0.0005883201956748962, "memory(GiB)": 38.05, "reward": 0.5749577283859253, "reward_std": 0.08274005353450775, "rewards/VisualizationJSONCombinedORM/mean": 0.5749577283859253, "rewards/VisualizationJSONCombinedORM/std": 0.14163056015968323, "step": 1837, "train_speed(iter/s)": 0.082582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 271.375, "completions/min_length": 238.0, "epoch": 1.520264681555004, "grad_norm": 0.23633259534835815, "kl": 0.10723876953125, "learning_rate": 1.6529166657508033e-06, "loss": 0.0010737739503383636, "memory(GiB)": 38.05, "reward": 0.7113588452339172, "reward_std": 0.10650627315044403, "rewards/VisualizationJSONCombinedORM/mean": 0.7113588452339172, "rewards/VisualizationJSONCombinedORM/std": 0.11752217262983322, "step": 1838, "train_speed(iter/s)": 0.082532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 272.3125, "completions/min_length": 219.0, "epoch": 1.521091811414392, "grad_norm": 0.23780275881290436, "kl": 0.0703125, "learning_rate": 1.6475574518356074e-06, "loss": 0.0007032640278339386, "memory(GiB)": 38.05, "reward": 0.556530773639679, "reward_std": 0.09070984274148941, "rewards/VisualizationJSONCombinedORM/mean": 0.556530773639679, "rewards/VisualizationJSONCombinedORM/std": 0.1334957778453827, "step": 1839, "train_speed(iter/s)": 0.082482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 256.9375, "completions/min_length": 192.0, "epoch": 1.5219189412737801, "grad_norm": 0.1948973834514618, "kl": 0.05584716796875, "learning_rate": 1.642205225764908e-06, "loss": 0.0005577579140663147, "memory(GiB)": 38.05, "reward": 0.6441951394081116, "reward_std": 0.08629225194454193, "rewards/VisualizationJSONCombinedORM/mean": 0.6441951394081116, "rewards/VisualizationJSONCombinedORM/std": 0.14238518476486206, "step": 1840, "train_speed(iter/s)": 0.082421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 261.0625, "completions/min_length": 210.0, "epoch": 1.522746071133168, "grad_norm": 0.20428048074245453, "kl": 0.069580078125, "learning_rate": 1.6368599986949068e-06, "loss": 0.0006954818964004517, "memory(GiB)": 38.05, "reward": 0.5229085683822632, "reward_std": 0.062433838844299316, "rewards/VisualizationJSONCombinedORM/mean": 0.5229085683822632, "rewards/VisualizationJSONCombinedORM/std": 0.1743958741426468, "step": 1841, "train_speed(iter/s)": 0.08236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 267.75, "completions/min_length": 230.0, "epoch": 1.5235732009925558, "grad_norm": 0.13524730503559113, "kl": 0.086181640625, "learning_rate": 1.6315217817672142e-06, "loss": 0.0008607320487499237, "memory(GiB)": 38.05, "reward": 0.6196099519729614, "reward_std": 0.08288457244634628, "rewards/VisualizationJSONCombinedORM/mean": 0.6196099519729614, "rewards/VisualizationJSONCombinedORM/std": 0.09320051223039627, "step": 1842, "train_speed(iter/s)": 0.082316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 268.6875, "completions/min_length": 190.0, "epoch": 1.5244003308519436, "grad_norm": 0.15530157089233398, "kl": 0.04296875, "learning_rate": 1.6261905861088329e-06, "loss": 0.0004300549626350403, "memory(GiB)": 38.05, "reward": 0.7256659269332886, "reward_std": 0.07193306088447571, "rewards/VisualizationJSONCombinedORM/mean": 0.7256659269332886, "rewards/VisualizationJSONCombinedORM/std": 0.07285196334123611, "step": 1843, "train_speed(iter/s)": 0.082275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 263.8125, "completions/min_length": 211.0, "epoch": 1.5252274607113316, "grad_norm": 0.18064729869365692, "kl": 0.067138671875, "learning_rate": 1.6208664228321254e-06, "loss": 0.0006709247827529907, "memory(GiB)": 38.05, "reward": 0.685932993888855, "reward_std": 0.09495396912097931, "rewards/VisualizationJSONCombinedORM/mean": 0.685932993888855, "rewards/VisualizationJSONCombinedORM/std": 0.12225493788719177, "step": 1844, "train_speed(iter/s)": 0.082212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 274.0, "completions/min_length": 222.0, "epoch": 1.5260545905707197, "grad_norm": 0.17099086940288544, "kl": 0.07958984375, "learning_rate": 1.6155493030348019e-06, "loss": 0.000794967170804739, "memory(GiB)": 38.05, "reward": 0.6516436338424683, "reward_std": 0.08398859202861786, "rewards/VisualizationJSONCombinedORM/mean": 0.6516436338424683, "rewards/VisualizationJSONCombinedORM/std": 0.11140691488981247, "step": 1845, "train_speed(iter/s)": 0.082152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 264.625, "completions/min_length": 207.0, "epoch": 1.5268817204301075, "grad_norm": 0.21548369526863098, "kl": 0.0599365234375, "learning_rate": 1.610239237799885e-06, "loss": 0.0005991198122501373, "memory(GiB)": 38.05, "reward": 0.4299008250236511, "reward_std": 0.09360173344612122, "rewards/VisualizationJSONCombinedORM/mean": 0.4299008250236511, "rewards/VisualizationJSONCombinedORM/std": 0.12452135235071182, "step": 1846, "train_speed(iter/s)": 0.08209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 254.6875, "completions/min_length": 216.0, "epoch": 1.5277088502894953, "grad_norm": 0.17281395196914673, "kl": 0.07061767578125, "learning_rate": 1.6049362381956979e-06, "loss": 0.0007079951465129852, "memory(GiB)": 38.05, "reward": 0.6298339366912842, "reward_std": 0.08057788014411926, "rewards/VisualizationJSONCombinedORM/mean": 0.6298339366912842, "rewards/VisualizationJSONCombinedORM/std": 0.10531787574291229, "step": 1847, "train_speed(iter/s)": 0.082052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 255.875, "completions/min_length": 211.0, "epoch": 1.5285359801488834, "grad_norm": 0.1921868920326233, "kl": 0.1038818359375, "learning_rate": 1.5996403152758315e-06, "loss": 0.0010377578437328339, "memory(GiB)": 38.05, "reward": 0.33474281430244446, "reward_std": 0.07840888202190399, "rewards/VisualizationJSONCombinedORM/mean": 0.33474281430244446, "rewards/VisualizationJSONCombinedORM/std": 0.1094915121793747, "step": 1848, "train_speed(iter/s)": 0.081987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 282.375, "completions/min_length": 237.0, "epoch": 1.5293631100082714, "grad_norm": 0.14642465114593506, "kl": 0.03704833984375, "learning_rate": 1.5943514800791303e-06, "loss": 0.0003710072487592697, "memory(GiB)": 38.05, "reward": 0.669063925743103, "reward_std": 0.07623867690563202, "rewards/VisualizationJSONCombinedORM/mean": 0.669063925743103, "rewards/VisualizationJSONCombinedORM/std": 0.09016788005828857, "step": 1849, "train_speed(iter/s)": 0.081926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 246.5, "completions/min_length": 197.0, "epoch": 1.5301902398676592, "grad_norm": 0.1741955429315567, "kl": 0.114501953125, "learning_rate": 1.5890697436296648e-06, "loss": 0.0011427775025367737, "memory(GiB)": 38.05, "reward": 0.5832844972610474, "reward_std": 0.09349033236503601, "rewards/VisualizationJSONCombinedORM/mean": 0.5832844972610474, "rewards/VisualizationJSONCombinedORM/std": 0.16124486923217773, "step": 1850, "train_speed(iter/s)": 0.081889 }, { "epoch": 1.5301902398676592, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 309.875, "eval_completions/mean_length": 266.90625, "eval_completions/min_length": 228.75, "eval_kl": 0.07292683919270833, "eval_loss": 0.0007322852616198361, "eval_reward": 0.4985649560888608, "eval_reward_std": 0.06543114692127953, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4985649560888608, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06543114866750936, "eval_runtime": 277.9632, "eval_samples_per_second": 0.086, "eval_steps_per_second": 0.011, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 259.25, "completions/min_length": 215.0, "epoch": 1.531017369727047, "grad_norm": 0.1944752186536789, "kl": 0.090087890625, "learning_rate": 1.583795116936705e-06, "loss": 0.0009010620415210724, "memory(GiB)": 38.05, "reward": 0.5138668417930603, "reward_std": 0.06901176273822784, "rewards/VisualizationJSONCombinedORM/mean": 0.5138668417930603, "rewards/VisualizationJSONCombinedORM/std": 0.2096090465784073, "step": 1851, "train_speed(iter/s)": 0.080838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 277.5625, "completions/min_length": 211.0, "epoch": 1.531844499586435, "grad_norm": 0.2393481433391571, "kl": 0.1126708984375, "learning_rate": 1.5785276109947028e-06, "loss": 0.0011261068284511566, "memory(GiB)": 38.05, "reward": 0.31312423944473267, "reward_std": 0.036626242101192474, "rewards/VisualizationJSONCombinedORM/mean": 0.31312423944473267, "rewards/VisualizationJSONCombinedORM/std": 0.03663581609725952, "step": 1852, "train_speed(iter/s)": 0.080805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 260.875, "completions/min_length": 231.0, "epoch": 1.5326716294458231, "grad_norm": 0.15291525423526764, "kl": 0.09344482421875, "learning_rate": 1.5732672367832696e-06, "loss": 0.0009333118796348572, "memory(GiB)": 38.05, "reward": 0.575843870639801, "reward_std": 0.05373229831457138, "rewards/VisualizationJSONCombinedORM/mean": 0.575843870639801, "rewards/VisualizationJSONCombinedORM/std": 0.05903144180774689, "step": 1853, "train_speed(iter/s)": 0.080768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 266.4375, "completions/min_length": 193.0, "epoch": 1.533498759305211, "grad_norm": 0.1727239340543747, "kl": 0.0382080078125, "learning_rate": 1.5680140052671516e-06, "loss": 0.00038191676139831543, "memory(GiB)": 38.05, "reward": 0.7157468795776367, "reward_std": 0.09714514017105103, "rewards/VisualizationJSONCombinedORM/mean": 0.7157468795776367, "rewards/VisualizationJSONCombinedORM/std": 0.09982427954673767, "step": 1854, "train_speed(iter/s)": 0.080734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/mean_length": 237.875, "completions/min_length": 193.0, "epoch": 1.5343258891645988, "grad_norm": 0.1836106777191162, "kl": 0.0849609375, "learning_rate": 1.5627679273962042e-06, "loss": 0.0008502788841724396, "memory(GiB)": 38.05, "reward": 0.6937262415885925, "reward_std": 0.10096714645624161, "rewards/VisualizationJSONCombinedORM/mean": 0.6937262415885925, "rewards/VisualizationJSONCombinedORM/std": 0.10099904984235764, "step": 1855, "train_speed(iter/s)": 0.080679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 270.75, "completions/min_length": 215.0, "epoch": 1.5351530190239868, "grad_norm": 0.15313246846199036, "kl": 0.0604248046875, "learning_rate": 1.5575290141053712e-06, "loss": 0.0006063133478164673, "memory(GiB)": 38.05, "reward": 0.583737850189209, "reward_std": 0.06309860199689865, "rewards/VisualizationJSONCombinedORM/mean": 0.583737850189209, "rewards/VisualizationJSONCombinedORM/std": 0.14625117182731628, "step": 1856, "train_speed(iter/s)": 0.080633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 243.875, "completions/min_length": 208.0, "epoch": 1.5359801488833746, "grad_norm": 0.24152638018131256, "kl": 0.1033935546875, "learning_rate": 1.5522972763146653e-06, "loss": 0.001032104715704918, "memory(GiB)": 38.05, "reward": 0.3603491485118866, "reward_std": 0.06813902407884598, "rewards/VisualizationJSONCombinedORM/mean": 0.3603491485118866, "rewards/VisualizationJSONCombinedORM/std": 0.10924787819385529, "step": 1857, "train_speed(iter/s)": 0.080598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 280.5625, "completions/min_length": 239.0, "epoch": 1.5368072787427627, "grad_norm": 0.22231020033359528, "kl": 0.06658935546875, "learning_rate": 1.5470727249291423e-06, "loss": 0.0006666034460067749, "memory(GiB)": 38.05, "reward": 0.39241915941238403, "reward_std": 0.05722697824239731, "rewards/VisualizationJSONCombinedORM/mean": 0.39241915941238403, "rewards/VisualizationJSONCombinedORM/std": 0.07236488163471222, "step": 1858, "train_speed(iter/s)": 0.080549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 250.5, "completions/min_length": 209.0, "epoch": 1.5376344086021505, "grad_norm": 0.18660683929920197, "kl": 0.1085205078125, "learning_rate": 1.5418553708388785e-06, "loss": 0.001084011048078537, "memory(GiB)": 38.05, "reward": 0.5409086346626282, "reward_std": 0.12759724259376526, "rewards/VisualizationJSONCombinedORM/mean": 0.5409086346626282, "rewards/VisualizationJSONCombinedORM/std": 0.17994172871112823, "step": 1859, "train_speed(iter/s)": 0.080521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 252.3125, "completions/min_length": 216.0, "epoch": 1.5384615384615383, "grad_norm": 0.2603488564491272, "kl": 0.0838623046875, "learning_rate": 1.5366452249189462e-06, "loss": 0.0008396655321121216, "memory(GiB)": 38.05, "reward": 0.7348495721817017, "reward_std": 0.1521550714969635, "rewards/VisualizationJSONCombinedORM/mean": 0.7348495721817017, "rewards/VisualizationJSONCombinedORM/std": 0.15157847106456757, "step": 1860, "train_speed(iter/s)": 0.080477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 261.375, "completions/min_length": 228.0, "epoch": 1.5392886683209264, "grad_norm": 0.1836412400007248, "kl": 0.05029296875, "learning_rate": 1.531442298029392e-06, "loss": 0.0005026170983910561, "memory(GiB)": 38.05, "reward": 0.48808661103248596, "reward_std": 0.07180089503526688, "rewards/VisualizationJSONCombinedORM/mean": 0.48808661103248596, "rewards/VisualizationJSONCombinedORM/std": 0.07514282315969467, "step": 1861, "train_speed(iter/s)": 0.080429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 274.0, "completions/min_length": 206.0, "epoch": 1.5401157981803144, "grad_norm": 0.1967005580663681, "kl": 0.04052734375, "learning_rate": 1.52624660101522e-06, "loss": 0.00040594860911369324, "memory(GiB)": 38.05, "reward": 0.63880455493927, "reward_std": 0.058020807802677155, "rewards/VisualizationJSONCombinedORM/mean": 0.63880455493927, "rewards/VisualizationJSONCombinedORM/std": 0.269944965839386, "step": 1862, "train_speed(iter/s)": 0.080386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 225.875, "completions/min_length": 193.0, "epoch": 1.5409429280397022, "grad_norm": 0.15120412409305573, "kl": 0.1275634765625, "learning_rate": 1.521058144706362e-06, "loss": 0.0012777000665664673, "memory(GiB)": 38.05, "reward": 0.506284236907959, "reward_std": 0.05089334025979042, "rewards/VisualizationJSONCombinedORM/mean": 0.506284236907959, "rewards/VisualizationJSONCombinedORM/std": 0.25491011142730713, "step": 1863, "train_speed(iter/s)": 0.080353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 266.3125, "completions/min_length": 208.0, "epoch": 1.54177005789909, "grad_norm": 0.15558670461177826, "kl": 0.039520263671875, "learning_rate": 1.5158769399176559e-06, "loss": 0.00039503350853919983, "memory(GiB)": 38.05, "reward": 0.5438332557678223, "reward_std": 0.07937383651733398, "rewards/VisualizationJSONCombinedORM/mean": 0.5438332557678223, "rewards/VisualizationJSONCombinedORM/std": 0.09888153523206711, "step": 1864, "train_speed(iter/s)": 0.080323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 277.0, "completions/min_length": 228.0, "epoch": 1.542597187758478, "grad_norm": 0.16011221706867218, "kl": 0.03155517578125, "learning_rate": 1.5107029974488234e-06, "loss": 0.00031568482518196106, "memory(GiB)": 38.05, "reward": 0.7210587859153748, "reward_std": 0.09089851379394531, "rewards/VisualizationJSONCombinedORM/mean": 0.7210587859153748, "rewards/VisualizationJSONCombinedORM/std": 0.14195369184017181, "step": 1865, "train_speed(iter/s)": 0.080282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 255.3125, "completions/min_length": 220.0, "epoch": 1.5434243176178661, "grad_norm": 0.22630977630615234, "kl": 0.0623779296875, "learning_rate": 1.505536328084453e-06, "loss": 0.0006238371133804321, "memory(GiB)": 38.05, "reward": 0.405593603849411, "reward_std": 0.06354529410600662, "rewards/VisualizationJSONCombinedORM/mean": 0.405593603849411, "rewards/VisualizationJSONCombinedORM/std": 0.06430162489414215, "step": 1866, "train_speed(iter/s)": 0.080225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 283.8125, "completions/min_length": 231.0, "epoch": 1.544251447477254, "grad_norm": 0.1910908818244934, "kl": 0.0460205078125, "learning_rate": 1.5003769425939718e-06, "loss": 0.0004601888358592987, "memory(GiB)": 38.05, "reward": 0.647129476070404, "reward_std": 0.10127703845500946, "rewards/VisualizationJSONCombinedORM/mean": 0.647129476070404, "rewards/VisualizationJSONCombinedORM/std": 0.100630022585392, "step": 1867, "train_speed(iter/s)": 0.080177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 253.3125, "completions/min_length": 206.0, "epoch": 1.5450785773366418, "grad_norm": 0.1797870546579361, "kl": 0.05975341796875, "learning_rate": 1.4952248517316215e-06, "loss": 0.0005989130586385727, "memory(GiB)": 38.05, "reward": 0.6362500190734863, "reward_std": 0.08624954521656036, "rewards/VisualizationJSONCombinedORM/mean": 0.6362500190734863, "rewards/VisualizationJSONCombinedORM/std": 0.11352244764566422, "step": 1868, "train_speed(iter/s)": 0.080143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 257.9375, "completions/min_length": 218.0, "epoch": 1.5459057071960298, "grad_norm": 0.18338973820209503, "kl": 0.05438232421875, "learning_rate": 1.4900800662364395e-06, "loss": 0.0005424469709396362, "memory(GiB)": 38.05, "reward": 0.6902273893356323, "reward_std": 0.07047519087791443, "rewards/VisualizationJSONCombinedORM/mean": 0.6902273893356323, "rewards/VisualizationJSONCombinedORM/std": 0.1131758913397789, "step": 1869, "train_speed(iter/s)": 0.080099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 285.125, "completions/min_length": 230.0, "epoch": 1.5467328370554179, "grad_norm": 0.1622323989868164, "kl": 0.04534912109375, "learning_rate": 1.4849425968322384e-06, "loss": 0.00045502930879592896, "memory(GiB)": 38.05, "reward": 0.804698646068573, "reward_std": 0.050868548452854156, "rewards/VisualizationJSONCombinedORM/mean": 0.804698646068573, "rewards/VisualizationJSONCombinedORM/std": 0.07808645069599152, "step": 1870, "train_speed(iter/s)": 0.08005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 269.3125, "completions/min_length": 210.0, "epoch": 1.5475599669148057, "grad_norm": 0.1764257401227951, "kl": 0.08416748046875, "learning_rate": 1.4798124542275794e-06, "loss": 0.0008431300520896912, "memory(GiB)": 38.05, "reward": 0.6219435930252075, "reward_std": 0.09794735163450241, "rewards/VisualizationJSONCombinedORM/mean": 0.6219435930252075, "rewards/VisualizationJSONCombinedORM/std": 0.10573951154947281, "step": 1871, "train_speed(iter/s)": 0.080008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 262.0625, "completions/min_length": 215.0, "epoch": 1.5483870967741935, "grad_norm": 0.16961310803890228, "kl": 0.08734130859375, "learning_rate": 1.4746896491157541e-06, "loss": 0.0008743386715650558, "memory(GiB)": 38.05, "reward": 0.6043524146080017, "reward_std": 0.07040295004844666, "rewards/VisualizationJSONCombinedORM/mean": 0.6043524146080017, "rewards/VisualizationJSONCombinedORM/std": 0.2420719712972641, "step": 1872, "train_speed(iter/s)": 0.079948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 249.5625, "completions/min_length": 201.0, "epoch": 1.5492142266335813, "grad_norm": 0.18516521155834198, "kl": 0.027191162109375, "learning_rate": 1.4695741921747563e-06, "loss": 0.0002719461917877197, "memory(GiB)": 38.05, "reward": 0.5917648673057556, "reward_std": 0.16025590896606445, "rewards/VisualizationJSONCombinedORM/mean": 0.5917648673057556, "rewards/VisualizationJSONCombinedORM/std": 0.17987586557865143, "step": 1873, "train_speed(iter/s)": 0.079905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 270.6875, "completions/min_length": 231.0, "epoch": 1.5500413564929694, "grad_norm": 0.19843903183937073, "kl": 0.030792236328125, "learning_rate": 1.4644660940672628e-06, "loss": 0.00030821189284324646, "memory(GiB)": 38.05, "reward": 0.48980939388275146, "reward_std": 0.07026267051696777, "rewards/VisualizationJSONCombinedORM/mean": 0.48980939388275146, "rewards/VisualizationJSONCombinedORM/std": 0.1143503338098526, "step": 1874, "train_speed(iter/s)": 0.079858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 282.1875, "completions/min_length": 218.0, "epoch": 1.5508684863523574, "grad_norm": 0.1725948303937912, "kl": 0.1395263671875, "learning_rate": 1.4593653654406149e-06, "loss": 0.0013943389058113098, "memory(GiB)": 38.05, "reward": 0.6601643562316895, "reward_std": 0.09891980141401291, "rewards/VisualizationJSONCombinedORM/mean": 0.6601643562316895, "rewards/VisualizationJSONCombinedORM/std": 0.10476519912481308, "step": 1875, "train_speed(iter/s)": 0.079805 }, { "epoch": 1.5508684863523574, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 317.875, "eval_completions/mean_length": 265.9479166666667, "eval_completions/min_length": 228.5, "eval_kl": 0.06575520833333333, "eval_loss": 0.0006633028388023376, "eval_reward": 0.47306613499919575, "eval_reward_std": 0.06395756887892882, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47306613499919575, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06395756965503097, "eval_runtime": 282.4939, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.011, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 273.5625, "completions/min_length": 244.0, "epoch": 1.5516956162117452, "grad_norm": 0.24411676824092865, "kl": 0.07135009765625, "learning_rate": 1.4542720169267933e-06, "loss": 0.0007135048508644104, "memory(GiB)": 38.05, "reward": 0.7184439897537231, "reward_std": 0.1093498095870018, "rewards/VisualizationJSONCombinedORM/mean": 0.7184439897537231, "rewards/VisualizationJSONCombinedORM/std": 0.12205228209495544, "step": 1876, "train_speed(iter/s)": 0.07882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 270.0, "completions/min_length": 203.0, "epoch": 1.552522746071133, "grad_norm": 0.17607749998569489, "kl": 0.07958984375, "learning_rate": 1.4491860591423917e-06, "loss": 0.0007943697273731232, "memory(GiB)": 38.05, "reward": 0.6861793994903564, "reward_std": 0.06733040511608124, "rewards/VisualizationJSONCombinedORM/mean": 0.6861793994903564, "rewards/VisualizationJSONCombinedORM/std": 0.16081088781356812, "step": 1877, "train_speed(iter/s)": 0.078775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 261.5625, "completions/min_length": 224.0, "epoch": 1.553349875930521, "grad_norm": 0.1726403385400772, "kl": 0.0355224609375, "learning_rate": 1.4441075026885999e-06, "loss": 0.00035602040588855743, "memory(GiB)": 38.05, "reward": 0.6800854802131653, "reward_std": 0.05945145711302757, "rewards/VisualizationJSONCombinedORM/mean": 0.6800854802131653, "rewards/VisualizationJSONCombinedORM/std": 0.10950241982936859, "step": 1878, "train_speed(iter/s)": 0.078736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 272.0625, "completions/min_length": 237.0, "epoch": 1.5541770057899091, "grad_norm": 0.21013279259204865, "kl": 0.05303955078125, "learning_rate": 1.439036358151182e-06, "loss": 0.0005286317318677902, "memory(GiB)": 38.05, "reward": 0.3340238332748413, "reward_std": 0.03909311443567276, "rewards/VisualizationJSONCombinedORM/mean": 0.3340238332748413, "rewards/VisualizationJSONCombinedORM/std": 0.03894045203924179, "step": 1879, "train_speed(iter/s)": 0.078698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 276.8125, "completions/min_length": 192.0, "epoch": 1.555004135649297, "grad_norm": 0.17479774355888367, "kl": 0.06915283203125, "learning_rate": 1.433972636100452e-06, "loss": 0.0006907694041728973, "memory(GiB)": 38.05, "reward": 0.3698795437812805, "reward_std": 0.06376572698354721, "rewards/VisualizationJSONCombinedORM/mean": 0.3698795437812805, "rewards/VisualizationJSONCombinedORM/std": 0.09165007621049881, "step": 1880, "train_speed(iter/s)": 0.07866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 258.0625, "completions/min_length": 209.0, "epoch": 1.5558312655086848, "grad_norm": 0.15494996309280396, "kl": 0.030853271484375, "learning_rate": 1.4289163470912537e-06, "loss": 0.00030784308910369873, "memory(GiB)": 38.05, "reward": 0.5622032880783081, "reward_std": 0.04970700666308403, "rewards/VisualizationJSONCombinedORM/mean": 0.5622032880783081, "rewards/VisualizationJSONCombinedORM/std": 0.18523909151554108, "step": 1881, "train_speed(iter/s)": 0.078612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 282.875, "completions/min_length": 210.0, "epoch": 1.5566583953680728, "grad_norm": 0.19520795345306396, "kl": 0.0550537109375, "learning_rate": 1.423867501662934e-06, "loss": 0.0005502142012119293, "memory(GiB)": 38.05, "reward": 0.6530551910400391, "reward_std": 0.12247948348522186, "rewards/VisualizationJSONCombinedORM/mean": 0.6530551910400391, "rewards/VisualizationJSONCombinedORM/std": 0.12021984905004501, "step": 1882, "train_speed(iter/s)": 0.078569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 282.75, "completions/min_length": 228.0, "epoch": 1.5574855252274609, "grad_norm": 0.20421116054058075, "kl": 0.0751953125, "learning_rate": 1.4188261103393247e-06, "loss": 0.000752672553062439, "memory(GiB)": 38.05, "reward": 0.5121006369590759, "reward_std": 0.12403196096420288, "rewards/VisualizationJSONCombinedORM/mean": 0.5121006369590759, "rewards/VisualizationJSONCombinedORM/std": 0.16680824756622314, "step": 1883, "train_speed(iter/s)": 0.078519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 267.0, "completions/min_length": 222.0, "epoch": 1.5583126550868487, "grad_norm": 0.18197159469127655, "kl": 0.0841064453125, "learning_rate": 1.4137921836287238e-06, "loss": 0.0008400604128837585, "memory(GiB)": 38.05, "reward": 0.6510991454124451, "reward_std": 0.06599573791027069, "rewards/VisualizationJSONCombinedORM/mean": 0.6510991454124451, "rewards/VisualizationJSONCombinedORM/std": 0.12057352811098099, "step": 1884, "train_speed(iter/s)": 0.078486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 275.375, "completions/min_length": 191.0, "epoch": 1.5591397849462365, "grad_norm": 0.16700714826583862, "kl": 0.04656982421875, "learning_rate": 1.4087657320238696e-06, "loss": 0.00046549364924430847, "memory(GiB)": 38.05, "reward": 0.590421199798584, "reward_std": 0.0443313829600811, "rewards/VisualizationJSONCombinedORM/mean": 0.590421199798584, "rewards/VisualizationJSONCombinedORM/std": 0.0848710834980011, "step": 1885, "train_speed(iter/s)": 0.078442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 259.3125, "completions/min_length": 199.0, "epoch": 1.5599669148056243, "grad_norm": 0.2672188878059387, "kl": 0.04248046875, "learning_rate": 1.4037467660019156e-06, "loss": 0.0004246719181537628, "memory(GiB)": 38.05, "reward": 0.5505495071411133, "reward_std": 0.10448049753904343, "rewards/VisualizationJSONCombinedORM/mean": 0.5505495071411133, "rewards/VisualizationJSONCombinedORM/std": 0.1865559220314026, "step": 1886, "train_speed(iter/s)": 0.078405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 280.25, "completions/min_length": 204.0, "epoch": 1.5607940446650124, "grad_norm": 0.18273384869098663, "kl": 0.05670166015625, "learning_rate": 1.3987352960244134e-06, "loss": 0.0005679428577423096, "memory(GiB)": 38.05, "reward": 0.2831101417541504, "reward_std": 0.031064148992300034, "rewards/VisualizationJSONCombinedORM/mean": 0.2831101417541504, "rewards/VisualizationJSONCombinedORM/std": 0.11066215485334396, "step": 1887, "train_speed(iter/s)": 0.078367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 239.4375, "completions/min_length": 207.0, "epoch": 1.5616211745244004, "grad_norm": 0.15441904962062836, "kl": 0.032135009765625, "learning_rate": 1.3937313325372919e-06, "loss": 0.00032116472721099854, "memory(GiB)": 38.05, "reward": 0.6845883131027222, "reward_std": 0.10212205350399017, "rewards/VisualizationJSONCombinedORM/mean": 0.6845883131027222, "rewards/VisualizationJSONCombinedORM/std": 0.1159459799528122, "step": 1888, "train_speed(iter/s)": 0.078322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/mean_length": 249.3125, "completions/min_length": 202.0, "epoch": 1.5624483043837882, "grad_norm": 0.2376791387796402, "kl": 0.0582275390625, "learning_rate": 1.3887348859708344e-06, "loss": 0.0005825087428092957, "memory(GiB)": 38.05, "reward": 0.5160889625549316, "reward_std": 0.09200604259967804, "rewards/VisualizationJSONCombinedORM/mean": 0.5160889625549316, "rewards/VisualizationJSONCombinedORM/std": 0.09476172178983688, "step": 1889, "train_speed(iter/s)": 0.078301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 242.875, "completions/min_length": 211.0, "epoch": 1.563275434243176, "grad_norm": 0.16079430282115936, "kl": 0.07763671875, "learning_rate": 1.383745966739652e-06, "loss": 0.0007768571376800537, "memory(GiB)": 38.05, "reward": 0.4733181297779083, "reward_std": 0.08805815875530243, "rewards/VisualizationJSONCombinedORM/mean": 0.4733181297779083, "rewards/VisualizationJSONCombinedORM/std": 0.20582963526248932, "step": 1890, "train_speed(iter/s)": 0.078269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 267.9375, "completions/min_length": 222.0, "epoch": 1.564102564102564, "grad_norm": 0.18639340996742249, "kl": 0.042236328125, "learning_rate": 1.3787645852426663e-06, "loss": 0.00042211636900901794, "memory(GiB)": 38.05, "reward": 0.6705464124679565, "reward_std": 0.09554139524698257, "rewards/VisualizationJSONCombinedORM/mean": 0.6705464124679565, "rewards/VisualizationJSONCombinedORM/std": 0.09420286864042282, "step": 1891, "train_speed(iter/s)": 0.078229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 255.6875, "completions/min_length": 211.0, "epoch": 1.5649296939619521, "grad_norm": 0.21297848224639893, "kl": 0.04681396484375, "learning_rate": 1.37379075186309e-06, "loss": 0.00046772509813308716, "memory(GiB)": 38.05, "reward": 0.5141573548316956, "reward_std": 0.04055078700184822, "rewards/VisualizationJSONCombinedORM/mean": 0.5141573548316956, "rewards/VisualizationJSONCombinedORM/std": 0.12127643078565598, "step": 1892, "train_speed(iter/s)": 0.078186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 278.625, "completions/min_length": 228.0, "epoch": 1.56575682382134, "grad_norm": 0.15389204025268555, "kl": 0.0313720703125, "learning_rate": 1.3688244769684007e-06, "loss": 0.00031362101435661316, "memory(GiB)": 38.05, "reward": 0.779667854309082, "reward_std": 0.059988729655742645, "rewards/VisualizationJSONCombinedORM/mean": 0.779667854309082, "rewards/VisualizationJSONCombinedORM/std": 0.0742676854133606, "step": 1893, "train_speed(iter/s)": 0.078158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/mean_length": 239.5625, "completions/min_length": 190.0, "epoch": 1.5665839536807278, "grad_norm": 0.18312552571296692, "kl": 0.05755615234375, "learning_rate": 1.3638657709103238e-06, "loss": 0.0005750358104705811, "memory(GiB)": 38.05, "reward": 0.679640531539917, "reward_std": 0.09073522686958313, "rewards/VisualizationJSONCombinedORM/mean": 0.679640531539917, "rewards/VisualizationJSONCombinedORM/std": 0.11371315270662308, "step": 1894, "train_speed(iter/s)": 0.078116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 268.0625, "completions/min_length": 245.0, "epoch": 1.5674110835401158, "grad_norm": 0.15537407994270325, "kl": 0.05621337890625, "learning_rate": 1.3589146440248034e-06, "loss": 0.0005625039339065552, "memory(GiB)": 38.05, "reward": 0.533268928527832, "reward_std": 0.060730163007974625, "rewards/VisualizationJSONCombinedORM/mean": 0.533268928527832, "rewards/VisualizationJSONCombinedORM/std": 0.1833077073097229, "step": 1895, "train_speed(iter/s)": 0.078075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 263.375, "completions/min_length": 213.0, "epoch": 1.5682382133995039, "grad_norm": 0.14725486934185028, "kl": 0.025390625, "learning_rate": 1.3539711066319873e-06, "loss": 0.0002532331272959709, "memory(GiB)": 38.05, "reward": 0.5557767152786255, "reward_std": 0.07336437702178955, "rewards/VisualizationJSONCombinedORM/mean": 0.5557767152786255, "rewards/VisualizationJSONCombinedORM/std": 0.07155545800924301, "step": 1896, "train_speed(iter/s)": 0.078042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 277.0, "completions/min_length": 210.0, "epoch": 1.5690653432588917, "grad_norm": 0.15079501271247864, "kl": 0.0570068359375, "learning_rate": 1.3490351690362046e-06, "loss": 0.0005683377385139465, "memory(GiB)": 38.05, "reward": 0.6813034415245056, "reward_std": 0.043456219136714935, "rewards/VisualizationJSONCombinedORM/mean": 0.6813034415245056, "rewards/VisualizationJSONCombinedORM/std": 0.10461729019880295, "step": 1897, "train_speed(iter/s)": 0.077996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 277.875, "completions/min_length": 229.0, "epoch": 1.5698924731182795, "grad_norm": 0.17280036211013794, "kl": 0.0557861328125, "learning_rate": 1.3441068415259462e-06, "loss": 0.0005572810769081116, "memory(GiB)": 38.05, "reward": 0.5521223545074463, "reward_std": 0.056985124945640564, "rewards/VisualizationJSONCombinedORM/mean": 0.5521223545074463, "rewards/VisualizationJSONCombinedORM/std": 0.2997489273548126, "step": 1898, "train_speed(iter/s)": 0.077947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 250.625, "completions/min_length": 218.0, "epoch": 1.5707196029776673, "grad_norm": 0.15721140801906586, "kl": 0.132080078125, "learning_rate": 1.339186134373835e-06, "loss": 0.001323603093624115, "memory(GiB)": 38.05, "reward": 0.4659494161605835, "reward_std": 0.08544948697090149, "rewards/VisualizationJSONCombinedORM/mean": 0.4659494161605835, "rewards/VisualizationJSONCombinedORM/std": 0.1923782378435135, "step": 1899, "train_speed(iter/s)": 0.077895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 263.75, "completions/min_length": 219.0, "epoch": 1.5715467328370554, "grad_norm": 0.2249826043844223, "kl": 0.093994140625, "learning_rate": 1.334273057836611e-06, "loss": 0.0009386315941810608, "memory(GiB)": 38.05, "reward": 0.2793447971343994, "reward_std": 0.03942926973104477, "rewards/VisualizationJSONCombinedORM/mean": 0.2793447971343994, "rewards/VisualizationJSONCombinedORM/std": 0.05611162632703781, "step": 1900, "train_speed(iter/s)": 0.077866 }, { "epoch": 1.5715467328370554, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 307.9583333333333, "eval_completions/mean_length": 261.7135416666667, "eval_completions/min_length": 222.70833333333334, "eval_kl": 0.061147054036458336, "eval_loss": 0.0006148951943032444, "eval_reward": 0.466165787850817, "eval_reward_std": 0.06405742610028635, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.466165787850817, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0640574285838132, "eval_runtime": 276.7776, "eval_samples_per_second": 0.087, "eval_steps_per_second": 0.011, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 276.5625, "completions/min_length": 217.0, "epoch": 1.5723738626964434, "grad_norm": 0.19924457371234894, "kl": 0.0457763671875, "learning_rate": 1.3293676221551123e-06, "loss": 0.0004588812589645386, "memory(GiB)": 38.05, "reward": 0.5241686701774597, "reward_std": 0.05426386743783951, "rewards/VisualizationJSONCombinedORM/mean": 0.5241686701774597, "rewards/VisualizationJSONCombinedORM/std": 0.1804000586271286, "step": 1901, "train_speed(iter/s)": 0.076965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/mean_length": 250.0, "completions/min_length": 234.0, "epoch": 1.5732009925558312, "grad_norm": 0.20808398723602295, "kl": 0.0318603515625, "learning_rate": 1.3244698375542492e-06, "loss": 0.000318232923746109, "memory(GiB)": 38.05, "reward": 0.47809267044067383, "reward_std": 0.056800372898578644, "rewards/VisualizationJSONCombinedORM/mean": 0.47809267044067383, "rewards/VisualizationJSONCombinedORM/std": 0.09617260098457336, "step": 1902, "train_speed(iter/s)": 0.076939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 250.625, "completions/min_length": 209.0, "epoch": 1.574028122415219, "grad_norm": 0.20816704630851746, "kl": 0.0662841796875, "learning_rate": 1.319579714242981e-06, "loss": 0.0006633074954152107, "memory(GiB)": 38.05, "reward": 0.5817887783050537, "reward_std": 0.12537959218025208, "rewards/VisualizationJSONCombinedORM/mean": 0.5817887783050537, "rewards/VisualizationJSONCombinedORM/std": 0.1523614525794983, "step": 1903, "train_speed(iter/s)": 0.076888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 261.1875, "completions/min_length": 220.0, "epoch": 1.574855252274607, "grad_norm": 0.16659803688526154, "kl": 0.0445556640625, "learning_rate": 1.3146972624143024e-06, "loss": 0.00044533610343933105, "memory(GiB)": 38.05, "reward": 0.6004773378372192, "reward_std": 0.06751018017530441, "rewards/VisualizationJSONCombinedORM/mean": 0.6004773378372192, "rewards/VisualizationJSONCombinedORM/std": 0.1879737824201584, "step": 1904, "train_speed(iter/s)": 0.076851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 261.375, "completions/min_length": 236.0, "epoch": 1.5756823821339951, "grad_norm": 0.20417973399162292, "kl": 0.12164306640625, "learning_rate": 1.3098224922452129e-06, "loss": 0.0012135282158851624, "memory(GiB)": 38.05, "reward": 0.400002121925354, "reward_std": 0.062006592750549316, "rewards/VisualizationJSONCombinedORM/mean": 0.400002121925354, "rewards/VisualizationJSONCombinedORM/std": 0.11688302457332611, "step": 1905, "train_speed(iter/s)": 0.076825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 257.0, "completions/min_length": 204.0, "epoch": 1.576509511993383, "grad_norm": 0.2747855484485626, "kl": 0.029510498046875, "learning_rate": 1.3049554138967052e-06, "loss": 0.00029477477073669434, "memory(GiB)": 38.05, "reward": 0.672509491443634, "reward_std": 0.07910436391830444, "rewards/VisualizationJSONCombinedORM/mean": 0.672509491443634, "rewards/VisualizationJSONCombinedORM/std": 0.0903540849685669, "step": 1906, "train_speed(iter/s)": 0.076784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 272.0625, "completions/min_length": 217.0, "epoch": 1.5773366418527708, "grad_norm": 0.1378229707479477, "kl": 0.030792236328125, "learning_rate": 1.300096037513734e-06, "loss": 0.0003081299364566803, "memory(GiB)": 38.05, "reward": 0.780210018157959, "reward_std": 0.06937795132398605, "rewards/VisualizationJSONCombinedORM/mean": 0.780210018157959, "rewards/VisualizationJSONCombinedORM/std": 0.07106629759073257, "step": 1907, "train_speed(iter/s)": 0.076754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 260.9375, "completions/min_length": 222.0, "epoch": 1.5781637717121588, "grad_norm": 0.22728444635868073, "kl": 0.0692138671875, "learning_rate": 1.2952443732252058e-06, "loss": 0.0006906129419803619, "memory(GiB)": 38.05, "reward": 0.36940157413482666, "reward_std": 0.05790993198752403, "rewards/VisualizationJSONCombinedORM/mean": 0.36940157413482666, "rewards/VisualizationJSONCombinedORM/std": 0.06972485035657883, "step": 1908, "train_speed(iter/s)": 0.076725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 248.375, "completions/min_length": 220.0, "epoch": 1.5789909015715469, "grad_norm": 0.18577849864959717, "kl": 0.066162109375, "learning_rate": 1.290400431143945e-06, "loss": 0.0006625577807426453, "memory(GiB)": 38.05, "reward": 0.6398837566375732, "reward_std": 0.11843061447143555, "rewards/VisualizationJSONCombinedORM/mean": 0.6398837566375732, "rewards/VisualizationJSONCombinedORM/std": 0.1160813421010971, "step": 1909, "train_speed(iter/s)": 0.076695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 269.9375, "completions/min_length": 218.0, "epoch": 1.5798180314309347, "grad_norm": 0.18443052470684052, "kl": 0.07598876953125, "learning_rate": 1.2855642213666858e-06, "loss": 0.0007600300014019012, "memory(GiB)": 38.05, "reward": 0.46524783968925476, "reward_std": 0.0666126012802124, "rewards/VisualizationJSONCombinedORM/mean": 0.46524783968925476, "rewards/VisualizationJSONCombinedORM/std": 0.14899584650993347, "step": 1910, "train_speed(iter/s)": 0.076662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 260.3125, "completions/min_length": 202.0, "epoch": 1.5806451612903225, "grad_norm": 0.1716187596321106, "kl": 0.04803466796875, "learning_rate": 1.2807357539740446e-06, "loss": 0.0004802793264389038, "memory(GiB)": 38.05, "reward": 0.4307360053062439, "reward_std": 0.0658188909292221, "rewards/VisualizationJSONCombinedORM/mean": 0.4307360053062439, "rewards/VisualizationJSONCombinedORM/std": 0.07012435793876648, "step": 1911, "train_speed(iter/s)": 0.076625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/mean_length": 254.625, "completions/min_length": 233.0, "epoch": 1.5814722911497106, "grad_norm": 0.1472364068031311, "kl": 0.05224609375, "learning_rate": 1.2759150390304953e-06, "loss": 0.0005215294659137726, "memory(GiB)": 38.05, "reward": 0.3734999895095825, "reward_std": 0.055715858936309814, "rewards/VisualizationJSONCombinedORM/mean": 0.3734999895095825, "rewards/VisualizationJSONCombinedORM/std": 0.07977309823036194, "step": 1912, "train_speed(iter/s)": 0.076597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 250.625, "completions/min_length": 199.0, "epoch": 1.5822994210090986, "grad_norm": 0.1957148164510727, "kl": 0.0634765625, "learning_rate": 1.2711020865843555e-06, "loss": 0.0006354451179504395, "memory(GiB)": 38.05, "reward": 0.5824030637741089, "reward_std": 0.11169059574604034, "rewards/VisualizationJSONCombinedORM/mean": 0.5824030637741089, "rewards/VisualizationJSONCombinedORM/std": 0.1478888988494873, "step": 1913, "train_speed(iter/s)": 0.076572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 272.375, "completions/min_length": 226.0, "epoch": 1.5831265508684864, "grad_norm": 0.19753456115722656, "kl": 0.05419921875, "learning_rate": 1.266296906667762e-06, "loss": 0.0005415380001068115, "memory(GiB)": 38.05, "reward": 0.5411562919616699, "reward_std": 0.09729649871587753, "rewards/VisualizationJSONCombinedORM/mean": 0.5411562919616699, "rewards/VisualizationJSONCombinedORM/std": 0.21318121254444122, "step": 1914, "train_speed(iter/s)": 0.076535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/mean_length": 247.625, "completions/min_length": 206.0, "epoch": 1.5839536807278742, "grad_norm": 0.19646111130714417, "kl": 0.075927734375, "learning_rate": 1.2614995092966537e-06, "loss": 0.0007587149739265442, "memory(GiB)": 38.05, "reward": 0.5646045804023743, "reward_std": 0.07932565361261368, "rewards/VisualizationJSONCombinedORM/mean": 0.5646045804023743, "rewards/VisualizationJSONCombinedORM/std": 0.17595994472503662, "step": 1915, "train_speed(iter/s)": 0.076516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 268.875, "completions/min_length": 208.0, "epoch": 1.584780810587262, "grad_norm": 0.27709391713142395, "kl": 0.07244873046875, "learning_rate": 1.256709904470741e-06, "loss": 0.0007244274020195007, "memory(GiB)": 38.05, "reward": 0.6553891897201538, "reward_std": 0.07675801217556, "rewards/VisualizationJSONCombinedORM/mean": 0.6553891897201538, "rewards/VisualizationJSONCombinedORM/std": 0.15717105567455292, "step": 1916, "train_speed(iter/s)": 0.076477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 249.125, "completions/min_length": 215.0, "epoch": 1.58560794044665, "grad_norm": 0.23657093942165375, "kl": 0.116943359375, "learning_rate": 1.2519281021734975e-06, "loss": 0.0011675218120217323, "memory(GiB)": 38.05, "reward": 0.5554623007774353, "reward_std": 0.07130850851535797, "rewards/VisualizationJSONCombinedORM/mean": 0.5554623007774353, "rewards/VisualizationJSONCombinedORM/std": 0.19533109664916992, "step": 1917, "train_speed(iter/s)": 0.076441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 257.9375, "completions/min_length": 221.0, "epoch": 1.5864350703060381, "grad_norm": 0.1541454941034317, "kl": 0.132080078125, "learning_rate": 1.2471541123721292e-06, "loss": 0.001322433352470398, "memory(GiB)": 38.05, "reward": 0.40816831588745117, "reward_std": 0.05455699935555458, "rewards/VisualizationJSONCombinedORM/mean": 0.40816831588745117, "rewards/VisualizationJSONCombinedORM/std": 0.11926205456256866, "step": 1918, "train_speed(iter/s)": 0.076415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 287.25, "completions/min_length": 225.0, "epoch": 1.587262200165426, "grad_norm": 0.1372477114200592, "kl": 0.06085205078125, "learning_rate": 1.2423879450175613e-06, "loss": 0.0006087422370910645, "memory(GiB)": 38.05, "reward": 0.4595453143119812, "reward_std": 0.05814613029360771, "rewards/VisualizationJSONCombinedORM/mean": 0.4595453143119812, "rewards/VisualizationJSONCombinedORM/std": 0.141622856259346, "step": 1919, "train_speed(iter/s)": 0.076388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 256.625, "completions/min_length": 223.0, "epoch": 1.5880893300248138, "grad_norm": 0.20235523581504822, "kl": 0.1064453125, "learning_rate": 1.2376296100444092e-06, "loss": 0.0010645091533660889, "memory(GiB)": 38.05, "reward": 0.5790824890136719, "reward_std": 0.11084957420825958, "rewards/VisualizationJSONCombinedORM/mean": 0.5790824890136719, "rewards/VisualizationJSONCombinedORM/std": 0.13050083816051483, "step": 1920, "train_speed(iter/s)": 0.076335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 281.0, "completions/min_length": 228.0, "epoch": 1.5889164598842018, "grad_norm": 0.2079779952764511, "kl": 0.064208984375, "learning_rate": 1.232879117370968e-06, "loss": 0.0006420426070690155, "memory(GiB)": 38.05, "reward": 0.635413408279419, "reward_std": 0.10979527235031128, "rewards/VisualizationJSONCombinedORM/mean": 0.635413408279419, "rewards/VisualizationJSONCombinedORM/std": 0.1500919610261917, "step": 1921, "train_speed(iter/s)": 0.076282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 260.875, "completions/min_length": 192.0, "epoch": 1.5897435897435899, "grad_norm": 0.13698361814022064, "kl": 0.034423828125, "learning_rate": 1.2281364768991804e-06, "loss": 0.00034462474286556244, "memory(GiB)": 38.05, "reward": 0.3761935234069824, "reward_std": 0.04514910653233528, "rewards/VisualizationJSONCombinedORM/mean": 0.3761935234069824, "rewards/VisualizationJSONCombinedORM/std": 0.05777715891599655, "step": 1922, "train_speed(iter/s)": 0.076247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 248.9375, "completions/min_length": 207.0, "epoch": 1.5905707196029777, "grad_norm": 0.2113477885723114, "kl": 0.0858154296875, "learning_rate": 1.2234016985146284e-06, "loss": 0.0008599720895290375, "memory(GiB)": 38.05, "reward": 0.44570958614349365, "reward_std": 0.05630745738744736, "rewards/VisualizationJSONCombinedORM/mean": 0.44570958614349365, "rewards/VisualizationJSONCombinedORM/std": 0.22243793308734894, "step": 1923, "train_speed(iter/s)": 0.076215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 251.4375, "completions/min_length": 201.0, "epoch": 1.5913978494623655, "grad_norm": 0.15123304724693298, "kl": 0.08380126953125, "learning_rate": 1.2186747920864993e-06, "loss": 0.0008387863636016846, "memory(GiB)": 38.05, "reward": 0.7847683429718018, "reward_std": 0.0964599996805191, "rewards/VisualizationJSONCombinedORM/mean": 0.7847683429718018, "rewards/VisualizationJSONCombinedORM/std": 0.10773744434118271, "step": 1924, "train_speed(iter/s)": 0.07617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 268.4375, "completions/min_length": 213.0, "epoch": 1.5922249793217536, "grad_norm": 0.16829092800617218, "kl": 0.15673828125, "learning_rate": 1.2139557674675773e-06, "loss": 0.0015706159174442291, "memory(GiB)": 38.05, "reward": 0.5860636234283447, "reward_std": 0.09198582172393799, "rewards/VisualizationJSONCombinedORM/mean": 0.5860636234283447, "rewards/VisualizationJSONCombinedORM/std": 0.09414932876825333, "step": 1925, "train_speed(iter/s)": 0.076133 }, { "epoch": 1.5922249793217536, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 317.9583333333333, "eval_completions/mean_length": 266.9114583333333, "eval_completions/min_length": 228.41666666666666, "eval_kl": 0.069671630859375, "eval_loss": 0.0006930275703780353, "eval_reward": 0.4667417071759701, "eval_reward_std": 0.06848528624201815, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4667417071759701, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06848528806585819, "eval_runtime": 282.643, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.011, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 257.875, "completions/min_length": 207.0, "epoch": 1.5930521091811416, "grad_norm": 0.19142034649848938, "kl": 0.099853515625, "learning_rate": 1.2092446344942165e-06, "loss": 0.0009989440441131592, "memory(GiB)": 38.05, "reward": 0.5723063945770264, "reward_std": 0.06680724769830704, "rewards/VisualizationJSONCombinedORM/mean": 0.5723063945770264, "rewards/VisualizationJSONCombinedORM/std": 0.20665645599365234, "step": 1926, "train_speed(iter/s)": 0.075261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 252.625, "completions/min_length": 206.0, "epoch": 1.5938792390405294, "grad_norm": 0.17525917291641235, "kl": 0.1424560546875, "learning_rate": 1.2045414029863196e-06, "loss": 0.0014256499707698822, "memory(GiB)": 38.05, "reward": 0.6287509202957153, "reward_std": 0.05861685425043106, "rewards/VisualizationJSONCombinedORM/mean": 0.6287509202957153, "rewards/VisualizationJSONCombinedORM/std": 0.16299684345722198, "step": 1927, "train_speed(iter/s)": 0.07522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 238.125, "completions/min_length": 211.0, "epoch": 1.5947063688999172, "grad_norm": 0.19350217282772064, "kl": 0.104248046875, "learning_rate": 1.199846082747323e-06, "loss": 0.0010453499853610992, "memory(GiB)": 38.05, "reward": 0.5414822697639465, "reward_std": 0.09094345569610596, "rewards/VisualizationJSONCombinedORM/mean": 0.5414822697639465, "rewards/VisualizationJSONCombinedORM/std": 0.1650272011756897, "step": 1928, "train_speed(iter/s)": 0.075192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 258.6875, "completions/min_length": 209.0, "epoch": 1.595533498759305, "grad_norm": 0.2205369472503662, "kl": 0.0743408203125, "learning_rate": 1.1951586835641682e-06, "loss": 0.0007432252168655396, "memory(GiB)": 38.05, "reward": 0.6758173704147339, "reward_std": 0.06719724833965302, "rewards/VisualizationJSONCombinedORM/mean": 0.6758173704147339, "rewards/VisualizationJSONCombinedORM/std": 0.07363761961460114, "step": 1929, "train_speed(iter/s)": 0.07516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 275.0, "completions/min_length": 223.0, "epoch": 1.596360628618693, "grad_norm": 0.19685401022434235, "kl": 0.0692138671875, "learning_rate": 1.1904792152072914e-06, "loss": 0.0006914809346199036, "memory(GiB)": 38.05, "reward": 0.3650805950164795, "reward_std": 0.05679577961564064, "rewards/VisualizationJSONCombinedORM/mean": 0.3650805950164795, "rewards/VisualizationJSONCombinedORM/std": 0.11583177000284195, "step": 1930, "train_speed(iter/s)": 0.075129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 252.3125, "completions/min_length": 209.0, "epoch": 1.5971877584780811, "grad_norm": 0.1583748310804367, "kl": 0.03900146484375, "learning_rate": 1.185807687430592e-06, "loss": 0.00039002299308776855, "memory(GiB)": 38.05, "reward": 0.6088706254959106, "reward_std": 0.05517110973596573, "rewards/VisualizationJSONCombinedORM/mean": 0.6088706254959106, "rewards/VisualizationJSONCombinedORM/std": 0.17734448611736298, "step": 1931, "train_speed(iter/s)": 0.075098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 247.75, "completions/min_length": 220.0, "epoch": 1.598014888337469, "grad_norm": 0.1679995357990265, "kl": 0.06573486328125, "learning_rate": 1.1811441099714232e-06, "loss": 0.0006570667028427124, "memory(GiB)": 38.05, "reward": 0.453538179397583, "reward_std": 0.04166950285434723, "rewards/VisualizationJSONCombinedORM/mean": 0.453538179397583, "rewards/VisualizationJSONCombinedORM/std": 0.24749188125133514, "step": 1932, "train_speed(iter/s)": 0.075064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 270.625, "completions/min_length": 213.0, "epoch": 1.5988420181968568, "grad_norm": 0.1789478063583374, "kl": 0.0511474609375, "learning_rate": 1.1764884925505626e-06, "loss": 0.0005116909742355347, "memory(GiB)": 38.05, "reward": 0.45712700486183167, "reward_std": 0.052142433822155, "rewards/VisualizationJSONCombinedORM/mean": 0.45712700486183167, "rewards/VisualizationJSONCombinedORM/std": 0.2118956595659256, "step": 1933, "train_speed(iter/s)": 0.075029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 258.0, "completions/min_length": 231.0, "epoch": 1.5996691480562448, "grad_norm": 0.2183215171098709, "kl": 0.06964111328125, "learning_rate": 1.171840844872198e-06, "loss": 0.0006970111280679703, "memory(GiB)": 38.05, "reward": 0.5608550310134888, "reward_std": 0.11459028720855713, "rewards/VisualizationJSONCombinedORM/mean": 0.5608550310134888, "rewards/VisualizationJSONCombinedORM/std": 0.17571137845516205, "step": 1934, "train_speed(iter/s)": 0.074998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 253.3125, "completions/min_length": 195.0, "epoch": 1.6004962779156329, "grad_norm": 0.19041123986244202, "kl": 0.05718994140625, "learning_rate": 1.1672011766239067e-06, "loss": 0.000571615993976593, "memory(GiB)": 38.05, "reward": 0.7689238786697388, "reward_std": 0.07991880178451538, "rewards/VisualizationJSONCombinedORM/mean": 0.7689238786697388, "rewards/VisualizationJSONCombinedORM/std": 0.11077333241701126, "step": 1935, "train_speed(iter/s)": 0.074965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 281.3125, "completions/min_length": 223.0, "epoch": 1.6013234077750207, "grad_norm": 0.17722205817699432, "kl": 0.05914306640625, "learning_rate": 1.16256949747663e-06, "loss": 0.0005902498960494995, "memory(GiB)": 38.05, "reward": 0.44920408725738525, "reward_std": 0.07547052949666977, "rewards/VisualizationJSONCombinedORM/mean": 0.44920408725738525, "rewards/VisualizationJSONCombinedORM/std": 0.1837300807237625, "step": 1936, "train_speed(iter/s)": 0.074923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 246.5625, "completions/min_length": 203.0, "epoch": 1.6021505376344085, "grad_norm": 0.15553249418735504, "kl": 0.03692626953125, "learning_rate": 1.1579458170846563e-06, "loss": 0.0003688521683216095, "memory(GiB)": 38.05, "reward": 0.5448787808418274, "reward_std": 0.06559035927057266, "rewards/VisualizationJSONCombinedORM/mean": 0.5448787808418274, "rewards/VisualizationJSONCombinedORM/std": 0.1250815987586975, "step": 1937, "train_speed(iter/s)": 0.074896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 276.8125, "completions/min_length": 243.0, "epoch": 1.6029776674937966, "grad_norm": 0.15097850561141968, "kl": 0.064697265625, "learning_rate": 1.1533301450856054e-06, "loss": 0.0006469376385211945, "memory(GiB)": 38.05, "reward": 0.46723848581314087, "reward_std": 0.04998408257961273, "rewards/VisualizationJSONCombinedORM/mean": 0.46723848581314087, "rewards/VisualizationJSONCombinedORM/std": 0.21596470475196838, "step": 1938, "train_speed(iter/s)": 0.074865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/mean_length": 272.375, "completions/min_length": 237.0, "epoch": 1.6038047973531846, "grad_norm": 0.1868264228105545, "kl": 0.05584716796875, "learning_rate": 1.1487224911004031e-06, "loss": 0.0005577802658081055, "memory(GiB)": 38.05, "reward": 0.4985124468803406, "reward_std": 0.06952562183141708, "rewards/VisualizationJSONCombinedORM/mean": 0.4985124468803406, "rewards/VisualizationJSONCombinedORM/std": 0.10774713009595871, "step": 1939, "train_speed(iter/s)": 0.07483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 240.375, "completions/min_length": 192.0, "epoch": 1.6046319272125724, "grad_norm": 0.1813998967409134, "kl": 0.023162841796875, "learning_rate": 1.1441228647332602e-06, "loss": 0.00023186206817626953, "memory(GiB)": 38.05, "reward": 0.5555574893951416, "reward_std": 0.06645510345697403, "rewards/VisualizationJSONCombinedORM/mean": 0.5555574893951416, "rewards/VisualizationJSONCombinedORM/std": 0.06478334218263626, "step": 1940, "train_speed(iter/s)": 0.074801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 268.125, "completions/min_length": 218.0, "epoch": 1.6054590570719602, "grad_norm": 0.20227620005607605, "kl": 0.0809326171875, "learning_rate": 1.1395312755716548e-06, "loss": 0.0008102580904960632, "memory(GiB)": 38.05, "reward": 0.5746595859527588, "reward_std": 0.11095092445611954, "rewards/VisualizationJSONCombinedORM/mean": 0.5746595859527588, "rewards/VisualizationJSONCombinedORM/std": 0.12106849253177643, "step": 1941, "train_speed(iter/s)": 0.074766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 268.5, "completions/min_length": 232.0, "epoch": 1.606286186931348, "grad_norm": 0.22209349274635315, "kl": 0.068267822265625, "learning_rate": 1.134947733186315e-06, "loss": 0.0006837174296379089, "memory(GiB)": 38.05, "reward": 0.6457544565200806, "reward_std": 0.11228520423173904, "rewards/VisualizationJSONCombinedORM/mean": 0.6457544565200806, "rewards/VisualizationJSONCombinedORM/std": 0.12193308025598526, "step": 1942, "train_speed(iter/s)": 0.074737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 271.3125, "completions/min_length": 217.0, "epoch": 1.607113316790736, "grad_norm": 0.1988697201013565, "kl": 0.04949951171875, "learning_rate": 1.1303722471311961e-06, "loss": 0.0004957020282745361, "memory(GiB)": 38.05, "reward": 0.4438311457633972, "reward_std": 0.04664073511958122, "rewards/VisualizationJSONCombinedORM/mean": 0.4438311457633972, "rewards/VisualizationJSONCombinedORM/std": 0.1835462898015976, "step": 1943, "train_speed(iter/s)": 0.074715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 266.0, "completions/min_length": 211.0, "epoch": 1.6079404466501241, "grad_norm": 0.19639158248901367, "kl": 0.05499267578125, "learning_rate": 1.1258048269434569e-06, "loss": 0.0005477406084537506, "memory(GiB)": 38.05, "reward": 0.6818206310272217, "reward_std": 0.05389083921909332, "rewards/VisualizationJSONCombinedORM/mean": 0.6818206310272217, "rewards/VisualizationJSONCombinedORM/std": 0.06763052940368652, "step": 1944, "train_speed(iter/s)": 0.07468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 248.3125, "completions/min_length": 183.0, "epoch": 1.608767576509512, "grad_norm": 0.2653263509273529, "kl": 0.0782470703125, "learning_rate": 1.1212454821434476e-06, "loss": 0.0007844939827919006, "memory(GiB)": 38.05, "reward": 0.4527638554573059, "reward_std": 0.08107686787843704, "rewards/VisualizationJSONCombinedORM/mean": 0.4527638554573059, "rewards/VisualizationJSONCombinedORM/std": 0.17589344084262848, "step": 1945, "train_speed(iter/s)": 0.074653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 252.0625, "completions/min_length": 198.0, "epoch": 1.6095947063688998, "grad_norm": 0.17082758247852325, "kl": 0.05023193359375, "learning_rate": 1.1166942222346828e-06, "loss": 0.0005038082599639893, "memory(GiB)": 38.05, "reward": 0.3167637586593628, "reward_std": 0.048226431012153625, "rewards/VisualizationJSONCombinedORM/mean": 0.3167637586593628, "rewards/VisualizationJSONCombinedORM/std": 0.1602301299571991, "step": 1946, "train_speed(iter/s)": 0.074622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 301.375, "completions/min_length": 248.0, "epoch": 1.6104218362282878, "grad_norm": 0.17516973614692688, "kl": 0.0682373046875, "learning_rate": 1.1121510567038279e-06, "loss": 0.0006822347640991211, "memory(GiB)": 38.05, "reward": 0.49009591341018677, "reward_std": 0.06739232689142227, "rewards/VisualizationJSONCombinedORM/mean": 0.49009591341018677, "rewards/VisualizationJSONCombinedORM/std": 0.235190287232399, "step": 1947, "train_speed(iter/s)": 0.07457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/mean_length": 254.625, "completions/min_length": 212.0, "epoch": 1.6112489660876759, "grad_norm": 0.2752175033092499, "kl": 0.050537109375, "learning_rate": 1.1076159950206762e-06, "loss": 0.0005053803324699402, "memory(GiB)": 38.05, "reward": 0.6759604811668396, "reward_std": 0.15093034505844116, "rewards/VisualizationJSONCombinedORM/mean": 0.6759604811668396, "rewards/VisualizationJSONCombinedORM/std": 0.15114353597164154, "step": 1948, "train_speed(iter/s)": 0.074545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 268.5, "completions/min_length": 211.0, "epoch": 1.6120760959470637, "grad_norm": 0.18224139511585236, "kl": 0.0518798828125, "learning_rate": 1.1030890466381272e-06, "loss": 0.0005195513367652893, "memory(GiB)": 38.05, "reward": 0.46294498443603516, "reward_std": 0.07044782489538193, "rewards/VisualizationJSONCombinedORM/mean": 0.46294498443603516, "rewards/VisualizationJSONCombinedORM/std": 0.17129121720790863, "step": 1949, "train_speed(iter/s)": 0.074512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 302.0625, "completions/min_length": 227.0, "epoch": 1.6129032258064515, "grad_norm": 0.20507881045341492, "kl": 0.07354736328125, "learning_rate": 1.0985702209921677e-06, "loss": 0.000734187662601471, "memory(GiB)": 38.05, "reward": 0.5821486711502075, "reward_std": 0.09263791143894196, "rewards/VisualizationJSONCombinedORM/mean": 0.5821486711502075, "rewards/VisualizationJSONCombinedORM/std": 0.09289827942848206, "step": 1950, "train_speed(iter/s)": 0.074456 }, { "epoch": 1.6129032258064515, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 307.375, "eval_completions/mean_length": 263.4583333333333, "eval_completions/min_length": 222.75, "eval_kl": 0.055394490559895836, "eval_loss": 0.0005538724362850189, "eval_reward": 0.45199340023100376, "eval_reward_std": 0.05830737513800462, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45199340023100376, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05830737754392127, "eval_runtime": 276.8105, "eval_samples_per_second": 0.087, "eval_steps_per_second": 0.011, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/mean_length": 247.0625, "completions/min_length": 221.0, "epoch": 1.6137303556658396, "grad_norm": 0.17963933944702148, "kl": 0.057861328125, "learning_rate": 1.0940595275018584e-06, "loss": 0.0005779378116130829, "memory(GiB)": 38.05, "reward": 0.517725944519043, "reward_std": 0.08540099114179611, "rewards/VisualizationJSONCombinedORM/mean": 0.517725944519043, "rewards/VisualizationJSONCombinedORM/std": 0.15633682906627655, "step": 1951, "train_speed(iter/s)": 0.073635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 267.75, "completions/min_length": 215.0, "epoch": 1.6145574855252276, "grad_norm": 0.23414760828018188, "kl": 0.07818603515625, "learning_rate": 1.0895569755693076e-06, "loss": 0.0007813572883605957, "memory(GiB)": 38.05, "reward": 0.37816521525382996, "reward_std": 0.074726901948452, "rewards/VisualizationJSONCombinedORM/mean": 0.37816521525382996, "rewards/VisualizationJSONCombinedORM/std": 0.15517762303352356, "step": 1952, "train_speed(iter/s)": 0.073598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 263.875, "completions/min_length": 227.0, "epoch": 1.6153846153846154, "grad_norm": 0.2575499415397644, "kl": 0.0592041015625, "learning_rate": 1.08506257457965e-06, "loss": 0.0005911514163017273, "memory(GiB)": 38.05, "reward": 0.5355337262153625, "reward_std": 0.07842442393302917, "rewards/VisualizationJSONCombinedORM/mean": 0.5355337262153625, "rewards/VisualizationJSONCombinedORM/std": 0.18247336149215698, "step": 1953, "train_speed(iter/s)": 0.073559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 271.125, "completions/min_length": 209.0, "epoch": 1.6162117452440032, "grad_norm": 0.2073441445827484, "kl": 0.0543212890625, "learning_rate": 1.0805763339010329e-06, "loss": 0.000543719157576561, "memory(GiB)": 38.05, "reward": 0.4206017553806305, "reward_std": 0.057464420795440674, "rewards/VisualizationJSONCombinedORM/mean": 0.4206017553806305, "rewards/VisualizationJSONCombinedORM/std": 0.16917364299297333, "step": 1954, "train_speed(iter/s)": 0.07352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 261.875, "completions/min_length": 210.0, "epoch": 1.6170388751033913, "grad_norm": 0.24104878306388855, "kl": 0.052703857421875, "learning_rate": 1.076098262884594e-06, "loss": 0.0005273707211017609, "memory(GiB)": 38.05, "reward": 0.45933997631073, "reward_std": 0.04859720170497894, "rewards/VisualizationJSONCombinedORM/mean": 0.45933997631073, "rewards/VisualizationJSONCombinedORM/std": 0.2054053246974945, "step": 1955, "train_speed(iter/s)": 0.073493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 269.5, "completions/min_length": 220.0, "epoch": 1.617866004962779, "grad_norm": 0.23657803237438202, "kl": 0.072509765625, "learning_rate": 1.0716283708644431e-06, "loss": 0.0007259109988808632, "memory(GiB)": 38.05, "reward": 0.47842293977737427, "reward_std": 0.09334369748830795, "rewards/VisualizationJSONCombinedORM/mean": 0.47842293977737427, "rewards/VisualizationJSONCombinedORM/std": 0.17279957234859467, "step": 1956, "train_speed(iter/s)": 0.073481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 274.0, "completions/min_length": 234.0, "epoch": 1.6186931348221671, "grad_norm": 0.16833654046058655, "kl": 0.203369140625, "learning_rate": 1.0671666671576437e-06, "loss": 0.0020303744822740555, "memory(GiB)": 38.05, "reward": 0.33946871757507324, "reward_std": 0.059388674795627594, "rewards/VisualizationJSONCombinedORM/mean": 0.33946871757507324, "rewards/VisualizationJSONCombinedORM/std": 0.11848150193691254, "step": 1957, "train_speed(iter/s)": 0.073448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 270.875, "completions/min_length": 229.0, "epoch": 1.619520264681555, "grad_norm": 0.18054930865764618, "kl": 0.08349609375, "learning_rate": 1.0627131610641829e-06, "loss": 0.0008330196142196655, "memory(GiB)": 38.05, "reward": 0.6768810749053955, "reward_std": 0.10809605568647385, "rewards/VisualizationJSONCombinedORM/mean": 0.6768810749053955, "rewards/VisualizationJSONCombinedORM/std": 0.10529821366071701, "step": 1958, "train_speed(iter/s)": 0.07342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/mean_length": 225.8125, "completions/min_length": 186.0, "epoch": 1.6203473945409428, "grad_norm": 0.15989133715629578, "kl": 0.03863525390625, "learning_rate": 1.058267861866969e-06, "loss": 0.0003867894411087036, "memory(GiB)": 38.05, "reward": 0.3294452726840973, "reward_std": 0.04921332746744156, "rewards/VisualizationJSONCombinedORM/mean": 0.3294452726840973, "rewards/VisualizationJSONCombinedORM/std": 0.048442378640174866, "step": 1959, "train_speed(iter/s)": 0.073394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 287.5, "completions/min_length": 212.0, "epoch": 1.6211745244003308, "grad_norm": 0.17537236213684082, "kl": 0.0496826171875, "learning_rate": 1.0538307788318014e-06, "loss": 0.0004968717694282532, "memory(GiB)": 38.05, "reward": 0.3136604428291321, "reward_std": 0.05068104714155197, "rewards/VisualizationJSONCombinedORM/mean": 0.3136604428291321, "rewards/VisualizationJSONCombinedORM/std": 0.1650664210319519, "step": 1960, "train_speed(iter/s)": 0.073359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 254.0, "completions/min_length": 205.0, "epoch": 1.6220016542597189, "grad_norm": 0.16307316720485687, "kl": 0.03802490234375, "learning_rate": 1.0494019212073547e-06, "loss": 0.00037971604615449905, "memory(GiB)": 38.05, "reward": 0.5133318305015564, "reward_std": 0.023040898144245148, "rewards/VisualizationJSONCombinedORM/mean": 0.5133318305015564, "rewards/VisualizationJSONCombinedORM/std": 0.2892351746559143, "step": 1961, "train_speed(iter/s)": 0.073334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 274.5, "completions/min_length": 252.0, "epoch": 1.6228287841191067, "grad_norm": 0.19541844725608826, "kl": 0.083984375, "learning_rate": 1.0449812982251556e-06, "loss": 0.0008415356278419495, "memory(GiB)": 38.05, "reward": 0.40830928087234497, "reward_std": 0.05429410934448242, "rewards/VisualizationJSONCombinedORM/mean": 0.40830928087234497, "rewards/VisualizationJSONCombinedORM/std": 0.1800464689731598, "step": 1962, "train_speed(iter/s)": 0.073306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 268.0625, "completions/min_length": 211.0, "epoch": 1.6236559139784945, "grad_norm": 0.1875351369380951, "kl": 0.03961181640625, "learning_rate": 1.0405689190995677e-06, "loss": 0.0003963503986597061, "memory(GiB)": 38.05, "reward": 0.5485666394233704, "reward_std": 0.06088430806994438, "rewards/VisualizationJSONCombinedORM/mean": 0.5485666394233704, "rewards/VisualizationJSONCombinedORM/std": 0.15172715485095978, "step": 1963, "train_speed(iter/s)": 0.073277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/mean_length": 255.375, "completions/min_length": 210.0, "epoch": 1.6244830438378826, "grad_norm": 0.1952178031206131, "kl": 0.0980224609375, "learning_rate": 1.0361647930277719e-06, "loss": 0.0009820256382226944, "memory(GiB)": 38.05, "reward": 0.5591988563537598, "reward_std": 0.11205226182937622, "rewards/VisualizationJSONCombinedORM/mean": 0.5591988563537598, "rewards/VisualizationJSONCombinedORM/std": 0.16581016778945923, "step": 1964, "train_speed(iter/s)": 0.073256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/mean_length": 247.0625, "completions/min_length": 213.0, "epoch": 1.6253101736972706, "grad_norm": 0.15871475636959076, "kl": 0.04998779296875, "learning_rate": 1.0317689291897465e-06, "loss": 0.0004996210336685181, "memory(GiB)": 38.05, "reward": 0.5207897424697876, "reward_std": 0.06335241347551346, "rewards/VisualizationJSONCombinedORM/mean": 0.5207897424697876, "rewards/VisualizationJSONCombinedORM/std": 0.08960990607738495, "step": 1965, "train_speed(iter/s)": 0.073235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 258.25, "completions/min_length": 217.0, "epoch": 1.6261373035566584, "grad_norm": 0.18636876344680786, "kl": 0.078369140625, "learning_rate": 1.02738133674825e-06, "loss": 0.0007825605571269989, "memory(GiB)": 38.05, "reward": 0.5809142589569092, "reward_std": 0.08825360238552094, "rewards/VisualizationJSONCombinedORM/mean": 0.5809142589569092, "rewards/VisualizationJSONCombinedORM/std": 0.13046810030937195, "step": 1966, "train_speed(iter/s)": 0.073197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 252.9375, "completions/min_length": 221.0, "epoch": 1.6269644334160462, "grad_norm": 0.18351654708385468, "kl": 0.04559326171875, "learning_rate": 1.0230020248487922e-06, "loss": 0.00045643001794815063, "memory(GiB)": 38.05, "reward": 0.5390210747718811, "reward_std": 0.05238718539476395, "rewards/VisualizationJSONCombinedORM/mean": 0.5390210747718811, "rewards/VisualizationJSONCombinedORM/std": 0.15373556315898895, "step": 1967, "train_speed(iter/s)": 0.073156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 247.75, "completions/min_length": 204.0, "epoch": 1.6277915632754343, "grad_norm": 0.17369413375854492, "kl": 0.042205810546875, "learning_rate": 1.01863100261963e-06, "loss": 0.0004222691059112549, "memory(GiB)": 38.05, "reward": 0.6811869740486145, "reward_std": 0.09279809892177582, "rewards/VisualizationJSONCombinedORM/mean": 0.6811869740486145, "rewards/VisualizationJSONCombinedORM/std": 0.1748819500207901, "step": 1968, "train_speed(iter/s)": 0.073117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 261.0, "completions/min_length": 223.0, "epoch": 1.6286186931348223, "grad_norm": 0.147862046957016, "kl": 0.033203125, "learning_rate": 1.0142682791717406e-06, "loss": 0.00033255666494369507, "memory(GiB)": 38.05, "reward": 0.687221348285675, "reward_std": 0.042864974588155746, "rewards/VisualizationJSONCombinedORM/mean": 0.687221348285675, "rewards/VisualizationJSONCombinedORM/std": 0.1406707465648651, "step": 1969, "train_speed(iter/s)": 0.073073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 266.0625, "completions/min_length": 218.0, "epoch": 1.6294458229942101, "grad_norm": 0.1596897840499878, "kl": 0.0283203125, "learning_rate": 1.0099138635988026e-06, "loss": 0.0002833940088748932, "memory(GiB)": 38.05, "reward": 0.7137625813484192, "reward_std": 0.05776417255401611, "rewards/VisualizationJSONCombinedORM/mean": 0.7137625813484192, "rewards/VisualizationJSONCombinedORM/std": 0.10744277387857437, "step": 1970, "train_speed(iter/s)": 0.073041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 263.3125, "completions/min_length": 199.0, "epoch": 1.630272952853598, "grad_norm": 0.1499875783920288, "kl": 0.0743408203125, "learning_rate": 1.005567764977176e-06, "loss": 0.0007445588707923889, "memory(GiB)": 38.05, "reward": 0.3753618001937866, "reward_std": 0.04368015006184578, "rewards/VisualizationJSONCombinedORM/mean": 0.3753618001937866, "rewards/VisualizationJSONCombinedORM/std": 0.11171577870845795, "step": 1971, "train_speed(iter/s)": 0.072999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 272.0625, "completions/min_length": 225.0, "epoch": 1.6311000827129858, "grad_norm": 0.16857659816741943, "kl": 0.11279296875, "learning_rate": 1.0012299923658848e-06, "loss": 0.0011312700808048248, "memory(GiB)": 38.05, "reward": 0.4512142539024353, "reward_std": 0.06880561262369156, "rewards/VisualizationJSONCombinedORM/mean": 0.4512142539024353, "rewards/VisualizationJSONCombinedORM/std": 0.22803710401058197, "step": 1972, "train_speed(iter/s)": 0.072973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 278.8125, "completions/min_length": 228.0, "epoch": 1.6319272125723738, "grad_norm": 0.2222621738910675, "kl": 0.07861328125, "learning_rate": 9.969005548065997e-07, "loss": 0.0007854178547859192, "memory(GiB)": 38.05, "reward": 0.5342662334442139, "reward_std": 0.06684185564517975, "rewards/VisualizationJSONCombinedORM/mean": 0.5342662334442139, "rewards/VisualizationJSONCombinedORM/std": 0.1949404627084732, "step": 1973, "train_speed(iter/s)": 0.07294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 256.0625, "completions/min_length": 191.0, "epoch": 1.6327543424317619, "grad_norm": 0.1718185693025589, "kl": 0.03985595703125, "learning_rate": 9.925794613236201e-07, "loss": 0.000398438423871994, "memory(GiB)": 38.05, "reward": 0.4859107732772827, "reward_std": 0.04100123047828674, "rewards/VisualizationJSONCombinedORM/mean": 0.4859107732772827, "rewards/VisualizationJSONCombinedORM/std": 0.2035137414932251, "step": 1974, "train_speed(iter/s)": 0.072908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 284.125, "completions/min_length": 232.0, "epoch": 1.6335814722911497, "grad_norm": 0.17658443748950958, "kl": 0.08837890625, "learning_rate": 9.882667209238484e-07, "loss": 0.0008860919624567032, "memory(GiB)": 38.05, "reward": 0.32336074113845825, "reward_std": 0.04159119352698326, "rewards/VisualizationJSONCombinedORM/mean": 0.32336074113845825, "rewards/VisualizationJSONCombinedORM/std": 0.1541656255722046, "step": 1975, "train_speed(iter/s)": 0.072882 }, { "epoch": 1.6335814722911497, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 314.375, "eval_completions/mean_length": 266.375, "eval_completions/min_length": 230.70833333333334, "eval_kl": 0.052907307942708336, "eval_loss": 0.0005304596270434558, "eval_reward": 0.43936022495230037, "eval_reward_std": 0.059931959762858845, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.43936022495230037, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05993196088820696, "eval_runtime": 280.7831, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.011, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 269.4375, "completions/min_length": 228.0, "epoch": 1.6344086021505375, "grad_norm": 0.21337495744228363, "kl": 0.07403564453125, "learning_rate": 9.83962342596776e-07, "loss": 0.0007406808435916901, "memory(GiB)": 38.05, "reward": 0.44071775674819946, "reward_std": 0.08311028778553009, "rewards/VisualizationJSONCombinedORM/mean": 0.44071775674819946, "rewards/VisualizationJSONCombinedORM/std": 0.26058775186538696, "step": 1976, "train_speed(iter/s)": 0.072108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 243.5625, "completions/min_length": 213.0, "epoch": 1.6352357320099256, "grad_norm": 0.1419641673564911, "kl": 0.047119140625, "learning_rate": 9.79666335314468e-07, "loss": 0.0004718005657196045, "memory(GiB)": 38.05, "reward": 0.3750923275947571, "reward_std": 0.04898969829082489, "rewards/VisualizationJSONCombinedORM/mean": 0.3750923275947571, "rewards/VisualizationJSONCombinedORM/std": 0.05035179853439331, "step": 1977, "train_speed(iter/s)": 0.07208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 254.125, "completions/min_length": 206.0, "epoch": 1.6360628618693136, "grad_norm": 0.17774690687656403, "kl": 0.0667724609375, "learning_rate": 9.753787080315385e-07, "loss": 0.0006676949560642242, "memory(GiB)": 38.05, "reward": 0.6280788779258728, "reward_std": 0.1106138676404953, "rewards/VisualizationJSONCombinedORM/mean": 0.6280788779258728, "rewards/VisualizationJSONCombinedORM/std": 0.14378321170806885, "step": 1978, "train_speed(iter/s)": 0.072061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/mean_length": 242.125, "completions/min_length": 188.0, "epoch": 1.6368899917287014, "grad_norm": 0.17222484946250916, "kl": 0.04443359375, "learning_rate": 9.710994696851372e-07, "loss": 0.00044416263699531555, "memory(GiB)": 38.05, "reward": 0.3074951767921448, "reward_std": 0.04242474585771561, "rewards/VisualizationJSONCombinedORM/mean": 0.3074951767921448, "rewards/VisualizationJSONCombinedORM/std": 0.12150020152330399, "step": 1979, "train_speed(iter/s)": 0.072037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 275.0625, "completions/min_length": 222.0, "epoch": 1.6377171215880892, "grad_norm": 0.1749914586544037, "kl": 0.1123046875, "learning_rate": 9.668286291949224e-07, "loss": 0.0011215880513191223, "memory(GiB)": 38.05, "reward": 0.25812631845474243, "reward_std": 0.03788295015692711, "rewards/VisualizationJSONCombinedORM/mean": 0.25812631845474243, "rewards/VisualizationJSONCombinedORM/std": 0.06656999886035919, "step": 1980, "train_speed(iter/s)": 0.071999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 246.25, "completions/min_length": 209.0, "epoch": 1.6385442514474773, "grad_norm": 0.1646183580160141, "kl": 0.029144287109375, "learning_rate": 9.62566195463051e-07, "loss": 0.00029195845127105713, "memory(GiB)": 38.05, "reward": 0.787129819393158, "reward_std": 0.08449900895357132, "rewards/VisualizationJSONCombinedORM/mean": 0.787129819393158, "rewards/VisualizationJSONCombinedORM/std": 0.09139508008956909, "step": 1981, "train_speed(iter/s)": 0.07197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 278.625, "completions/min_length": 257.0, "epoch": 1.6393713813068653, "grad_norm": 0.20928984880447388, "kl": 0.0477294921875, "learning_rate": 9.583121773741571e-07, "loss": 0.00047706998884677887, "memory(GiB)": 38.05, "reward": 0.5662167072296143, "reward_std": 0.06372609734535217, "rewards/VisualizationJSONCombinedORM/mean": 0.5662167072296143, "rewards/VisualizationJSONCombinedORM/std": 0.0732676163315773, "step": 1982, "train_speed(iter/s)": 0.071942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 251.4375, "completions/min_length": 199.0, "epoch": 1.6401985111662531, "grad_norm": 0.21448896825313568, "kl": 0.08062744140625, "learning_rate": 9.540665837953344e-07, "loss": 0.000805392861366272, "memory(GiB)": 38.05, "reward": 0.3315974473953247, "reward_std": 0.07911811769008636, "rewards/VisualizationJSONCombinedORM/mean": 0.3315974473953247, "rewards/VisualizationJSONCombinedORM/std": 0.09673821181058884, "step": 1983, "train_speed(iter/s)": 0.071912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 257.25, "completions/min_length": 198.0, "epoch": 1.641025641025641, "grad_norm": 0.20673836767673492, "kl": 0.069854736328125, "learning_rate": 9.498294235761141e-07, "loss": 0.0006986372172832489, "memory(GiB)": 38.05, "reward": 0.5356948375701904, "reward_std": 0.13231393694877625, "rewards/VisualizationJSONCombinedORM/mean": 0.5356948375701904, "rewards/VisualizationJSONCombinedORM/std": 0.1439364105463028, "step": 1984, "train_speed(iter/s)": 0.071891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 249.75, "completions/min_length": 206.0, "epoch": 1.6418527708850288, "grad_norm": 0.21610556542873383, "kl": 0.052490234375, "learning_rate": 9.456007055484478e-07, "loss": 0.0005259662866592407, "memory(GiB)": 38.05, "reward": 0.48413214087486267, "reward_std": 0.05249105021357536, "rewards/VisualizationJSONCombinedORM/mean": 0.48413214087486267, "rewards/VisualizationJSONCombinedORM/std": 0.14022794365882874, "step": 1985, "train_speed(iter/s)": 0.071854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 277.75, "completions/min_length": 227.0, "epoch": 1.6426799007444168, "grad_norm": 0.18580512702465057, "kl": 0.0836181640625, "learning_rate": 9.41380438526694e-07, "loss": 0.0008343160152435303, "memory(GiB)": 38.05, "reward": 0.46938830614089966, "reward_std": 0.06214170530438423, "rewards/VisualizationJSONCombinedORM/mean": 0.46938830614089966, "rewards/VisualizationJSONCombinedORM/std": 0.16064289212226868, "step": 1986, "train_speed(iter/s)": 0.071827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 261.8125, "completions/min_length": 224.0, "epoch": 1.6435070306038049, "grad_norm": 0.16693663597106934, "kl": 0.059326171875, "learning_rate": 9.371686313075945e-07, "loss": 0.0005942285060882568, "memory(GiB)": 38.05, "reward": 0.5385878086090088, "reward_std": 0.05676552280783653, "rewards/VisualizationJSONCombinedORM/mean": 0.5385878086090088, "rewards/VisualizationJSONCombinedORM/std": 0.1624668389558792, "step": 1987, "train_speed(iter/s)": 0.0718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 270.875, "completions/min_length": 232.0, "epoch": 1.6443341604631927, "grad_norm": 0.16611379384994507, "kl": 0.02117919921875, "learning_rate": 9.329652926702559e-07, "loss": 0.00021005794405937195, "memory(GiB)": 38.05, "reward": 0.8031226396560669, "reward_std": 0.03410422429442406, "rewards/VisualizationJSONCombinedORM/mean": 0.8031226396560669, "rewards/VisualizationJSONCombinedORM/std": 0.05740325525403023, "step": 1988, "train_speed(iter/s)": 0.071771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 284.375, "completions/min_length": 233.0, "epoch": 1.6451612903225805, "grad_norm": 0.17354261875152588, "kl": 0.05181884765625, "learning_rate": 9.287704313761325e-07, "loss": 0.0005184486508369446, "memory(GiB)": 38.05, "reward": 0.5829366445541382, "reward_std": 0.04985826835036278, "rewards/VisualizationJSONCombinedORM/mean": 0.5829366445541382, "rewards/VisualizationJSONCombinedORM/std": 0.18601933121681213, "step": 1989, "train_speed(iter/s)": 0.071754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 252.75, "completions/min_length": 215.0, "epoch": 1.6459884201819686, "grad_norm": 0.15157370269298553, "kl": 0.03363037109375, "learning_rate": 9.245840561690117e-07, "loss": 0.0003366917371749878, "memory(GiB)": 38.05, "reward": 0.5228365659713745, "reward_std": 0.06209006905555725, "rewards/VisualizationJSONCombinedORM/mean": 0.5228365659713745, "rewards/VisualizationJSONCombinedORM/std": 0.12145289778709412, "step": 1990, "train_speed(iter/s)": 0.07173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 255.1875, "completions/min_length": 214.0, "epoch": 1.6468155500413566, "grad_norm": 0.18627747893333435, "kl": 0.0477294921875, "learning_rate": 9.204061757749911e-07, "loss": 0.00047688931226730347, "memory(GiB)": 38.05, "reward": 0.6193461418151855, "reward_std": 0.09217848628759384, "rewards/VisualizationJSONCombinedORM/mean": 0.6193461418151855, "rewards/VisualizationJSONCombinedORM/std": 0.11532916873693466, "step": 1991, "train_speed(iter/s)": 0.071696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 279.1875, "completions/min_length": 233.0, "epoch": 1.6476426799007444, "grad_norm": 0.1542218029499054, "kl": 0.08990478515625, "learning_rate": 9.162367989024584e-07, "loss": 0.0008981749415397644, "memory(GiB)": 38.05, "reward": 0.594233512878418, "reward_std": 0.049851901829242706, "rewards/VisualizationJSONCombinedORM/mean": 0.594233512878418, "rewards/VisualizationJSONCombinedORM/std": 0.22503575682640076, "step": 1992, "train_speed(iter/s)": 0.071667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 271.25, "completions/min_length": 223.0, "epoch": 1.6484698097601322, "grad_norm": 0.2514656186103821, "kl": 0.04254150390625, "learning_rate": 9.120759342420821e-07, "loss": 0.00042522698640823364, "memory(GiB)": 38.05, "reward": 0.5617709755897522, "reward_std": 0.07018357515335083, "rewards/VisualizationJSONCombinedORM/mean": 0.5617709755897522, "rewards/VisualizationJSONCombinedORM/std": 0.13112013041973114, "step": 1993, "train_speed(iter/s)": 0.071625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 290.5625, "completions/min_length": 223.0, "epoch": 1.6492969396195203, "grad_norm": 0.15546883642673492, "kl": 0.04193115234375, "learning_rate": 9.079235904667826e-07, "loss": 0.0004177577793598175, "memory(GiB)": 38.05, "reward": 0.5064414143562317, "reward_std": 0.03663162887096405, "rewards/VisualizationJSONCombinedORM/mean": 0.5064414143562317, "rewards/VisualizationJSONCombinedORM/std": 0.2200344353914261, "step": 1994, "train_speed(iter/s)": 0.07159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 277.5625, "completions/min_length": 215.0, "epoch": 1.6501240694789083, "grad_norm": 0.19655685126781464, "kl": 0.03631591796875, "learning_rate": 9.03779776231723e-07, "loss": 0.0003625210374593735, "memory(GiB)": 38.05, "reward": 0.4518927335739136, "reward_std": 0.060453496873378754, "rewards/VisualizationJSONCombinedORM/mean": 0.4518927335739136, "rewards/VisualizationJSONCombinedORM/std": 0.22587081789970398, "step": 1995, "train_speed(iter/s)": 0.071567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 279.6875, "completions/min_length": 235.0, "epoch": 1.6509511993382961, "grad_norm": 0.21361143887043, "kl": 0.0810546875, "learning_rate": 8.996445001742871e-07, "loss": 0.0008118264377117157, "memory(GiB)": 38.05, "reward": 0.5078812837600708, "reward_std": 0.0902336910367012, "rewards/VisualizationJSONCombinedORM/mean": 0.5078812837600708, "rewards/VisualizationJSONCombinedORM/std": 0.19705823063850403, "step": 1996, "train_speed(iter/s)": 0.071542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 259.8125, "completions/min_length": 214.0, "epoch": 1.651778329197684, "grad_norm": 0.1860632747411728, "kl": 0.032073974609375, "learning_rate": 8.955177709140594e-07, "loss": 0.0003213360905647278, "memory(GiB)": 38.05, "reward": 0.6203728914260864, "reward_std": 0.06328220665454865, "rewards/VisualizationJSONCombinedORM/mean": 0.6203728914260864, "rewards/VisualizationJSONCombinedORM/std": 0.06840847432613373, "step": 1997, "train_speed(iter/s)": 0.071519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 272.8125, "completions/min_length": 227.0, "epoch": 1.652605459057072, "grad_norm": 0.18544599413871765, "kl": 0.1063232421875, "learning_rate": 8.913995970528089e-07, "loss": 0.0010628625750541687, "memory(GiB)": 38.05, "reward": 0.5022841095924377, "reward_std": 0.04637262225151062, "rewards/VisualizationJSONCombinedORM/mean": 0.5022841095924377, "rewards/VisualizationJSONCombinedORM/std": 0.23536494374275208, "step": 1998, "train_speed(iter/s)": 0.071475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 263.8125, "completions/min_length": 226.0, "epoch": 1.6534325889164598, "grad_norm": 0.19584064185619354, "kl": 0.0499267578125, "learning_rate": 8.872899871744756e-07, "loss": 0.0004989355802536011, "memory(GiB)": 38.05, "reward": 0.5956588983535767, "reward_std": 0.07375949621200562, "rewards/VisualizationJSONCombinedORM/mean": 0.5956588983535767, "rewards/VisualizationJSONCombinedORM/std": 0.14259622991085052, "step": 1999, "train_speed(iter/s)": 0.071435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 257.25, "completions/min_length": 216.0, "epoch": 1.6542597187758479, "grad_norm": 0.1988021433353424, "kl": 0.0570068359375, "learning_rate": 8.831889498451474e-07, "loss": 0.0005706623196601868, "memory(GiB)": 38.05, "reward": 0.53490149974823, "reward_std": 0.12428125739097595, "rewards/VisualizationJSONCombinedORM/mean": 0.53490149974823, "rewards/VisualizationJSONCombinedORM/std": 0.12156850844621658, "step": 2000, "train_speed(iter/s)": 0.07141 }, { "epoch": 1.6542597187758479, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 320.6666666666667, "eval_completions/mean_length": 266.875, "eval_completions/min_length": 224.29166666666666, "eval_kl": 0.062367757161458336, "eval_loss": 0.0006256730412133038, "eval_reward": 0.4472664737453063, "eval_reward_std": 0.06813629204407334, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4472664737453063, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06813629282017548, "eval_runtime": 284.3218, "eval_samples_per_second": 0.084, "eval_steps_per_second": 0.011, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 264.9375, "completions/min_length": 216.0, "epoch": 1.6550868486352357, "grad_norm": 0.2034977227449417, "kl": 0.04449462890625, "learning_rate": 8.790964936130403e-07, "loss": 0.00044557638466358185, "memory(GiB)": 38.05, "reward": 0.523072361946106, "reward_std": 0.08162941038608551, "rewards/VisualizationJSONCombinedORM/mean": 0.523072361946106, "rewards/VisualizationJSONCombinedORM/std": 0.09676504880189896, "step": 2001, "train_speed(iter/s)": 0.070653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 280.75, "completions/min_length": 213.0, "epoch": 1.6559139784946235, "grad_norm": 0.1653233915567398, "kl": 0.038726806640625, "learning_rate": 8.750126270084891e-07, "loss": 0.0003878101706504822, "memory(GiB)": 38.05, "reward": 0.7584062218666077, "reward_std": 0.08016609400510788, "rewards/VisualizationJSONCombinedORM/mean": 0.7584062218666077, "rewards/VisualizationJSONCombinedORM/std": 0.09876316040754318, "step": 2002, "train_speed(iter/s)": 0.07063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 249.875, "completions/min_length": 201.0, "epoch": 1.6567411083540116, "grad_norm": 0.19925180077552795, "kl": 0.0625, "learning_rate": 8.70937358543918e-07, "loss": 0.0006249174475669861, "memory(GiB)": 38.05, "reward": 0.3927125930786133, "reward_std": 0.04974500834941864, "rewards/VisualizationJSONCombinedORM/mean": 0.3927125930786133, "rewards/VisualizationJSONCombinedORM/std": 0.05078856647014618, "step": 2003, "train_speed(iter/s)": 0.070615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 253.6875, "completions/min_length": 208.0, "epoch": 1.6575682382133996, "grad_norm": 0.18205127120018005, "kl": 0.025726318359375, "learning_rate": 8.668706967138363e-07, "loss": 0.0002565234899520874, "memory(GiB)": 38.05, "reward": 0.7090717554092407, "reward_std": 0.044927097856998444, "rewards/VisualizationJSONCombinedORM/mean": 0.7090717554092407, "rewards/VisualizationJSONCombinedORM/std": 0.14271581172943115, "step": 2004, "train_speed(iter/s)": 0.070572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 240.6875, "completions/min_length": 207.0, "epoch": 1.6583953680727874, "grad_norm": 0.1635514348745346, "kl": 0.04193115234375, "learning_rate": 8.62812649994807e-07, "loss": 0.00041959062218666077, "memory(GiB)": 38.05, "reward": 0.720180869102478, "reward_std": 0.058812446892261505, "rewards/VisualizationJSONCombinedORM/mean": 0.720180869102478, "rewards/VisualizationJSONCombinedORM/std": 0.06474584341049194, "step": 2005, "train_speed(iter/s)": 0.070536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 289.25, "completions/min_length": 208.0, "epoch": 1.6592224979321752, "grad_norm": 0.24479557573795319, "kl": 0.0697021484375, "learning_rate": 8.587632268454405e-07, "loss": 0.0006951652467250824, "memory(GiB)": 38.05, "reward": 0.5449235439300537, "reward_std": 0.07536457479000092, "rewards/VisualizationJSONCombinedORM/mean": 0.5449235439300537, "rewards/VisualizationJSONCombinedORM/std": 0.1401197910308838, "step": 2006, "train_speed(iter/s)": 0.070508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 284.0, "completions/min_length": 238.0, "epoch": 1.6600496277915633, "grad_norm": 0.21892209351062775, "kl": 0.03363037109375, "learning_rate": 8.547224357063694e-07, "loss": 0.00033612921833992004, "memory(GiB)": 38.05, "reward": 0.4409700632095337, "reward_std": 0.07728073000907898, "rewards/VisualizationJSONCombinedORM/mean": 0.4409700632095337, "rewards/VisualizationJSONCombinedORM/std": 0.16160839796066284, "step": 2007, "train_speed(iter/s)": 0.070478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 272.125, "completions/min_length": 218.0, "epoch": 1.6608767576509513, "grad_norm": 0.16741988062858582, "kl": 0.067626953125, "learning_rate": 8.506902850002358e-07, "loss": 0.0006764158606529236, "memory(GiB)": 38.05, "reward": 0.6946296691894531, "reward_std": 0.07010668516159058, "rewards/VisualizationJSONCombinedORM/mean": 0.6946296691894531, "rewards/VisualizationJSONCombinedORM/std": 0.06939677149057388, "step": 2008, "train_speed(iter/s)": 0.070452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 278.5, "completions/min_length": 207.0, "epoch": 1.6617038875103392, "grad_norm": 0.17994526028633118, "kl": 0.07611083984375, "learning_rate": 8.466667831316693e-07, "loss": 0.0007621124386787415, "memory(GiB)": 38.05, "reward": 0.2927936315536499, "reward_std": 0.04076500982046127, "rewards/VisualizationJSONCombinedORM/mean": 0.2927936315536499, "rewards/VisualizationJSONCombinedORM/std": 0.08016735315322876, "step": 2009, "train_speed(iter/s)": 0.070428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/mean_length": 248.5, "completions/min_length": 218.0, "epoch": 1.662531017369727, "grad_norm": 0.24590028822422028, "kl": 0.0496826171875, "learning_rate": 8.426519384872733e-07, "loss": 0.000496596097946167, "memory(GiB)": 38.05, "reward": 0.3977199196815491, "reward_std": 0.05453885346651077, "rewards/VisualizationJSONCombinedORM/mean": 0.3977199196815491, "rewards/VisualizationJSONCombinedORM/std": 0.06614191085100174, "step": 2010, "train_speed(iter/s)": 0.070408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 287.4375, "completions/min_length": 234.0, "epoch": 1.663358147229115, "grad_norm": 0.17912857234477997, "kl": 0.083251953125, "learning_rate": 8.386457594356078e-07, "loss": 0.0008318312466144562, "memory(GiB)": 38.05, "reward": 0.6961261034011841, "reward_std": 0.1739618182182312, "rewards/VisualizationJSONCombinedORM/mean": 0.6961261034011841, "rewards/VisualizationJSONCombinedORM/std": 0.20138317346572876, "step": 2011, "train_speed(iter/s)": 0.070377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 267.75, "completions/min_length": 237.0, "epoch": 1.664185277088503, "grad_norm": 0.16599774360656738, "kl": 0.0841064453125, "learning_rate": 8.346482543271656e-07, "loss": 0.000841144472360611, "memory(GiB)": 38.05, "reward": 0.5759405493736267, "reward_std": 0.08417586982250214, "rewards/VisualizationJSONCombinedORM/mean": 0.5759405493736267, "rewards/VisualizationJSONCombinedORM/std": 0.11197438836097717, "step": 2012, "train_speed(iter/s)": 0.070354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 271.75, "completions/min_length": 198.0, "epoch": 1.6650124069478909, "grad_norm": 0.18478482961654663, "kl": 0.02716064453125, "learning_rate": 8.306594314943645e-07, "loss": 0.0002715848386287689, "memory(GiB)": 38.05, "reward": 0.5622313022613525, "reward_std": 0.06555014103651047, "rewards/VisualizationJSONCombinedORM/mean": 0.5622313022613525, "rewards/VisualizationJSONCombinedORM/std": 0.11169900745153427, "step": 2013, "train_speed(iter/s)": 0.070328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 257.1875, "completions/min_length": 217.0, "epoch": 1.6658395368072787, "grad_norm": 0.1903015673160553, "kl": 0.0477294921875, "learning_rate": 8.266792992515199e-07, "loss": 0.0004765614867210388, "memory(GiB)": 38.05, "reward": 0.4513545036315918, "reward_std": 0.083269402384758, "rewards/VisualizationJSONCombinedORM/mean": 0.4513545036315918, "rewards/VisualizationJSONCombinedORM/std": 0.11448349803686142, "step": 2014, "train_speed(iter/s)": 0.070309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 252.6875, "completions/min_length": 220.0, "epoch": 1.6666666666666665, "grad_norm": 0.18884877860546112, "kl": 0.08074951171875, "learning_rate": 8.227078658948385e-07, "loss": 0.000806860625743866, "memory(GiB)": 38.05, "reward": 0.5274593830108643, "reward_std": 0.10799603909254074, "rewards/VisualizationJSONCombinedORM/mean": 0.5274593830108643, "rewards/VisualizationJSONCombinedORM/std": 0.1323985904455185, "step": 2015, "train_speed(iter/s)": 0.070272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 287.8125, "completions/min_length": 237.0, "epoch": 1.6674937965260546, "grad_norm": 0.2138143628835678, "kl": 0.024810791015625, "learning_rate": 8.187451397023877e-07, "loss": 0.0002475399523973465, "memory(GiB)": 38.05, "reward": 0.5386313199996948, "reward_std": 0.06883422285318375, "rewards/VisualizationJSONCombinedORM/mean": 0.5386313199996948, "rewards/VisualizationJSONCombinedORM/std": 0.06907103210687637, "step": 2016, "train_speed(iter/s)": 0.070246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 267.4375, "completions/min_length": 218.0, "epoch": 1.6683209263854426, "grad_norm": 0.20687821507453918, "kl": 0.039459228515625, "learning_rate": 8.147911289340938e-07, "loss": 0.00039496785029768944, "memory(GiB)": 38.05, "reward": 0.47605466842651367, "reward_std": 0.0679188147187233, "rewards/VisualizationJSONCombinedORM/mean": 0.47605466842651367, "rewards/VisualizationJSONCombinedORM/std": 0.27781787514686584, "step": 2017, "train_speed(iter/s)": 0.070207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 270.5, "completions/min_length": 225.0, "epoch": 1.6691480562448304, "grad_norm": 0.1812269389629364, "kl": 0.0657958984375, "learning_rate": 8.108458418317089e-07, "loss": 0.0006579114124178886, "memory(GiB)": 38.05, "reward": 0.3945620059967041, "reward_std": 0.07618582993745804, "rewards/VisualizationJSONCombinedORM/mean": 0.3945620059967041, "rewards/VisualizationJSONCombinedORM/std": 0.18109571933746338, "step": 2018, "train_speed(iter/s)": 0.07017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 271.25, "completions/min_length": 223.0, "epoch": 1.6699751861042182, "grad_norm": 0.1971825808286667, "kl": 0.02911376953125, "learning_rate": 8.069092866188072e-07, "loss": 0.0002912282943725586, "memory(GiB)": 38.05, "reward": 0.45039159059524536, "reward_std": 0.04880125820636749, "rewards/VisualizationJSONCombinedORM/mean": 0.45039159059524536, "rewards/VisualizationJSONCombinedORM/std": 0.051502835005521774, "step": 2019, "train_speed(iter/s)": 0.070139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 257.4375, "completions/min_length": 220.0, "epoch": 1.6708023159636063, "grad_norm": 0.1913953274488449, "kl": 0.04559326171875, "learning_rate": 8.029814715007589e-07, "loss": 0.0004551932215690613, "memory(GiB)": 38.05, "reward": 0.3933674693107605, "reward_std": 0.05565662682056427, "rewards/VisualizationJSONCombinedORM/mean": 0.3933674693107605, "rewards/VisualizationJSONCombinedORM/std": 0.057624854147434235, "step": 2020, "train_speed(iter/s)": 0.07011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 257.8125, "completions/min_length": 197.0, "epoch": 1.6716294458229943, "grad_norm": 0.15343517065048218, "kl": 0.0645751953125, "learning_rate": 7.990624046647188e-07, "loss": 0.0006449297070503235, "memory(GiB)": 38.05, "reward": 0.538149356842041, "reward_std": 0.06766719371080399, "rewards/VisualizationJSONCombinedORM/mean": 0.538149356842041, "rewards/VisualizationJSONCombinedORM/std": 0.16868506371974945, "step": 2021, "train_speed(iter/s)": 0.070084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 237.8125, "completions/min_length": 197.0, "epoch": 1.6724565756823822, "grad_norm": 0.16426803171634674, "kl": 0.07598876953125, "learning_rate": 7.951520942796026e-07, "loss": 0.0007610209286212921, "memory(GiB)": 38.05, "reward": 0.6330669522285461, "reward_std": 0.10555579513311386, "rewards/VisualizationJSONCombinedORM/mean": 0.6330669522285461, "rewards/VisualizationJSONCombinedORM/std": 0.13085591793060303, "step": 2022, "train_speed(iter/s)": 0.070061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 273.8125, "completions/min_length": 218.0, "epoch": 1.67328370554177, "grad_norm": 0.1813337653875351, "kl": 0.0704345703125, "learning_rate": 7.912505484960792e-07, "loss": 0.0007058270275592804, "memory(GiB)": 38.05, "reward": 0.50185626745224, "reward_std": 0.05721091106534004, "rewards/VisualizationJSONCombinedORM/mean": 0.50185626745224, "rewards/VisualizationJSONCombinedORM/std": 0.22649534046649933, "step": 2023, "train_speed(iter/s)": 0.070039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 259.875, "completions/min_length": 217.0, "epoch": 1.674110835401158, "grad_norm": 0.1761866956949234, "kl": 0.05474853515625, "learning_rate": 7.873577754465456e-07, "loss": 0.0005485862493515015, "memory(GiB)": 38.05, "reward": 0.51530921459198, "reward_std": 0.04251505434513092, "rewards/VisualizationJSONCombinedORM/mean": 0.51530921459198, "rewards/VisualizationJSONCombinedORM/std": 0.19558598101139069, "step": 2024, "train_speed(iter/s)": 0.070012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 256.3125, "completions/min_length": 201.0, "epoch": 1.674937965260546, "grad_norm": 0.23522914946079254, "kl": 0.06280517578125, "learning_rate": 7.834737832451134e-07, "loss": 0.0006277859210968018, "memory(GiB)": 38.05, "reward": 0.6295351386070251, "reward_std": 0.09932701289653778, "rewards/VisualizationJSONCombinedORM/mean": 0.6295351386070251, "rewards/VisualizationJSONCombinedORM/std": 0.1551843285560608, "step": 2025, "train_speed(iter/s)": 0.069973 }, { "epoch": 1.674937965260546, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 322.0833333333333, "eval_completions/mean_length": 270.6041666666667, "eval_completions/min_length": 230.95833333333334, "eval_kl": 0.06753031412760417, "eval_loss": 0.0006785604055039585, "eval_reward": 0.4546626570324103, "eval_reward_std": 0.06385546596720815, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4546626570324103, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06385546868356566, "eval_runtime": 285.9302, "eval_samples_per_second": 0.084, "eval_steps_per_second": 0.01, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 295.4375, "completions/min_length": 239.0, "epoch": 1.6757650951199339, "grad_norm": 0.17175400257110596, "kl": 0.04644775390625, "learning_rate": 7.7959857998759e-07, "loss": 0.0004648156464099884, "memory(GiB)": 38.05, "reward": 0.5279556512832642, "reward_std": 0.062468647956848145, "rewards/VisualizationJSONCombinedORM/mean": 0.5279556512832642, "rewards/VisualizationJSONCombinedORM/std": 0.12553121149539948, "step": 2026, "train_speed(iter/s)": 0.069245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 236.4375, "completions/min_length": 194.0, "epoch": 1.6765922249793217, "grad_norm": 0.1847967952489853, "kl": 0.05517578125, "learning_rate": 7.757321737514645e-07, "loss": 0.0005517229437828064, "memory(GiB)": 38.05, "reward": 0.3701210618019104, "reward_std": 0.0477585643529892, "rewards/VisualizationJSONCombinedORM/mean": 0.3701210618019104, "rewards/VisualizationJSONCombinedORM/std": 0.04672330245375633, "step": 2027, "train_speed(iter/s)": 0.069214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 271.4375, "completions/min_length": 209.0, "epoch": 1.6774193548387095, "grad_norm": 0.2011280357837677, "kl": 0.0634765625, "learning_rate": 7.718745725958914e-07, "loss": 0.0006351694464683533, "memory(GiB)": 38.05, "reward": 0.5419626832008362, "reward_std": 0.08073756098747253, "rewards/VisualizationJSONCombinedORM/mean": 0.5419626832008362, "rewards/VisualizationJSONCombinedORM/std": 0.11412963271141052, "step": 2028, "train_speed(iter/s)": 0.069193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 261.75, "completions/min_length": 203.0, "epoch": 1.6782464846980976, "grad_norm": 0.16288848221302032, "kl": 0.053955078125, "learning_rate": 7.680257845616679e-07, "loss": 0.0005408879369497299, "memory(GiB)": 38.05, "reward": 0.603503406047821, "reward_std": 0.07410961389541626, "rewards/VisualizationJSONCombinedORM/mean": 0.603503406047821, "rewards/VisualizationJSONCombinedORM/std": 0.08823257684707642, "step": 2029, "train_speed(iter/s)": 0.069162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 253.625, "completions/min_length": 223.0, "epoch": 1.6790736145574856, "grad_norm": 0.17981858551502228, "kl": 0.0406494140625, "learning_rate": 7.641858176712241e-07, "loss": 0.00040712952613830566, "memory(GiB)": 38.05, "reward": 0.3988259434700012, "reward_std": 0.052738070487976074, "rewards/VisualizationJSONCombinedORM/mean": 0.3988259434700012, "rewards/VisualizationJSONCombinedORM/std": 0.11064736545085907, "step": 2030, "train_speed(iter/s)": 0.069135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 266.0625, "completions/min_length": 215.0, "epoch": 1.6799007444168734, "grad_norm": 0.1995408833026886, "kl": 0.084228515625, "learning_rate": 7.60354679928601e-07, "loss": 0.0008424445986747742, "memory(GiB)": 38.05, "reward": 0.5624133944511414, "reward_std": 0.0681106448173523, "rewards/VisualizationJSONCombinedORM/mean": 0.5624133944511414, "rewards/VisualizationJSONCombinedORM/std": 0.15155157446861267, "step": 2031, "train_speed(iter/s)": 0.069112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 254.875, "completions/min_length": 214.0, "epoch": 1.6807278742762612, "grad_norm": 0.16343791782855988, "kl": 0.1221923828125, "learning_rate": 7.565323793194373e-07, "loss": 0.0012210644781589508, "memory(GiB)": 38.05, "reward": 0.5889495611190796, "reward_std": 0.0835009515285492, "rewards/VisualizationJSONCombinedORM/mean": 0.5889495611190796, "rewards/VisualizationJSONCombinedORM/std": 0.15023267269134521, "step": 2032, "train_speed(iter/s)": 0.069082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 272.4375, "completions/min_length": 237.0, "epoch": 1.6815550041356493, "grad_norm": 0.1899004578590393, "kl": 0.07733154296875, "learning_rate": 7.527189238109539e-07, "loss": 0.0007727816700935364, "memory(GiB)": 38.05, "reward": 0.6818766593933105, "reward_std": 0.0963909924030304, "rewards/VisualizationJSONCombinedORM/mean": 0.6818766593933105, "rewards/VisualizationJSONCombinedORM/std": 0.09329945594072342, "step": 2033, "train_speed(iter/s)": 0.069062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 270.0625, "completions/min_length": 205.0, "epoch": 1.6823821339950373, "grad_norm": 0.17230089008808136, "kl": 0.09130859375, "learning_rate": 7.489143213519301e-07, "loss": 0.0009133480489253998, "memory(GiB)": 38.05, "reward": 0.5501214265823364, "reward_std": 0.08265897631645203, "rewards/VisualizationJSONCombinedORM/mean": 0.5501214265823364, "rewards/VisualizationJSONCombinedORM/std": 0.10839749872684479, "step": 2034, "train_speed(iter/s)": 0.069038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 270.25, "completions/min_length": 207.0, "epoch": 1.6832092638544252, "grad_norm": 0.16317960619926453, "kl": 0.07977294921875, "learning_rate": 7.451185798726939e-07, "loss": 0.0007948949933052063, "memory(GiB)": 38.05, "reward": 0.3644481897354126, "reward_std": 0.07684417068958282, "rewards/VisualizationJSONCombinedORM/mean": 0.3644481897354126, "rewards/VisualizationJSONCombinedORM/std": 0.11675868928432465, "step": 2035, "train_speed(iter/s)": 0.069014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 285.875, "completions/min_length": 223.0, "epoch": 1.684036393713813, "grad_norm": 0.18229259550571442, "kl": 0.06011962890625, "learning_rate": 7.413317072851051e-07, "loss": 0.0006005614995956421, "memory(GiB)": 38.05, "reward": 0.5657302141189575, "reward_std": 0.0715160071849823, "rewards/VisualizationJSONCombinedORM/mean": 0.5657302141189575, "rewards/VisualizationJSONCombinedORM/std": 0.18327663838863373, "step": 2036, "train_speed(iter/s)": 0.068982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 279.8125, "completions/min_length": 234.0, "epoch": 1.684863523573201, "grad_norm": 0.2063639909029007, "kl": 0.08538818359375, "learning_rate": 7.375537114825365e-07, "loss": 0.0008536316454410553, "memory(GiB)": 38.05, "reward": 0.4341779053211212, "reward_std": 0.0547567754983902, "rewards/VisualizationJSONCombinedORM/mean": 0.4341779053211212, "rewards/VisualizationJSONCombinedORM/std": 0.1859239935874939, "step": 2037, "train_speed(iter/s)": 0.068948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 264.5625, "completions/min_length": 225.0, "epoch": 1.685690653432589, "grad_norm": 0.23027992248535156, "kl": 0.0831298828125, "learning_rate": 7.337846003398568e-07, "loss": 0.0008314996957778931, "memory(GiB)": 38.05, "reward": 0.604469358921051, "reward_std": 0.12092694640159607, "rewards/VisualizationJSONCombinedORM/mean": 0.604469358921051, "rewards/VisualizationJSONCombinedORM/std": 0.1168542206287384, "step": 2038, "train_speed(iter/s)": 0.068928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 275.5, "completions/min_length": 221.0, "epoch": 1.6865177832919769, "grad_norm": 0.1535428911447525, "kl": 0.03631591796875, "learning_rate": 7.300243817134145e-07, "loss": 0.00036329030990600586, "memory(GiB)": 38.05, "reward": 0.7436704039573669, "reward_std": 0.09890174865722656, "rewards/VisualizationJSONCombinedORM/mean": 0.7436704039573669, "rewards/VisualizationJSONCombinedORM/std": 0.12861113250255585, "step": 2039, "train_speed(iter/s)": 0.06889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 292.5, "completions/min_length": 220.0, "epoch": 1.6873449131513647, "grad_norm": 0.15091989934444427, "kl": 0.04901123046875, "learning_rate": 7.262730634410259e-07, "loss": 0.0004897415637969971, "memory(GiB)": 38.05, "reward": 0.4700019061565399, "reward_std": 0.0670769214630127, "rewards/VisualizationJSONCombinedORM/mean": 0.4700019061565399, "rewards/VisualizationJSONCombinedORM/std": 0.26366859674453735, "step": 2040, "train_speed(iter/s)": 0.068856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 264.5625, "completions/min_length": 219.0, "epoch": 1.6881720430107527, "grad_norm": 0.2139509618282318, "kl": 0.168701171875, "learning_rate": 7.225306533419546e-07, "loss": 0.001687757670879364, "memory(GiB)": 38.05, "reward": 0.5646742582321167, "reward_std": 0.12342728674411774, "rewards/VisualizationJSONCombinedORM/mean": 0.5646742582321167, "rewards/VisualizationJSONCombinedORM/std": 0.13703294098377228, "step": 2041, "train_speed(iter/s)": 0.068843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 269.875, "completions/min_length": 214.0, "epoch": 1.6889991728701406, "grad_norm": 0.1824827641248703, "kl": 0.04217529296875, "learning_rate": 7.187971592168936e-07, "loss": 0.00042140111327171326, "memory(GiB)": 38.05, "reward": 0.47607094049453735, "reward_std": 0.06549697369337082, "rewards/VisualizationJSONCombinedORM/mean": 0.47607094049453735, "rewards/VisualizationJSONCombinedORM/std": 0.16405466198921204, "step": 2042, "train_speed(iter/s)": 0.068823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 244.9375, "completions/min_length": 212.0, "epoch": 1.6898263027295286, "grad_norm": 0.20787711441516876, "kl": 0.05633544921875, "learning_rate": 7.15072588847952e-07, "loss": 0.000563543289899826, "memory(GiB)": 38.05, "reward": 0.6602169275283813, "reward_std": 0.10755790770053864, "rewards/VisualizationJSONCombinedORM/mean": 0.6602169275283813, "rewards/VisualizationJSONCombinedORM/std": 0.11906149983406067, "step": 2043, "train_speed(iter/s)": 0.06879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 289.1875, "completions/min_length": 235.0, "epoch": 1.6906534325889164, "grad_norm": 0.1731332689523697, "kl": 0.06231689453125, "learning_rate": 7.113569499986401e-07, "loss": 0.0006227344274520874, "memory(GiB)": 38.05, "reward": 0.7635879516601562, "reward_std": 0.11591332405805588, "rewards/VisualizationJSONCombinedORM/mean": 0.7635879516601562, "rewards/VisualizationJSONCombinedORM/std": 0.11958542466163635, "step": 2044, "train_speed(iter/s)": 0.068776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 285.0, "completions/min_length": 228.0, "epoch": 1.6914805624483042, "grad_norm": 0.16930322349071503, "kl": 0.07550048828125, "learning_rate": 7.076502504138494e-07, "loss": 0.000753939151763916, "memory(GiB)": 38.05, "reward": 0.5978394150733948, "reward_std": 0.08473356068134308, "rewards/VisualizationJSONCombinedORM/mean": 0.5978394150733948, "rewards/VisualizationJSONCombinedORM/std": 0.14750643074512482, "step": 2045, "train_speed(iter/s)": 0.068745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 269.25, "completions/min_length": 217.0, "epoch": 1.6923076923076923, "grad_norm": 0.15080004930496216, "kl": 0.05999755859375, "learning_rate": 7.039524978198414e-07, "loss": 0.000600472092628479, "memory(GiB)": 38.05, "reward": 0.5553097724914551, "reward_std": 0.06715196371078491, "rewards/VisualizationJSONCombinedORM/mean": 0.5553097724914551, "rewards/VisualizationJSONCombinedORM/std": 0.0695699006319046, "step": 2046, "train_speed(iter/s)": 0.068724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 256.125, "completions/min_length": 213.0, "epoch": 1.6931348221670803, "grad_norm": 0.17910721898078918, "kl": 0.051513671875, "learning_rate": 7.002636999242252e-07, "loss": 0.0005169827491044998, "memory(GiB)": 38.05, "reward": 0.38234585523605347, "reward_std": 0.04472450166940689, "rewards/VisualizationJSONCombinedORM/mean": 0.38234585523605347, "rewards/VisualizationJSONCombinedORM/std": 0.20881806313991547, "step": 2047, "train_speed(iter/s)": 0.068699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 277.5, "completions/min_length": 204.0, "epoch": 1.6939619520264682, "grad_norm": 0.17599323391914368, "kl": 0.05987548828125, "learning_rate": 6.965838644159434e-07, "loss": 0.0005978569388389587, "memory(GiB)": 38.05, "reward": 0.39661794900894165, "reward_std": 0.05100813880562782, "rewards/VisualizationJSONCombinedORM/mean": 0.39661794900894165, "rewards/VisualizationJSONCombinedORM/std": 0.09741290658712387, "step": 2048, "train_speed(iter/s)": 0.068672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 268.3125, "completions/min_length": 199.0, "epoch": 1.694789081885856, "grad_norm": 0.14679664373397827, "kl": 0.038238525390625, "learning_rate": 6.929129989652617e-07, "loss": 0.00038196519017219543, "memory(GiB)": 38.05, "reward": 0.623079776763916, "reward_std": 0.10241255164146423, "rewards/VisualizationJSONCombinedORM/mean": 0.623079776763916, "rewards/VisualizationJSONCombinedORM/std": 0.1413193941116333, "step": 2049, "train_speed(iter/s)": 0.068642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 272.9375, "completions/min_length": 234.0, "epoch": 1.695616211745244, "grad_norm": 0.2147422432899475, "kl": 0.04620361328125, "learning_rate": 6.892511112237472e-07, "loss": 0.00046105682849884033, "memory(GiB)": 38.05, "reward": 0.42391133308410645, "reward_std": 0.07206134498119354, "rewards/VisualizationJSONCombinedORM/mean": 0.42391133308410645, "rewards/VisualizationJSONCombinedORM/std": 0.1896592676639557, "step": 2050, "train_speed(iter/s)": 0.068615 }, { "epoch": 1.695616211745244, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 315.9583333333333, "eval_completions/mean_length": 269.53125, "eval_completions/min_length": 231.04166666666666, "eval_kl": 0.062744140625, "eval_loss": 0.0006293226033449173, "eval_reward": 0.4577499274164438, "eval_reward_std": 0.056879002678518496, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4577499274164438, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05687899961291502, "eval_runtime": 281.6329, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.011, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 280.5, "completions/min_length": 200.0, "epoch": 1.696443341604632, "grad_norm": 0.16924820840358734, "kl": 0.05389404296875, "learning_rate": 6.855982088242524e-07, "loss": 0.000538162887096405, "memory(GiB)": 38.05, "reward": 0.5241116285324097, "reward_std": 0.06914041191339493, "rewards/VisualizationJSONCombinedORM/mean": 0.5241116285324097, "rewards/VisualizationJSONCombinedORM/std": 0.12928538024425507, "step": 2051, "train_speed(iter/s)": 0.06795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/mean_length": 263.1875, "completions/min_length": 239.0, "epoch": 1.6972704714640199, "grad_norm": 0.15629008412361145, "kl": 0.0411376953125, "learning_rate": 6.819542993809003e-07, "loss": 0.0004119556397199631, "memory(GiB)": 38.05, "reward": 0.49413543939590454, "reward_std": 0.05240984261035919, "rewards/VisualizationJSONCombinedORM/mean": 0.49413543939590454, "rewards/VisualizationJSONCombinedORM/std": 0.24211755394935608, "step": 2052, "train_speed(iter/s)": 0.067936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 259.375, "completions/min_length": 208.0, "epoch": 1.6980976013234077, "grad_norm": 0.21691417694091797, "kl": 0.023193359375, "learning_rate": 6.783193904890712e-07, "loss": 0.00023226439952850342, "memory(GiB)": 38.05, "reward": 0.7155195474624634, "reward_std": 0.0812983289361, "rewards/VisualizationJSONCombinedORM/mean": 0.7155195474624634, "rewards/VisualizationJSONCombinedORM/std": 0.09199045598506927, "step": 2053, "train_speed(iter/s)": 0.067912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 257.0625, "completions/min_length": 221.0, "epoch": 1.6989247311827957, "grad_norm": 0.16291923820972443, "kl": 0.044921875, "learning_rate": 6.746934897253832e-07, "loss": 0.00044840574264526367, "memory(GiB)": 38.05, "reward": 0.6987200379371643, "reward_std": 0.09600958228111267, "rewards/VisualizationJSONCombinedORM/mean": 0.6987200379371643, "rewards/VisualizationJSONCombinedORM/std": 0.1048927903175354, "step": 2054, "train_speed(iter/s)": 0.06788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 273.0, "completions/min_length": 199.0, "epoch": 1.6997518610421838, "grad_norm": 0.23697197437286377, "kl": 0.04388427734375, "learning_rate": 6.710766046476813e-07, "loss": 0.0004389360547065735, "memory(GiB)": 38.05, "reward": 0.5622898936271667, "reward_std": 0.09134864807128906, "rewards/VisualizationJSONCombinedORM/mean": 0.5622898936271667, "rewards/VisualizationJSONCombinedORM/std": 0.14884671568870544, "step": 2055, "train_speed(iter/s)": 0.067858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 261.5625, "completions/min_length": 216.0, "epoch": 1.7005789909015716, "grad_norm": 0.18258462846279144, "kl": 0.0482177734375, "learning_rate": 6.6746874279501e-07, "loss": 0.0004813149571418762, "memory(GiB)": 38.05, "reward": 0.6646877527236938, "reward_std": 0.06914711743593216, "rewards/VisualizationJSONCombinedORM/mean": 0.6646877527236938, "rewards/VisualizationJSONCombinedORM/std": 0.0746769905090332, "step": 2056, "train_speed(iter/s)": 0.067834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 259.5625, "completions/min_length": 216.0, "epoch": 1.7014061207609594, "grad_norm": 0.1977211982011795, "kl": 0.10986328125, "learning_rate": 6.638699116876129e-07, "loss": 0.0010987166315317154, "memory(GiB)": 38.05, "reward": 0.6357179880142212, "reward_std": 0.09549888968467712, "rewards/VisualizationJSONCombinedORM/mean": 0.6357179880142212, "rewards/VisualizationJSONCombinedORM/std": 0.2030271738767624, "step": 2057, "train_speed(iter/s)": 0.067813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 249.9375, "completions/min_length": 196.0, "epoch": 1.7022332506203472, "grad_norm": 0.18827442824840546, "kl": 0.03753662109375, "learning_rate": 6.602801188269081e-07, "loss": 0.00037516653537750244, "memory(GiB)": 38.05, "reward": 0.6442364454269409, "reward_std": 0.0786600261926651, "rewards/VisualizationJSONCombinedORM/mean": 0.6442364454269409, "rewards/VisualizationJSONCombinedORM/std": 0.15582509338855743, "step": 2058, "train_speed(iter/s)": 0.067791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 268.9375, "completions/min_length": 212.0, "epoch": 1.7030603804797353, "grad_norm": 0.1941530704498291, "kl": 0.027374267578125, "learning_rate": 6.566993716954751e-07, "loss": 0.0002735275775194168, "memory(GiB)": 38.05, "reward": 0.5878785252571106, "reward_std": 0.10017180442810059, "rewards/VisualizationJSONCombinedORM/mean": 0.5878785252571106, "rewards/VisualizationJSONCombinedORM/std": 0.12506413459777832, "step": 2059, "train_speed(iter/s)": 0.067769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 263.9375, "completions/min_length": 219.0, "epoch": 1.7038875103391233, "grad_norm": 0.180413618683815, "kl": 0.0821533203125, "learning_rate": 6.531276777570361e-07, "loss": 0.0008212365210056305, "memory(GiB)": 38.05, "reward": 0.4387744963169098, "reward_std": 0.06719913333654404, "rewards/VisualizationJSONCombinedORM/mean": 0.4387744963169098, "rewards/VisualizationJSONCombinedORM/std": 0.07453199476003647, "step": 2060, "train_speed(iter/s)": 0.067734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 257.25, "completions/min_length": 217.0, "epoch": 1.7047146401985112, "grad_norm": 0.20084325969219208, "kl": 0.06842041015625, "learning_rate": 6.495650444564433e-07, "loss": 0.0006846040487289429, "memory(GiB)": 38.05, "reward": 0.6736372709274292, "reward_std": 0.08327922224998474, "rewards/VisualizationJSONCombinedORM/mean": 0.6736372709274292, "rewards/VisualizationJSONCombinedORM/std": 0.0863715261220932, "step": 2061, "train_speed(iter/s)": 0.067718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 257.0625, "completions/min_length": 230.0, "epoch": 1.705541770057899, "grad_norm": 0.23424117267131805, "kl": 0.078857421875, "learning_rate": 6.460114792196642e-07, "loss": 0.0007898882031440735, "memory(GiB)": 38.05, "reward": 0.5052148103713989, "reward_std": 0.03958465903997421, "rewards/VisualizationJSONCombinedORM/mean": 0.5052148103713989, "rewards/VisualizationJSONCombinedORM/std": 0.15919627249240875, "step": 2062, "train_speed(iter/s)": 0.067704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 256.8125, "completions/min_length": 230.0, "epoch": 1.706368899917287, "grad_norm": 0.1722268909215927, "kl": 0.0523681640625, "learning_rate": 6.42466989453765e-07, "loss": 0.0005234815180301666, "memory(GiB)": 38.05, "reward": 0.549432098865509, "reward_std": 0.10091409832239151, "rewards/VisualizationJSONCombinedORM/mean": 0.549432098865509, "rewards/VisualizationJSONCombinedORM/std": 0.17500431835651398, "step": 2063, "train_speed(iter/s)": 0.067687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 250.125, "completions/min_length": 203.0, "epoch": 1.707196029776675, "grad_norm": 0.1700151413679123, "kl": 0.07452392578125, "learning_rate": 6.38931582546895e-07, "loss": 0.0007453709840774536, "memory(GiB)": 38.05, "reward": 0.6092579364776611, "reward_std": 0.05062129348516464, "rewards/VisualizationJSONCombinedORM/mean": 0.6092579364776611, "rewards/VisualizationJSONCombinedORM/std": 0.1381630003452301, "step": 2064, "train_speed(iter/s)": 0.067663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 292.0625, "completions/min_length": 233.0, "epoch": 1.7080231596360629, "grad_norm": 0.1832907795906067, "kl": 0.069580078125, "learning_rate": 6.354052658682675e-07, "loss": 0.0006949454545974731, "memory(GiB)": 38.05, "reward": 0.3912038207054138, "reward_std": 0.05704052001237869, "rewards/VisualizationJSONCombinedORM/mean": 0.3912038207054138, "rewards/VisualizationJSONCombinedORM/std": 0.12766240537166595, "step": 2065, "train_speed(iter/s)": 0.067634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 287.0, "completions/min_length": 234.0, "epoch": 1.7088502894954507, "grad_norm": 0.19896680116653442, "kl": 0.056396484375, "learning_rate": 6.318880467681527e-07, "loss": 0.0005639493465423584, "memory(GiB)": 38.05, "reward": 0.21656697988510132, "reward_std": 0.017340678721666336, "rewards/VisualizationJSONCombinedORM/mean": 0.21656697988510132, "rewards/VisualizationJSONCombinedORM/std": 0.05135639011859894, "step": 2066, "train_speed(iter/s)": 0.067609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 265.9375, "completions/min_length": 214.0, "epoch": 1.7096774193548387, "grad_norm": 0.1858862340450287, "kl": 0.04376220703125, "learning_rate": 6.28379932577855e-07, "loss": 0.0004375651478767395, "memory(GiB)": 38.05, "reward": 0.44719499349594116, "reward_std": 0.09487655758857727, "rewards/VisualizationJSONCombinedORM/mean": 0.44719499349594116, "rewards/VisualizationJSONCombinedORM/std": 0.15006378293037415, "step": 2067, "train_speed(iter/s)": 0.067583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 270.6875, "completions/min_length": 229.0, "epoch": 1.7105045492142268, "grad_norm": 0.1593867391347885, "kl": 0.05523681640625, "learning_rate": 6.248809306097036e-07, "loss": 0.0005524149164557457, "memory(GiB)": 38.05, "reward": 0.6432374715805054, "reward_std": 0.06689935177564621, "rewards/VisualizationJSONCombinedORM/mean": 0.6432374715805054, "rewards/VisualizationJSONCombinedORM/std": 0.10474736243486404, "step": 2068, "train_speed(iter/s)": 0.06757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 275.875, "completions/min_length": 222.0, "epoch": 1.7113316790736146, "grad_norm": 0.1646660566329956, "kl": 0.060546875, "learning_rate": 6.213910481570306e-07, "loss": 0.0006049582734704018, "memory(GiB)": 38.05, "reward": 0.6167011260986328, "reward_std": 0.08037789165973663, "rewards/VisualizationJSONCombinedORM/mean": 0.6167011260986328, "rewards/VisualizationJSONCombinedORM/std": 0.12844012677669525, "step": 2069, "train_speed(iter/s)": 0.067532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/mean_length": 236.5, "completions/min_length": 204.0, "epoch": 1.7121588089330024, "grad_norm": 0.18468263745307922, "kl": 0.032562255859375, "learning_rate": 6.179102924941599e-07, "loss": 0.0003259629011154175, "memory(GiB)": 38.05, "reward": 0.5537644624710083, "reward_std": 0.07708534598350525, "rewards/VisualizationJSONCombinedORM/mean": 0.5537644624710083, "rewards/VisualizationJSONCombinedORM/std": 0.08250455558300018, "step": 2070, "train_speed(iter/s)": 0.067518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 256.75, "completions/min_length": 201.0, "epoch": 1.7129859387923903, "grad_norm": 0.21572978794574738, "kl": 0.0966796875, "learning_rate": 6.144386708763933e-07, "loss": 0.0009685754776000977, "memory(GiB)": 38.05, "reward": 0.4129638671875, "reward_std": 0.08382662385702133, "rewards/VisualizationJSONCombinedORM/mean": 0.4129638671875, "rewards/VisualizationJSONCombinedORM/std": 0.1942332535982132, "step": 2071, "train_speed(iter/s)": 0.067499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 258.375, "completions/min_length": 209.0, "epoch": 1.7138130686517783, "grad_norm": 0.18704967200756073, "kl": 0.0621337890625, "learning_rate": 6.10976190539993e-07, "loss": 0.0006224177777767181, "memory(GiB)": 38.05, "reward": 0.6262751817703247, "reward_std": 0.07397051155567169, "rewards/VisualizationJSONCombinedORM/mean": 0.6262751817703247, "rewards/VisualizationJSONCombinedORM/std": 0.1434367150068283, "step": 2072, "train_speed(iter/s)": 0.067478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 268.9375, "completions/min_length": 227.0, "epoch": 1.7146401985111663, "grad_norm": 0.19538681209087372, "kl": 0.02435302734375, "learning_rate": 6.075228587021669e-07, "loss": 0.0002437680959701538, "memory(GiB)": 38.05, "reward": 0.5962032079696655, "reward_std": 0.06120359152555466, "rewards/VisualizationJSONCombinedORM/mean": 0.5962032079696655, "rewards/VisualizationJSONCombinedORM/std": 0.16475816071033478, "step": 2073, "train_speed(iter/s)": 0.067454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 258.375, "completions/min_length": 200.0, "epoch": 1.7154673283705542, "grad_norm": 0.1842128038406372, "kl": 0.0567626953125, "learning_rate": 6.040786825610518e-07, "loss": 0.0005680732429027557, "memory(GiB)": 38.05, "reward": 0.34013819694519043, "reward_std": 0.047016941010951996, "rewards/VisualizationJSONCombinedORM/mean": 0.34013819694519043, "rewards/VisualizationJSONCombinedORM/std": 0.18479163944721222, "step": 2074, "train_speed(iter/s)": 0.067432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 278.3125, "completions/min_length": 230.0, "epoch": 1.716294458229942, "grad_norm": 0.20245715975761414, "kl": 0.0728759765625, "learning_rate": 6.006436692957035e-07, "loss": 0.0007289499044418335, "memory(GiB)": 38.05, "reward": 0.29498425126075745, "reward_std": 0.03179905191063881, "rewards/VisualizationJSONCombinedORM/mean": 0.29498425126075745, "rewards/VisualizationJSONCombinedORM/std": 0.07962584495544434, "step": 2075, "train_speed(iter/s)": 0.067404 }, { "epoch": 1.716294458229942, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 310.2083333333333, "eval_completions/mean_length": 266.7239583333333, "eval_completions/min_length": 228.25, "eval_kl": 0.061894734700520836, "eval_loss": 0.0006200503557920456, "eval_reward": 0.4596848425765832, "eval_reward_std": 0.05901137824791173, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4596848425765832, "eval_rewards/VisualizationJSONCombinedORM/std": 0.059011381662761174, "eval_runtime": 279.3305, "eval_samples_per_second": 0.086, "eval_steps_per_second": 0.011, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 286.75, "completions/min_length": 237.0, "epoch": 1.71712158808933, "grad_norm": 0.1700281798839569, "kl": 0.07012939453125, "learning_rate": 5.972178260660771e-07, "loss": 0.000701315701007843, "memory(GiB)": 38.05, "reward": 0.566977322101593, "reward_std": 0.07410704344511032, "rewards/VisualizationJSONCombinedORM/mean": 0.566977322101593, "rewards/VisualizationJSONCombinedORM/std": 0.22047357261180878, "step": 2076, "train_speed(iter/s)": 0.066778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 278.0, "completions/min_length": 241.0, "epoch": 1.717948717948718, "grad_norm": 0.2192651331424713, "kl": 0.0594482421875, "learning_rate": 5.938011600130134e-07, "loss": 0.0005950927734375, "memory(GiB)": 38.05, "reward": 0.5672074556350708, "reward_std": 0.10674485564231873, "rewards/VisualizationJSONCombinedORM/mean": 0.5672074556350708, "rewards/VisualizationJSONCombinedORM/std": 0.1351759433746338, "step": 2077, "train_speed(iter/s)": 0.066758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 263.375, "completions/min_length": 239.0, "epoch": 1.7187758478081059, "grad_norm": 0.1579153835773468, "kl": 0.095703125, "learning_rate": 5.903936782582253e-07, "loss": 0.0009557418525218964, "memory(GiB)": 38.05, "reward": 0.415149986743927, "reward_std": 0.0536317303776741, "rewards/VisualizationJSONCombinedORM/mean": 0.415149986743927, "rewards/VisualizationJSONCombinedORM/std": 0.05282531678676605, "step": 2078, "train_speed(iter/s)": 0.066739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 279.6875, "completions/min_length": 215.0, "epoch": 1.7196029776674937, "grad_norm": 0.18658442795276642, "kl": 0.05120849609375, "learning_rate": 5.8699538790428e-07, "loss": 0.0005132700316607952, "memory(GiB)": 38.05, "reward": 0.4199141263961792, "reward_std": 0.06954272091388702, "rewards/VisualizationJSONCombinedORM/mean": 0.4199141263961792, "rewards/VisualizationJSONCombinedORM/std": 0.0907517597079277, "step": 2079, "train_speed(iter/s)": 0.066718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 279.375, "completions/min_length": 230.0, "epoch": 1.7204301075268817, "grad_norm": 0.3047391474246979, "kl": 0.117431640625, "learning_rate": 5.836062960345878e-07, "loss": 0.0011736191809177399, "memory(GiB)": 38.05, "reward": 0.6236608028411865, "reward_std": 0.10218524932861328, "rewards/VisualizationJSONCombinedORM/mean": 0.6236608028411865, "rewards/VisualizationJSONCombinedORM/std": 0.13179121911525726, "step": 2080, "train_speed(iter/s)": 0.066705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 298.1875, "completions/min_length": 244.0, "epoch": 1.7212572373862698, "grad_norm": 0.16811180114746094, "kl": 0.06134033203125, "learning_rate": 5.80226409713387e-07, "loss": 0.0006141327321529388, "memory(GiB)": 38.05, "reward": 0.5641350746154785, "reward_std": 0.11425883322954178, "rewards/VisualizationJSONCombinedORM/mean": 0.5641350746154785, "rewards/VisualizationJSONCombinedORM/std": 0.12569421529769897, "step": 2081, "train_speed(iter/s)": 0.066682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 241.0, "completions/min_length": 185.0, "epoch": 1.7220843672456576, "grad_norm": 0.1795056164264679, "kl": 0.037109375, "learning_rate": 5.768557359857241e-07, "loss": 0.00037179887294769287, "memory(GiB)": 38.05, "reward": 0.7300198078155518, "reward_std": 0.047120921313762665, "rewards/VisualizationJSONCombinedORM/mean": 0.7300198078155518, "rewards/VisualizationJSONCombinedORM/std": 0.11078748852014542, "step": 2082, "train_speed(iter/s)": 0.066668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 259.375, "completions/min_length": 213.0, "epoch": 1.7229114971050454, "grad_norm": 0.17733915150165558, "kl": 0.0745849609375, "learning_rate": 5.734942818774437e-07, "loss": 0.0007452294230461121, "memory(GiB)": 38.05, "reward": 0.6843967437744141, "reward_std": 0.10942121595144272, "rewards/VisualizationJSONCombinedORM/mean": 0.6843967437744141, "rewards/VisualizationJSONCombinedORM/std": 0.1585087776184082, "step": 2083, "train_speed(iter/s)": 0.066644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 274.125, "completions/min_length": 202.0, "epoch": 1.7237386269644333, "grad_norm": 0.1737937033176422, "kl": 0.04620361328125, "learning_rate": 5.701420543951757e-07, "loss": 0.0004616677761077881, "memory(GiB)": 38.05, "reward": 0.30186426639556885, "reward_std": 0.041357140988111496, "rewards/VisualizationJSONCombinedORM/mean": 0.30186426639556885, "rewards/VisualizationJSONCombinedORM/std": 0.15641078352928162, "step": 2084, "train_speed(iter/s)": 0.066617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 265.75, "completions/min_length": 219.0, "epoch": 1.7245657568238213, "grad_norm": 0.17307375371456146, "kl": 0.036041259765625, "learning_rate": 5.667990605263174e-07, "loss": 0.00036153942346572876, "memory(GiB)": 38.05, "reward": 0.42637336254119873, "reward_std": 0.04607783257961273, "rewards/VisualizationJSONCombinedORM/mean": 0.42637336254119873, "rewards/VisualizationJSONCombinedORM/std": 0.08281578868627548, "step": 2085, "train_speed(iter/s)": 0.066599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 273.3125, "completions/min_length": 223.0, "epoch": 1.7253928866832093, "grad_norm": 0.212271586060524, "kl": 0.08154296875, "learning_rate": 5.634653072390167e-07, "loss": 0.0008140727877616882, "memory(GiB)": 38.05, "reward": 0.4160286486148834, "reward_std": 0.08104372024536133, "rewards/VisualizationJSONCombinedORM/mean": 0.4160286486148834, "rewards/VisualizationJSONCombinedORM/std": 0.1622716188430786, "step": 2086, "train_speed(iter/s)": 0.066579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 249.4375, "completions/min_length": 218.0, "epoch": 1.7262200165425972, "grad_norm": 0.20300257205963135, "kl": 0.1090087890625, "learning_rate": 5.601408014821619e-07, "loss": 0.0010886117815971375, "memory(GiB)": 38.05, "reward": 0.5900897979736328, "reward_std": 0.04792486131191254, "rewards/VisualizationJSONCombinedORM/mean": 0.5900897979736328, "rewards/VisualizationJSONCombinedORM/std": 0.14687928557395935, "step": 2087, "train_speed(iter/s)": 0.066558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 287.375, "completions/min_length": 223.0, "epoch": 1.727047146401985, "grad_norm": 0.19308209419250488, "kl": 0.10205078125, "learning_rate": 5.568255501853664e-07, "loss": 0.001018419861793518, "memory(GiB)": 38.05, "reward": 0.4130316376686096, "reward_std": 0.06510134041309357, "rewards/VisualizationJSONCombinedORM/mean": 0.4130316376686096, "rewards/VisualizationJSONCombinedORM/std": 0.08551520854234695, "step": 2088, "train_speed(iter/s)": 0.066534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 254.25, "completions/min_length": 203.0, "epoch": 1.727874276261373, "grad_norm": 0.18501393496990204, "kl": 0.029754638671875, "learning_rate": 5.535195602589544e-07, "loss": 0.00029816851019859314, "memory(GiB)": 38.05, "reward": 0.6690207719802856, "reward_std": 0.08659842610359192, "rewards/VisualizationJSONCombinedORM/mean": 0.6690207719802856, "rewards/VisualizationJSONCombinedORM/std": 0.09822036325931549, "step": 2089, "train_speed(iter/s)": 0.066508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 264.125, "completions/min_length": 215.0, "epoch": 1.728701406120761, "grad_norm": 0.20165467262268066, "kl": 0.05755615234375, "learning_rate": 5.502228385939418e-07, "loss": 0.0005753636360168457, "memory(GiB)": 38.05, "reward": 0.5666100978851318, "reward_std": 0.08084797859191895, "rewards/VisualizationJSONCombinedORM/mean": 0.5666100978851318, "rewards/VisualizationJSONCombinedORM/std": 0.193040132522583, "step": 2090, "train_speed(iter/s)": 0.066488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 250.375, "completions/min_length": 192.0, "epoch": 1.7295285359801489, "grad_norm": 0.16411720216274261, "kl": 0.033966064453125, "learning_rate": 5.469353920620307e-07, "loss": 0.00033901259303092957, "memory(GiB)": 38.05, "reward": 0.6313143968582153, "reward_std": 0.07897446304559708, "rewards/VisualizationJSONCombinedORM/mean": 0.6313143968582153, "rewards/VisualizationJSONCombinedORM/std": 0.1546880006790161, "step": 2091, "train_speed(iter/s)": 0.06647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 301.6875, "completions/min_length": 243.0, "epoch": 1.7303556658395367, "grad_norm": 0.18270960450172424, "kl": 0.0626220703125, "learning_rate": 5.43657227515586e-07, "loss": 0.0006256476044654846, "memory(GiB)": 38.05, "reward": 0.6033600568771362, "reward_std": 0.10832321643829346, "rewards/VisualizationJSONCombinedORM/mean": 0.6033600568771362, "rewards/VisualizationJSONCombinedORM/std": 0.1153128370642662, "step": 2092, "train_speed(iter/s)": 0.066447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 282.25, "completions/min_length": 226.0, "epoch": 1.7311827956989247, "grad_norm": 0.17026019096374512, "kl": 0.08740234375, "learning_rate": 5.403883517876279e-07, "loss": 0.0008729957044124603, "memory(GiB)": 38.05, "reward": 0.6669648885726929, "reward_std": 0.08888093382120132, "rewards/VisualizationJSONCombinedORM/mean": 0.6669648885726929, "rewards/VisualizationJSONCombinedORM/std": 0.10927291959524155, "step": 2093, "train_speed(iter/s)": 0.066429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 271.8125, "completions/min_length": 205.0, "epoch": 1.7320099255583128, "grad_norm": 0.16066597402095795, "kl": 0.1114501953125, "learning_rate": 5.371287716918128e-07, "loss": 0.00111442431807518, "memory(GiB)": 38.05, "reward": 0.5666941404342651, "reward_std": 0.07677164673805237, "rewards/VisualizationJSONCombinedORM/mean": 0.5666941404342651, "rewards/VisualizationJSONCombinedORM/std": 0.2058631181716919, "step": 2094, "train_speed(iter/s)": 0.066411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/mean_length": 249.625, "completions/min_length": 211.0, "epoch": 1.7328370554177006, "grad_norm": 0.27828899025917053, "kl": 0.12841796875, "learning_rate": 5.338784940224239e-07, "loss": 0.0012835189700126648, "memory(GiB)": 38.05, "reward": 0.6636444330215454, "reward_std": 0.14406849443912506, "rewards/VisualizationJSONCombinedORM/mean": 0.6636444330215454, "rewards/VisualizationJSONCombinedORM/std": 0.1507335901260376, "step": 2095, "train_speed(iter/s)": 0.066398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 288.25, "completions/min_length": 231.0, "epoch": 1.7336641852770884, "grad_norm": 0.21398137509822845, "kl": 0.0762939453125, "learning_rate": 5.306375255543511e-07, "loss": 0.0007615648210048676, "memory(GiB)": 38.05, "reward": 0.5714375972747803, "reward_std": 0.07782701402902603, "rewards/VisualizationJSONCombinedORM/mean": 0.5714375972747803, "rewards/VisualizationJSONCombinedORM/std": 0.12216383218765259, "step": 2096, "train_speed(iter/s)": 0.066378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 281.6875, "completions/min_length": 213.0, "epoch": 1.7344913151364765, "grad_norm": 0.2039874941110611, "kl": 0.07403564453125, "learning_rate": 5.274058730430826e-07, "loss": 0.0007412433624267578, "memory(GiB)": 38.05, "reward": 0.3076888918876648, "reward_std": 0.03615034371614456, "rewards/VisualizationJSONCombinedORM/mean": 0.3076888918876648, "rewards/VisualizationJSONCombinedORM/std": 0.11225059628486633, "step": 2097, "train_speed(iter/s)": 0.066363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 273.125, "completions/min_length": 229.0, "epoch": 1.7353184449958643, "grad_norm": 0.16894707083702087, "kl": 0.035736083984375, "learning_rate": 5.241835432246888e-07, "loss": 0.00035677850246429443, "memory(GiB)": 38.05, "reward": 0.6764759421348572, "reward_std": 0.05563061684370041, "rewards/VisualizationJSONCombinedORM/mean": 0.6764759421348572, "rewards/VisualizationJSONCombinedORM/std": 0.09116006642580032, "step": 2098, "train_speed(iter/s)": 0.066341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 260.6875, "completions/min_length": 214.0, "epoch": 1.7361455748552523, "grad_norm": 0.19123691320419312, "kl": 0.0640869140625, "learning_rate": 5.209705428158046e-07, "loss": 0.0006400495767593384, "memory(GiB)": 38.05, "reward": 0.4272923469543457, "reward_std": 0.06870059669017792, "rewards/VisualizationJSONCombinedORM/mean": 0.4272923469543457, "rewards/VisualizationJSONCombinedORM/std": 0.2267044633626938, "step": 2099, "train_speed(iter/s)": 0.066323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 278.5625, "completions/min_length": 229.0, "epoch": 1.7369727047146402, "grad_norm": 0.18184205889701843, "kl": 0.06976318359375, "learning_rate": 5.177668785136225e-07, "loss": 0.0006984323263168335, "memory(GiB)": 38.05, "reward": 0.2652745246887207, "reward_std": 0.023874208331108093, "rewards/VisualizationJSONCombinedORM/mean": 0.2652745246887207, "rewards/VisualizationJSONCombinedORM/std": 0.026554400101304054, "step": 2100, "train_speed(iter/s)": 0.066299 }, { "epoch": 1.7369727047146402, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 312.1666666666667, "eval_completions/mean_length": 267.6041666666667, "eval_completions/min_length": 228.16666666666666, "eval_kl": 0.06559244791666667, "eval_loss": 0.0006587542593479156, "eval_reward": 0.4685381930321455, "eval_reward_std": 0.06297261275661488, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4685381930321455, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06297261376554768, "eval_runtime": 279.5038, "eval_samples_per_second": 0.086, "eval_steps_per_second": 0.011, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/mean_length": 236.5, "completions/min_length": 209.0, "epoch": 1.737799834574028, "grad_norm": 0.22112683951854706, "kl": 0.07464599609375, "learning_rate": 5.145725569958714e-07, "loss": 0.000746551901102066, "memory(GiB)": 38.05, "reward": 0.41694211959838867, "reward_std": 0.07056746631860733, "rewards/VisualizationJSONCombinedORM/mean": 0.41694211959838867, "rewards/VisualizationJSONCombinedORM/std": 0.09738819301128387, "step": 2101, "train_speed(iter/s)": 0.065706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 258.625, "completions/min_length": 218.0, "epoch": 1.738626964433416, "grad_norm": 0.22697047889232635, "kl": 0.05157470703125, "learning_rate": 5.1138758492081e-07, "loss": 0.0005156621336936951, "memory(GiB)": 38.05, "reward": 0.5678085088729858, "reward_std": 0.0696638748049736, "rewards/VisualizationJSONCombinedORM/mean": 0.5678085088729858, "rewards/VisualizationJSONCombinedORM/std": 0.06787429749965668, "step": 2102, "train_speed(iter/s)": 0.065694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 273.75, "completions/min_length": 212.0, "epoch": 1.739454094292804, "grad_norm": 0.1733371466398239, "kl": 0.040802001953125, "learning_rate": 5.082119689272042e-07, "loss": 0.00040787458419799805, "memory(GiB)": 38.05, "reward": 0.40825361013412476, "reward_std": 0.0513196662068367, "rewards/VisualizationJSONCombinedORM/mean": 0.40825361013412476, "rewards/VisualizationJSONCombinedORM/std": 0.06510628759860992, "step": 2103, "train_speed(iter/s)": 0.065674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 266.8125, "completions/min_length": 225.0, "epoch": 1.7402812241521919, "grad_norm": 0.2253638058900833, "kl": 0.06231689453125, "learning_rate": 5.050457156343225e-07, "loss": 0.0006237290799617767, "memory(GiB)": 38.05, "reward": 0.5202717781066895, "reward_std": 0.09599490463733673, "rewards/VisualizationJSONCombinedORM/mean": 0.5202717781066895, "rewards/VisualizationJSONCombinedORM/std": 0.0967680960893631, "step": 2104, "train_speed(iter/s)": 0.065651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 255.1875, "completions/min_length": 205.0, "epoch": 1.7411083540115797, "grad_norm": 0.20021642744541168, "kl": 0.05865478515625, "learning_rate": 5.01888831641914e-07, "loss": 0.0005851425230503082, "memory(GiB)": 38.05, "reward": 0.4878082275390625, "reward_std": 0.08105242252349854, "rewards/VisualizationJSONCombinedORM/mean": 0.4878082275390625, "rewards/VisualizationJSONCombinedORM/std": 0.2502324879169464, "step": 2105, "train_speed(iter/s)": 0.065634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 252.125, "completions/min_length": 198.0, "epoch": 1.7419354838709677, "grad_norm": 0.14875957369804382, "kl": 0.0445556640625, "learning_rate": 4.987413235302025e-07, "loss": 0.0004449710249900818, "memory(GiB)": 38.05, "reward": 0.7004196643829346, "reward_std": 0.08443466573953629, "rewards/VisualizationJSONCombinedORM/mean": 0.7004196643829346, "rewards/VisualizationJSONCombinedORM/std": 0.10536893457174301, "step": 2106, "train_speed(iter/s)": 0.065615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 270.0, "completions/min_length": 223.0, "epoch": 1.7427626137303558, "grad_norm": 0.16082550585269928, "kl": 0.0987548828125, "learning_rate": 4.95603197859863e-07, "loss": 0.0009872093796730042, "memory(GiB)": 38.05, "reward": 0.513765811920166, "reward_std": 0.06304313987493515, "rewards/VisualizationJSONCombinedORM/mean": 0.513765811920166, "rewards/VisualizationJSONCombinedORM/std": 0.09877245873212814, "step": 2107, "train_speed(iter/s)": 0.065605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 262.5, "completions/min_length": 227.0, "epoch": 1.7435897435897436, "grad_norm": 0.22573350369930267, "kl": 0.06256103515625, "learning_rate": 4.924744611720201e-07, "loss": 0.000625312328338623, "memory(GiB)": 38.05, "reward": 0.6213769912719727, "reward_std": 0.06683844327926636, "rewards/VisualizationJSONCombinedORM/mean": 0.6213769912719727, "rewards/VisualizationJSONCombinedORM/std": 0.1105193942785263, "step": 2108, "train_speed(iter/s)": 0.065589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 247.5625, "completions/min_length": 225.0, "epoch": 1.7444168734491314, "grad_norm": 0.17943689227104187, "kl": 0.0902099609375, "learning_rate": 4.893551199882241e-07, "loss": 0.0009021386504173279, "memory(GiB)": 38.05, "reward": 0.5997434258460999, "reward_std": 0.07336153835058212, "rewards/VisualizationJSONCombinedORM/mean": 0.5997434258460999, "rewards/VisualizationJSONCombinedORM/std": 0.07314605265855789, "step": 2109, "train_speed(iter/s)": 0.065577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 285.8125, "completions/min_length": 203.0, "epoch": 1.7452440033085195, "grad_norm": 0.17984060943126678, "kl": 0.03936767578125, "learning_rate": 4.862451808104419e-07, "loss": 0.0003938041627407074, "memory(GiB)": 38.05, "reward": 0.5673502087593079, "reward_std": 0.06287612020969391, "rewards/VisualizationJSONCombinedORM/mean": 0.5673502087593079, "rewards/VisualizationJSONCombinedORM/std": 0.12308960407972336, "step": 2110, "train_speed(iter/s)": 0.065544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 258.3125, "completions/min_length": 219.0, "epoch": 1.7460711331679075, "grad_norm": 0.16921262443065643, "kl": 0.05487060546875, "learning_rate": 4.831446501210418e-07, "loss": 0.0005484037101268768, "memory(GiB)": 38.05, "reward": 0.5254773497581482, "reward_std": 0.04568079859018326, "rewards/VisualizationJSONCombinedORM/mean": 0.5254773497581482, "rewards/VisualizationJSONCombinedORM/std": 0.19575795531272888, "step": 2111, "train_speed(iter/s)": 0.065521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 262.375, "completions/min_length": 208.0, "epoch": 1.7468982630272953, "grad_norm": 0.17552272975444794, "kl": 0.04669189453125, "learning_rate": 4.800535343827834e-07, "loss": 0.00046715885400772095, "memory(GiB)": 38.05, "reward": 0.6713269948959351, "reward_std": 0.04403391480445862, "rewards/VisualizationJSONCombinedORM/mean": 0.6713269948959351, "rewards/VisualizationJSONCombinedORM/std": 0.1915559321641922, "step": 2112, "train_speed(iter/s)": 0.065511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 251.6875, "completions/min_length": 209.0, "epoch": 1.7477253928866832, "grad_norm": 0.1867324560880661, "kl": 0.06890869140625, "learning_rate": 4.769718400388013e-07, "loss": 0.0006897076964378357, "memory(GiB)": 38.05, "reward": 0.5216248035430908, "reward_std": 0.05354144051671028, "rewards/VisualizationJSONCombinedORM/mean": 0.5216248035430908, "rewards/VisualizationJSONCombinedORM/std": 0.05239613726735115, "step": 2113, "train_speed(iter/s)": 0.065496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 267.625, "completions/min_length": 223.0, "epoch": 1.748552522746071, "grad_norm": 0.19004149734973907, "kl": 0.1094970703125, "learning_rate": 4.738995735125895e-07, "loss": 0.0010948218405246735, "memory(GiB)": 38.05, "reward": 0.3811337649822235, "reward_std": 0.07726052403450012, "rewards/VisualizationJSONCombinedORM/mean": 0.3811337649822235, "rewards/VisualizationJSONCombinedORM/std": 0.07516416162252426, "step": 2114, "train_speed(iter/s)": 0.065484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 277.125, "completions/min_length": 210.0, "epoch": 1.749379652605459, "grad_norm": 0.20428219437599182, "kl": 0.0811767578125, "learning_rate": 4.7083674120799506e-07, "loss": 0.0008121281862258911, "memory(GiB)": 38.05, "reward": 0.36600351333618164, "reward_std": 0.051147785037755966, "rewards/VisualizationJSONCombinedORM/mean": 0.36600351333618164, "rewards/VisualizationJSONCombinedORM/std": 0.17421333491802216, "step": 2115, "train_speed(iter/s)": 0.065462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 246.5, "completions/min_length": 212.0, "epoch": 1.750206782464847, "grad_norm": 0.16512244939804077, "kl": 0.072998046875, "learning_rate": 4.677833495091949e-07, "loss": 0.0007317550480365753, "memory(GiB)": 38.05, "reward": 0.48776763677597046, "reward_std": 0.04550928622484207, "rewards/VisualizationJSONCombinedORM/mean": 0.48776763677597046, "rewards/VisualizationJSONCombinedORM/std": 0.07314656674861908, "step": 2116, "train_speed(iter/s)": 0.06544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 255.75, "completions/min_length": 215.0, "epoch": 1.7510339123242349, "grad_norm": 0.2121475338935852, "kl": 0.1170654296875, "learning_rate": 4.6473940478069304e-07, "loss": 0.0011721625924110413, "memory(GiB)": 38.05, "reward": 0.43807244300842285, "reward_std": 0.07275401055812836, "rewards/VisualizationJSONCombinedORM/mean": 0.43807244300842285, "rewards/VisualizationJSONCombinedORM/std": 0.07541730254888535, "step": 2117, "train_speed(iter/s)": 0.065416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 289.625, "completions/min_length": 217.0, "epoch": 1.7518610421836227, "grad_norm": 0.1704452484846115, "kl": 0.05548095703125, "learning_rate": 4.6170491336729794e-07, "loss": 0.0005556177347898483, "memory(GiB)": 38.05, "reward": 0.5216712951660156, "reward_std": 0.09550712257623672, "rewards/VisualizationJSONCombinedORM/mean": 0.5216712951660156, "rewards/VisualizationJSONCombinedORM/std": 0.12704436480998993, "step": 2118, "train_speed(iter/s)": 0.065387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 280.75, "completions/min_length": 250.0, "epoch": 1.7526881720430108, "grad_norm": 0.2042611539363861, "kl": 0.080078125, "learning_rate": 4.5867988159411793e-07, "loss": 0.0007989481091499329, "memory(GiB)": 38.05, "reward": 0.3741569519042969, "reward_std": 0.04901132732629776, "rewards/VisualizationJSONCombinedORM/mean": 0.3741569519042969, "rewards/VisualizationJSONCombinedORM/std": 0.1317276656627655, "step": 2119, "train_speed(iter/s)": 0.065358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 278.25, "completions/min_length": 251.0, "epoch": 1.7535153019023988, "grad_norm": 0.1642933189868927, "kl": 0.051513671875, "learning_rate": 4.55664315766538e-07, "loss": 0.0005158483982086182, "memory(GiB)": 38.05, "reward": 0.6831363439559937, "reward_std": 0.11894041299819946, "rewards/VisualizationJSONCombinedORM/mean": 0.6831363439559937, "rewards/VisualizationJSONCombinedORM/std": 0.1315290480852127, "step": 2120, "train_speed(iter/s)": 0.06533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 255.5625, "completions/min_length": 216.0, "epoch": 1.7543424317617866, "grad_norm": 0.1640753597021103, "kl": 0.06005859375, "learning_rate": 4.5265822217021803e-07, "loss": 0.000600181519985199, "memory(GiB)": 38.05, "reward": 0.6541085839271545, "reward_std": 0.06475574523210526, "rewards/VisualizationJSONCombinedORM/mean": 0.6541085839271545, "rewards/VisualizationJSONCombinedORM/std": 0.0730309784412384, "step": 2121, "train_speed(iter/s)": 0.065313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 285.8125, "completions/min_length": 217.0, "epoch": 1.7551695616211744, "grad_norm": 0.17334821820259094, "kl": 0.0633544921875, "learning_rate": 4.4966160707107075e-07, "loss": 0.0006337016820907593, "memory(GiB)": 38.05, "reward": 0.5882415771484375, "reward_std": 0.1150437742471695, "rewards/VisualizationJSONCombinedORM/mean": 0.5882415771484375, "rewards/VisualizationJSONCombinedORM/std": 0.11563847213983536, "step": 2122, "train_speed(iter/s)": 0.065292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 278.3125, "completions/min_length": 222.0, "epoch": 1.7559966914805625, "grad_norm": 0.2007223516702652, "kl": 0.03887939453125, "learning_rate": 4.4667447671525257e-07, "loss": 0.00038915127515792847, "memory(GiB)": 38.05, "reward": 0.6883870363235474, "reward_std": 0.0665358453989029, "rewards/VisualizationJSONCombinedORM/mean": 0.6883870363235474, "rewards/VisualizationJSONCombinedORM/std": 0.1140638217329979, "step": 2123, "train_speed(iter/s)": 0.065268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 263.25, "completions/min_length": 201.0, "epoch": 1.7568238213399505, "grad_norm": 0.19068413972854614, "kl": 0.06402587890625, "learning_rate": 4.436968373291489e-07, "loss": 0.0006389915943145752, "memory(GiB)": 38.05, "reward": 0.4797517955303192, "reward_std": 0.07236956059932709, "rewards/VisualizationJSONCombinedORM/mean": 0.4797517955303192, "rewards/VisualizationJSONCombinedORM/std": 0.16444765031337738, "step": 2124, "train_speed(iter/s)": 0.06525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 253.0625, "completions/min_length": 197.0, "epoch": 1.7576509511993383, "grad_norm": 0.18669888377189636, "kl": 0.0380859375, "learning_rate": 4.40728695119364e-07, "loss": 0.00038064271211624146, "memory(GiB)": 38.05, "reward": 0.4473978281021118, "reward_std": 0.045919086784124374, "rewards/VisualizationJSONCombinedORM/mean": 0.4473978281021118, "rewards/VisualizationJSONCombinedORM/std": 0.06384306401014328, "step": 2125, "train_speed(iter/s)": 0.065231 }, { "epoch": 1.7576509511993383, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 314.4583333333333, "eval_completions/mean_length": 269.6041666666667, "eval_completions/min_length": 227.66666666666666, "eval_kl": 0.0772705078125, "eval_loss": 0.0007773588295094669, "eval_reward": 0.4649500511586666, "eval_reward_std": 0.07543866538132231, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4649500511586666, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07543866607981424, "eval_runtime": 280.5517, "eval_samples_per_second": 0.086, "eval_steps_per_second": 0.011, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 261.375, "completions/min_length": 220.0, "epoch": 1.7584780810587262, "grad_norm": 0.2297782450914383, "kl": 0.0538330078125, "learning_rate": 4.377700562727055e-07, "loss": 0.0005380045622587204, "memory(GiB)": 38.05, "reward": 0.41222453117370605, "reward_std": 0.0857638418674469, "rewards/VisualizationJSONCombinedORM/mean": 0.41222453117370605, "rewards/VisualizationJSONCombinedORM/std": 0.11001614481210709, "step": 2126, "train_speed(iter/s)": 0.06466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 257.8125, "completions/min_length": 224.0, "epoch": 1.759305210918114, "grad_norm": 0.17681969702243805, "kl": 0.1656494140625, "learning_rate": 4.3482092695617097e-07, "loss": 0.0016576610505580902, "memory(GiB)": 38.05, "reward": 0.5959200859069824, "reward_std": 0.08072593808174133, "rewards/VisualizationJSONCombinedORM/mean": 0.5959200859069824, "rewards/VisualizationJSONCombinedORM/std": 0.18057715892791748, "step": 2127, "train_speed(iter/s)": 0.064639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 257.5625, "completions/min_length": 198.0, "epoch": 1.760132340777502, "grad_norm": 0.2308797538280487, "kl": 0.08349609375, "learning_rate": 4.318813133169375e-07, "loss": 0.000834733247756958, "memory(GiB)": 38.05, "reward": 0.37267637252807617, "reward_std": 0.059522151947021484, "rewards/VisualizationJSONCombinedORM/mean": 0.37267637252807617, "rewards/VisualizationJSONCombinedORM/std": 0.08891205489635468, "step": 2128, "train_speed(iter/s)": 0.064621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 280.0, "completions/min_length": 242.0, "epoch": 1.76095947063689, "grad_norm": 0.18092089891433716, "kl": 0.1551513671875, "learning_rate": 4.289512214823466e-07, "loss": 0.0015549436211585999, "memory(GiB)": 38.05, "reward": 0.5485949516296387, "reward_std": 0.08796629309654236, "rewards/VisualizationJSONCombinedORM/mean": 0.5485949516296387, "rewards/VisualizationJSONCombinedORM/std": 0.1284475177526474, "step": 2129, "train_speed(iter/s)": 0.064599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 280.625, "completions/min_length": 230.0, "epoch": 1.7617866004962779, "grad_norm": 0.2104811817407608, "kl": 0.0484619140625, "learning_rate": 4.2603065755989493e-07, "loss": 0.0004843221977353096, "memory(GiB)": 38.05, "reward": 0.5396729707717896, "reward_std": 0.04450184851884842, "rewards/VisualizationJSONCombinedORM/mean": 0.5396729707717896, "rewards/VisualizationJSONCombinedORM/std": 0.2986963391304016, "step": 2130, "train_speed(iter/s)": 0.064576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 287.125, "completions/min_length": 235.0, "epoch": 1.7626137303556657, "grad_norm": 0.19851574301719666, "kl": 0.048095703125, "learning_rate": 4.2311962763721734e-07, "loss": 0.00048152729868888855, "memory(GiB)": 38.05, "reward": 0.5745364427566528, "reward_std": 0.10445526242256165, "rewards/VisualizationJSONCombinedORM/mean": 0.5745364427566528, "rewards/VisualizationJSONCombinedORM/std": 0.10272357612848282, "step": 2131, "train_speed(iter/s)": 0.06455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 271.875, "completions/min_length": 233.0, "epoch": 1.7634408602150538, "grad_norm": 0.24652110040187836, "kl": 0.1240234375, "learning_rate": 4.202181377820752e-07, "loss": 0.0012395232915878296, "memory(GiB)": 38.05, "reward": 0.5180850625038147, "reward_std": 0.06845379620790482, "rewards/VisualizationJSONCombinedORM/mean": 0.5180850625038147, "rewards/VisualizationJSONCombinedORM/std": 0.18173401057720184, "step": 2132, "train_speed(iter/s)": 0.064534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 293.75, "completions/min_length": 238.0, "epoch": 1.7642679900744418, "grad_norm": 0.172187939286232, "kl": 0.104736328125, "learning_rate": 4.173261940423451e-07, "loss": 0.0010477714240550995, "memory(GiB)": 38.05, "reward": 0.5409475564956665, "reward_std": 0.09840838611125946, "rewards/VisualizationJSONCombinedORM/mean": 0.5409475564956665, "rewards/VisualizationJSONCombinedORM/std": 0.10108071565628052, "step": 2133, "train_speed(iter/s)": 0.064513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 255.375, "completions/min_length": 207.0, "epoch": 1.7650951199338296, "grad_norm": 0.19858242571353912, "kl": 0.04071044921875, "learning_rate": 4.1444380244600623e-07, "loss": 0.00040715932846069336, "memory(GiB)": 38.05, "reward": 0.6438992619514465, "reward_std": 0.06777189671993256, "rewards/VisualizationJSONCombinedORM/mean": 0.6438992619514465, "rewards/VisualizationJSONCombinedORM/std": 0.212876558303833, "step": 2134, "train_speed(iter/s)": 0.064498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 261.375, "completions/min_length": 201.0, "epoch": 1.7659222497932174, "grad_norm": 0.20098347961902618, "kl": 0.02392578125, "learning_rate": 4.115709690011288e-07, "loss": 0.0002392977476119995, "memory(GiB)": 38.05, "reward": 0.4461922347545624, "reward_std": 0.06192745268344879, "rewards/VisualizationJSONCombinedORM/mean": 0.4461922347545624, "rewards/VisualizationJSONCombinedORM/std": 0.06191405653953552, "step": 2135, "train_speed(iter/s)": 0.06448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 273.4375, "completions/min_length": 210.0, "epoch": 1.7667493796526055, "grad_norm": 0.19030049443244934, "kl": 0.042724609375, "learning_rate": 4.087076996958561e-07, "loss": 0.00042698904871940613, "memory(GiB)": 38.05, "reward": 0.5970565676689148, "reward_std": 0.08554999530315399, "rewards/VisualizationJSONCombinedORM/mean": 0.5970565676689148, "rewards/VisualizationJSONCombinedORM/std": 0.17594368755817413, "step": 2136, "train_speed(iter/s)": 0.064462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 282.6875, "completions/min_length": 222.0, "epoch": 1.7675765095119935, "grad_norm": 0.18954838812351227, "kl": 0.068359375, "learning_rate": 4.0585400049839853e-07, "loss": 0.0006833374500274658, "memory(GiB)": 38.05, "reward": 0.4986896514892578, "reward_std": 0.0698963850736618, "rewards/VisualizationJSONCombinedORM/mean": 0.4986896514892578, "rewards/VisualizationJSONCombinedORM/std": 0.19000555574893951, "step": 2137, "train_speed(iter/s)": 0.064446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 273.3125, "completions/min_length": 215.0, "epoch": 1.7684036393713813, "grad_norm": 0.2658125162124634, "kl": 0.0474853515625, "learning_rate": 4.0300987735701733e-07, "loss": 0.00047541409730911255, "memory(GiB)": 38.05, "reward": 0.2814522087574005, "reward_std": 0.027016092091798782, "rewards/VisualizationJSONCombinedORM/mean": 0.2814522087574005, "rewards/VisualizationJSONCombinedORM/std": 0.14732377231121063, "step": 2138, "train_speed(iter/s)": 0.064426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 257.75, "completions/min_length": 218.0, "epoch": 1.7692307692307692, "grad_norm": 0.2162107676267624, "kl": 0.04364013671875, "learning_rate": 4.0017533620001603e-07, "loss": 0.0004368135705590248, "memory(GiB)": 38.05, "reward": 0.5074799656867981, "reward_std": 0.08596909046173096, "rewards/VisualizationJSONCombinedORM/mean": 0.5074799656867981, "rewards/VisualizationJSONCombinedORM/std": 0.12487314641475677, "step": 2139, "train_speed(iter/s)": 0.064407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 253.875, "completions/min_length": 204.0, "epoch": 1.7700578990901572, "grad_norm": 0.1672515720129013, "kl": 0.0657958984375, "learning_rate": 3.973503829357223e-07, "loss": 0.0006576851010322571, "memory(GiB)": 38.05, "reward": 0.401841938495636, "reward_std": 0.04589558765292168, "rewards/VisualizationJSONCombinedORM/mean": 0.401841938495636, "rewards/VisualizationJSONCombinedORM/std": 0.07137586176395416, "step": 2140, "train_speed(iter/s)": 0.064398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 271.25, "completions/min_length": 222.0, "epoch": 1.770885028949545, "grad_norm": 0.18285787105560303, "kl": 0.07177734375, "learning_rate": 3.9453502345247863e-07, "loss": 0.0007179267704486847, "memory(GiB)": 38.05, "reward": 0.5481289625167847, "reward_std": 0.07057472318410873, "rewards/VisualizationJSONCombinedORM/mean": 0.5481289625167847, "rewards/VisualizationJSONCombinedORM/std": 0.08099190145730972, "step": 2141, "train_speed(iter/s)": 0.064382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 256.5, "completions/min_length": 205.0, "epoch": 1.771712158808933, "grad_norm": 0.14225655794143677, "kl": 0.0379638671875, "learning_rate": 3.9172926361863316e-07, "loss": 0.0003805980086326599, "memory(GiB)": 38.05, "reward": 0.7762468457221985, "reward_std": 0.07825078070163727, "rewards/VisualizationJSONCombinedORM/mean": 0.7762468457221985, "rewards/VisualizationJSONCombinedORM/std": 0.1079743504524231, "step": 2142, "train_speed(iter/s)": 0.064366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 268.8125, "completions/min_length": 215.0, "epoch": 1.772539288668321, "grad_norm": 0.21525786817073822, "kl": 0.062744140625, "learning_rate": 3.8893310928252157e-07, "loss": 0.0006281137466430664, "memory(GiB)": 38.05, "reward": 0.6643871665000916, "reward_std": 0.10208968818187714, "rewards/VisualizationJSONCombinedORM/mean": 0.6643871665000916, "rewards/VisualizationJSONCombinedORM/std": 0.11457409709692001, "step": 2143, "train_speed(iter/s)": 0.06435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 260.875, "completions/min_length": 217.0, "epoch": 1.7733664185277087, "grad_norm": 0.17364095151424408, "kl": 0.05889892578125, "learning_rate": 3.8614656627246115e-07, "loss": 0.0005898699164390564, "memory(GiB)": 38.05, "reward": 0.648986279964447, "reward_std": 0.0585060641169548, "rewards/VisualizationJSONCombinedORM/mean": 0.648986279964447, "rewards/VisualizationJSONCombinedORM/std": 0.19263800978660583, "step": 2144, "train_speed(iter/s)": 0.064342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 262.6875, "completions/min_length": 210.0, "epoch": 1.7741935483870968, "grad_norm": 0.17862577736377716, "kl": 0.05950927734375, "learning_rate": 3.8336964039673074e-07, "loss": 0.0005946196615695953, "memory(GiB)": 38.05, "reward": 0.410314679145813, "reward_std": 0.051916614174842834, "rewards/VisualizationJSONCombinedORM/mean": 0.410314679145813, "rewards/VisualizationJSONCombinedORM/std": 0.21169069409370422, "step": 2145, "train_speed(iter/s)": 0.064323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 292.0625, "completions/min_length": 218.0, "epoch": 1.7750206782464848, "grad_norm": 0.16269856691360474, "kl": 0.07403564453125, "learning_rate": 3.8060233744356634e-07, "loss": 0.000742809846997261, "memory(GiB)": 38.05, "reward": 0.39822858572006226, "reward_std": 0.05385127291083336, "rewards/VisualizationJSONCombinedORM/mean": 0.39822858572006226, "rewards/VisualizationJSONCombinedORM/std": 0.1093582883477211, "step": 2146, "train_speed(iter/s)": 0.064307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/mean_length": 248.25, "completions/min_length": 212.0, "epoch": 1.7758478081058726, "grad_norm": 0.16610990464687347, "kl": 0.05126953125, "learning_rate": 3.7784466318114554e-07, "loss": 0.0005132481455802917, "memory(GiB)": 38.05, "reward": 0.6129672527313232, "reward_std": 0.09341771900653839, "rewards/VisualizationJSONCombinedORM/mean": 0.6129672527313232, "rewards/VisualizationJSONCombinedORM/std": 0.16882756352424622, "step": 2147, "train_speed(iter/s)": 0.06429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 283.3125, "completions/min_length": 238.0, "epoch": 1.7766749379652604, "grad_norm": 0.2000243365764618, "kl": 0.067138671875, "learning_rate": 3.750966233575753e-07, "loss": 0.0006739944219589233, "memory(GiB)": 38.05, "reward": 0.21942868828773499, "reward_std": 0.023005351424217224, "rewards/VisualizationJSONCombinedORM/mean": 0.21942868828773499, "rewards/VisualizationJSONCombinedORM/std": 0.025567946955561638, "step": 2148, "train_speed(iter/s)": 0.064283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 273.6875, "completions/min_length": 202.0, "epoch": 1.7775020678246485, "grad_norm": 0.19247868657112122, "kl": 0.03912353515625, "learning_rate": 3.723582237008816e-07, "loss": 0.0003923177719116211, "memory(GiB)": 38.05, "reward": 0.7696653604507446, "reward_std": 0.061440370976924896, "rewards/VisualizationJSONCombinedORM/mean": 0.7696653604507446, "rewards/VisualizationJSONCombinedORM/std": 0.06416185945272446, "step": 2149, "train_speed(iter/s)": 0.064266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 278.8125, "completions/min_length": 207.0, "epoch": 1.7783291976840365, "grad_norm": 0.16702859103679657, "kl": 0.05963134765625, "learning_rate": 3.696294699189934e-07, "loss": 0.0005953013896942139, "memory(GiB)": 38.05, "reward": 0.5966094732284546, "reward_std": 0.09405040740966797, "rewards/VisualizationJSONCombinedORM/mean": 0.5966094732284546, "rewards/VisualizationJSONCombinedORM/std": 0.18114235997200012, "step": 2150, "train_speed(iter/s)": 0.064255 }, { "epoch": 1.7783291976840365, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 322.1666666666667, "eval_completions/mean_length": 267.046875, "eval_completions/min_length": 223.20833333333334, "eval_kl": 0.064666748046875, "eval_loss": 0.0006560322945006192, "eval_reward": 0.47425868051747483, "eval_reward_std": 0.07051146496087313, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47425868051747483, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07051146620263656, "eval_runtime": 285.1246, "eval_samples_per_second": 0.084, "eval_steps_per_second": 0.011, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 254.875, "completions/min_length": 207.0, "epoch": 1.7791563275434243, "grad_norm": 0.1721673458814621, "kl": 0.03192138671875, "learning_rate": 3.669103676997365e-07, "loss": 0.0003192424774169922, "memory(GiB)": 38.05, "reward": 0.5634341239929199, "reward_std": 0.06830226629972458, "rewards/VisualizationJSONCombinedORM/mean": 0.5634341239929199, "rewards/VisualizationJSONCombinedORM/std": 0.11760414391756058, "step": 2151, "train_speed(iter/s)": 0.063706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 270.5625, "completions/min_length": 219.0, "epoch": 1.7799834574028122, "grad_norm": 0.17968373000621796, "kl": 0.0767822265625, "learning_rate": 3.642009227108195e-07, "loss": 0.0007684193551540375, "memory(GiB)": 38.05, "reward": 0.5984259843826294, "reward_std": 0.1010703518986702, "rewards/VisualizationJSONCombinedORM/mean": 0.5984259843826294, "rewards/VisualizationJSONCombinedORM/std": 0.13687749207019806, "step": 2152, "train_speed(iter/s)": 0.063688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 257.4375, "completions/min_length": 219.0, "epoch": 1.7808105872622002, "grad_norm": 0.18115945160388947, "kl": 0.108642578125, "learning_rate": 3.6150114059982035e-07, "loss": 0.0010875612497329712, "memory(GiB)": 38.05, "reward": 0.5017573833465576, "reward_std": 0.1262594759464264, "rewards/VisualizationJSONCombinedORM/mean": 0.5017573833465576, "rewards/VisualizationJSONCombinedORM/std": 0.1322798728942871, "step": 2153, "train_speed(iter/s)": 0.063678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 268.375, "completions/min_length": 230.0, "epoch": 1.7816377171215882, "grad_norm": 0.15221858024597168, "kl": 0.1632080078125, "learning_rate": 3.588110269941747e-07, "loss": 0.001635529100894928, "memory(GiB)": 38.05, "reward": 0.4029001295566559, "reward_std": 0.04921892285346985, "rewards/VisualizationJSONCombinedORM/mean": 0.4029001295566559, "rewards/VisualizationJSONCombinedORM/std": 0.17552629113197327, "step": 2154, "train_speed(iter/s)": 0.06366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 273.3125, "completions/min_length": 213.0, "epoch": 1.782464846980976, "grad_norm": 0.16555878520011902, "kl": 0.036529541015625, "learning_rate": 3.56130587501165e-07, "loss": 0.0003657899796962738, "memory(GiB)": 38.05, "reward": 0.48688551783561707, "reward_std": 0.06524468958377838, "rewards/VisualizationJSONCombinedORM/mean": 0.48688551783561707, "rewards/VisualizationJSONCombinedORM/std": 0.10199368000030518, "step": 2155, "train_speed(iter/s)": 0.063645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 276.0625, "completions/min_length": 238.0, "epoch": 1.783291976840364, "grad_norm": 0.22181610763072968, "kl": 0.0885009765625, "learning_rate": 3.5345982770791096e-07, "loss": 0.0008860975503921509, "memory(GiB)": 38.05, "reward": 0.477009654045105, "reward_std": 0.073627769947052, "rewards/VisualizationJSONCombinedORM/mean": 0.477009654045105, "rewards/VisualizationJSONCombinedORM/std": 0.24813571572303772, "step": 2156, "train_speed(iter/s)": 0.063624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 278.5, "completions/min_length": 216.0, "epoch": 1.7841191066997517, "grad_norm": 0.18537122011184692, "kl": 0.04876708984375, "learning_rate": 3.5079875318135613e-07, "loss": 0.00048874132335186, "memory(GiB)": 38.05, "reward": 0.6509082317352295, "reward_std": 0.0696316659450531, "rewards/VisualizationJSONCombinedORM/mean": 0.6509082317352295, "rewards/VisualizationJSONCombinedORM/std": 0.0779593363404274, "step": 2157, "train_speed(iter/s)": 0.063614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 281.9375, "completions/min_length": 239.0, "epoch": 1.7849462365591398, "grad_norm": 0.17737123370170593, "kl": 0.044464111328125, "learning_rate": 3.4814736946825357e-07, "loss": 0.00044490210711956024, "memory(GiB)": 38.05, "reward": 0.468527227640152, "reward_std": 0.06763579696416855, "rewards/VisualizationJSONCombinedORM/mean": 0.468527227640152, "rewards/VisualizationJSONCombinedORM/std": 0.17445626854896545, "step": 2158, "train_speed(iter/s)": 0.063592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 264.4375, "completions/min_length": 202.0, "epoch": 1.7857733664185278, "grad_norm": 0.19073861837387085, "kl": 0.05426025390625, "learning_rate": 3.455056820951569e-07, "loss": 0.0005424320697784424, "memory(GiB)": 38.05, "reward": 0.7447251081466675, "reward_std": 0.09174096584320068, "rewards/VisualizationJSONCombinedORM/mean": 0.7447251081466675, "rewards/VisualizationJSONCombinedORM/std": 0.09277832508087158, "step": 2159, "train_speed(iter/s)": 0.063574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 267.0625, "completions/min_length": 214.0, "epoch": 1.7866004962779156, "grad_norm": 0.14147746562957764, "kl": 0.0406494140625, "learning_rate": 3.4287369656841095e-07, "loss": 0.0004077889025211334, "memory(GiB)": 38.05, "reward": 0.709615170955658, "reward_std": 0.04082494601607323, "rewards/VisualizationJSONCombinedORM/mean": 0.709615170955658, "rewards/VisualizationJSONCombinedORM/std": 0.09167554974555969, "step": 2160, "train_speed(iter/s)": 0.063555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 256.1875, "completions/min_length": 207.0, "epoch": 1.7874276261373034, "grad_norm": 0.19920018315315247, "kl": 0.04742431640625, "learning_rate": 3.402514183741368e-07, "loss": 0.00047435984015464783, "memory(GiB)": 38.05, "reward": 0.5923972129821777, "reward_std": 0.09428408741950989, "rewards/VisualizationJSONCombinedORM/mean": 0.5923972129821777, "rewards/VisualizationJSONCombinedORM/std": 0.13191726803779602, "step": 2161, "train_speed(iter/s)": 0.063546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 250.5625, "completions/min_length": 222.0, "epoch": 1.7882547559966915, "grad_norm": 0.14611268043518066, "kl": 0.087158203125, "learning_rate": 3.3763885297822153e-07, "loss": 0.0008719991892576218, "memory(GiB)": 38.05, "reward": 0.5682182908058167, "reward_std": 0.060903400182724, "rewards/VisualizationJSONCombinedORM/mean": 0.5682182908058167, "rewards/VisualizationJSONCombinedORM/std": 0.06019890308380127, "step": 2162, "train_speed(iter/s)": 0.063521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 292.0, "completions/min_length": 229.0, "epoch": 1.7890818858560795, "grad_norm": 0.15892194211483002, "kl": 0.06134033203125, "learning_rate": 3.350360058263058e-07, "loss": 0.000613875687122345, "memory(GiB)": 38.05, "reward": 0.3800690174102783, "reward_std": 0.03799288347363472, "rewards/VisualizationJSONCombinedORM/mean": 0.3800690174102783, "rewards/VisualizationJSONCombinedORM/std": 0.21390581130981445, "step": 2163, "train_speed(iter/s)": 0.063505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 294.75, "completions/min_length": 235.0, "epoch": 1.7899090157154673, "grad_norm": 0.19953303039073944, "kl": 0.05706787109375, "learning_rate": 3.324428823437753e-07, "loss": 0.0005708113312721252, "memory(GiB)": 38.05, "reward": 0.4881225526332855, "reward_std": 0.06714320182800293, "rewards/VisualizationJSONCombinedORM/mean": 0.4881225526332855, "rewards/VisualizationJSONCombinedORM/std": 0.23029819130897522, "step": 2164, "train_speed(iter/s)": 0.063486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 266.5625, "completions/min_length": 213.0, "epoch": 1.7907361455748552, "grad_norm": 0.19024765491485596, "kl": 0.06121826171875, "learning_rate": 3.298594879357464e-07, "loss": 0.0006114356219768524, "memory(GiB)": 38.05, "reward": 0.5194427371025085, "reward_std": 0.07254256308078766, "rewards/VisualizationJSONCombinedORM/mean": 0.5194427371025085, "rewards/VisualizationJSONCombinedORM/std": 0.22413040697574615, "step": 2165, "train_speed(iter/s)": 0.063464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 280.5, "completions/min_length": 219.0, "epoch": 1.7915632754342432, "grad_norm": 0.2034146785736084, "kl": 0.0540771484375, "learning_rate": 3.272858279870583e-07, "loss": 0.0005411095917224884, "memory(GiB)": 38.05, "reward": 0.45824897289276123, "reward_std": 0.041076742112636566, "rewards/VisualizationJSONCombinedORM/mean": 0.45824897289276123, "rewards/VisualizationJSONCombinedORM/std": 0.17363224923610687, "step": 2166, "train_speed(iter/s)": 0.063448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 256.875, "completions/min_length": 197.0, "epoch": 1.7923904052936313, "grad_norm": 0.16164246201515198, "kl": 0.15283203125, "learning_rate": 3.24721907862256e-07, "loss": 0.0015275292098522186, "memory(GiB)": 38.05, "reward": 0.4701957106590271, "reward_std": 0.06041920930147171, "rewards/VisualizationJSONCombinedORM/mean": 0.4701957106590271, "rewards/VisualizationJSONCombinedORM/std": 0.2600381076335907, "step": 2167, "train_speed(iter/s)": 0.063434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 282.6875, "completions/min_length": 212.0, "epoch": 1.793217535153019, "grad_norm": 0.17533332109451294, "kl": 0.04443359375, "learning_rate": 3.22167732905585e-07, "loss": 0.00044370442628860474, "memory(GiB)": 38.05, "reward": 0.49066251516342163, "reward_std": 0.058531418442726135, "rewards/VisualizationJSONCombinedORM/mean": 0.49066251516342163, "rewards/VisualizationJSONCombinedORM/std": 0.23856323957443237, "step": 2168, "train_speed(iter/s)": 0.063418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 287.6875, "completions/min_length": 231.0, "epoch": 1.794044665012407, "grad_norm": 0.17486022412776947, "kl": 0.0753173828125, "learning_rate": 3.196233084409778e-07, "loss": 0.0007530879229307175, "memory(GiB)": 38.05, "reward": 0.4150505065917969, "reward_std": 0.05379379913210869, "rewards/VisualizationJSONCombinedORM/mean": 0.4150505065917969, "rewards/VisualizationJSONCombinedORM/std": 0.19521114230155945, "step": 2169, "train_speed(iter/s)": 0.063396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 286.75, "completions/min_length": 230.0, "epoch": 1.7948717948717947, "grad_norm": 0.17320983111858368, "kl": 0.0626220703125, "learning_rate": 3.170886397720435e-07, "loss": 0.0006259530782699585, "memory(GiB)": 38.05, "reward": 0.41415172815322876, "reward_std": 0.055269185453653336, "rewards/VisualizationJSONCombinedORM/mean": 0.41415172815322876, "rewards/VisualizationJSONCombinedORM/std": 0.1979961097240448, "step": 2170, "train_speed(iter/s)": 0.063378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 248.5, "completions/min_length": 212.0, "epoch": 1.7956989247311828, "grad_norm": 0.18829262256622314, "kl": 0.065185546875, "learning_rate": 3.145637321820555e-07, "loss": 0.0006535165011882782, "memory(GiB)": 38.05, "reward": 0.49116092920303345, "reward_std": 0.051176633685827255, "rewards/VisualizationJSONCombinedORM/mean": 0.49116092920303345, "rewards/VisualizationJSONCombinedORM/std": 0.20506471395492554, "step": 2171, "train_speed(iter/s)": 0.063374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 275.3125, "completions/min_length": 212.0, "epoch": 1.7965260545905708, "grad_norm": 0.15641935169696808, "kl": 0.032318115234375, "learning_rate": 3.120485909339399e-07, "loss": 0.00032295286655426025, "memory(GiB)": 38.05, "reward": 0.37566667795181274, "reward_std": 0.10193780064582825, "rewards/VisualizationJSONCombinedORM/mean": 0.37566667795181274, "rewards/VisualizationJSONCombinedORM/std": 0.12292180955410004, "step": 2172, "train_speed(iter/s)": 0.063358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 257.25, "completions/min_length": 218.0, "epoch": 1.7973531844499586, "grad_norm": 0.21072421967983246, "kl": 0.08465576171875, "learning_rate": 3.0954322127026814e-07, "loss": 0.0008489042520523071, "memory(GiB)": 38.05, "reward": 0.3846149146556854, "reward_std": 0.06006525829434395, "rewards/VisualizationJSONCombinedORM/mean": 0.3846149146556854, "rewards/VisualizationJSONCombinedORM/std": 0.09214406460523605, "step": 2173, "train_speed(iter/s)": 0.063343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 285.625, "completions/min_length": 220.0, "epoch": 1.7981803143093464, "grad_norm": 0.16418929398059845, "kl": 0.0390625, "learning_rate": 3.070476284132429e-07, "loss": 0.000391203910112381, "memory(GiB)": 38.05, "reward": 0.7006306648254395, "reward_std": 0.06399502605199814, "rewards/VisualizationJSONCombinedORM/mean": 0.7006306648254395, "rewards/VisualizationJSONCombinedORM/std": 0.08464165031909943, "step": 2174, "train_speed(iter/s)": 0.063327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 286.75, "completions/min_length": 182.0, "epoch": 1.7990074441687345, "grad_norm": 0.18716807663440704, "kl": 0.043701171875, "learning_rate": 3.0456181756468584e-07, "loss": 0.0004374459385871887, "memory(GiB)": 38.05, "reward": 0.6716888546943665, "reward_std": 0.08291679620742798, "rewards/VisualizationJSONCombinedORM/mean": 0.6716888546943665, "rewards/VisualizationJSONCombinedORM/std": 0.0805542916059494, "step": 2175, "train_speed(iter/s)": 0.06331 }, { "epoch": 1.7990074441687345, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 326.6666666666667, "eval_completions/mean_length": 272.0833333333333, "eval_completions/min_length": 230.29166666666666, "eval_kl": 0.06593831380208333, "eval_loss": 0.000657942146062851, "eval_reward": 0.4889835088203351, "eval_reward_std": 0.06802545961303015, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4889835088203351, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0680254624846081, "eval_runtime": 287.8625, "eval_samples_per_second": 0.083, "eval_steps_per_second": 0.01, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 260.125, "completions/min_length": 171.0, "epoch": 1.7998345740281225, "grad_norm": 0.18101181089878082, "kl": 0.076416015625, "learning_rate": 3.02085793906034e-07, "loss": 0.0007637962698936462, "memory(GiB)": 38.05, "reward": 0.7709125280380249, "reward_std": 0.0885973796248436, "rewards/VisualizationJSONCombinedORM/mean": 0.7709125280380249, "rewards/VisualizationJSONCombinedORM/std": 0.10551486164331436, "step": 2176, "train_speed(iter/s)": 0.062764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 278.0, "completions/min_length": 220.0, "epoch": 1.8006617038875103, "grad_norm": 0.16051006317138672, "kl": 0.055389404296875, "learning_rate": 2.996195625983178e-07, "loss": 0.0005536079406738281, "memory(GiB)": 38.05, "reward": 0.6076450943946838, "reward_std": 0.061517708003520966, "rewards/VisualizationJSONCombinedORM/mean": 0.6076450943946838, "rewards/VisualizationJSONCombinedORM/std": 0.1724141240119934, "step": 2177, "train_speed(iter/s)": 0.062749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 271.9375, "completions/min_length": 229.0, "epoch": 1.8014888337468982, "grad_norm": 0.17221444845199585, "kl": 0.05621337890625, "learning_rate": 2.9716312878216194e-07, "loss": 0.0005616769194602966, "memory(GiB)": 38.05, "reward": 0.4774402379989624, "reward_std": 0.05988318845629692, "rewards/VisualizationJSONCombinedORM/mean": 0.4774402379989624, "rewards/VisualizationJSONCombinedORM/std": 0.07280100882053375, "step": 2178, "train_speed(iter/s)": 0.062734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 277.5, "completions/min_length": 218.0, "epoch": 1.8023159636062862, "grad_norm": 0.21558411419391632, "kl": 0.06292724609375, "learning_rate": 2.9471649757776555e-07, "loss": 0.000629376620054245, "memory(GiB)": 38.05, "reward": 0.5166030526161194, "reward_std": 0.07887756824493408, "rewards/VisualizationJSONCombinedORM/mean": 0.5166030526161194, "rewards/VisualizationJSONCombinedORM/std": 0.1592252105474472, "step": 2179, "train_speed(iter/s)": 0.062723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 255.75, "completions/min_length": 193.0, "epoch": 1.8031430934656743, "grad_norm": 0.22426031529903412, "kl": 0.03826904296875, "learning_rate": 2.9227967408489653e-07, "loss": 0.0003828369081020355, "memory(GiB)": 38.05, "reward": 0.5076264142990112, "reward_std": 0.03868992254137993, "rewards/VisualizationJSONCombinedORM/mean": 0.5076264142990112, "rewards/VisualizationJSONCombinedORM/std": 0.30605795979499817, "step": 2180, "train_speed(iter/s)": 0.062703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/mean_length": 242.625, "completions/min_length": 215.0, "epoch": 1.803970223325062, "grad_norm": 0.2189868837594986, "kl": 0.05609130859375, "learning_rate": 2.898526633828791e-07, "loss": 0.0005604103207588196, "memory(GiB)": 38.05, "reward": 0.7280930876731873, "reward_std": 0.09531067311763763, "rewards/VisualizationJSONCombinedORM/mean": 0.7280930876731873, "rewards/VisualizationJSONCombinedORM/std": 0.15371371805667877, "step": 2181, "train_speed(iter/s)": 0.062691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 262.375, "completions/min_length": 223.0, "epoch": 1.80479735318445, "grad_norm": 0.21362413465976715, "kl": 0.037933349609375, "learning_rate": 2.874354705305843e-07, "loss": 0.00037997961044311523, "memory(GiB)": 38.05, "reward": 0.5690709352493286, "reward_std": 0.08413350582122803, "rewards/VisualizationJSONCombinedORM/mean": 0.5690709352493286, "rewards/VisualizationJSONCombinedORM/std": 0.10872936248779297, "step": 2182, "train_speed(iter/s)": 0.062673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 291.3125, "completions/min_length": 237.0, "epoch": 1.805624483043838, "grad_norm": 0.21746964752674103, "kl": 0.081298828125, "learning_rate": 2.8502810056641903e-07, "loss": 0.0008141323924064636, "memory(GiB)": 38.05, "reward": 0.6811312437057495, "reward_std": 0.09433664381504059, "rewards/VisualizationJSONCombinedORM/mean": 0.6811312437057495, "rewards/VisualizationJSONCombinedORM/std": 0.09543675184249878, "step": 2183, "train_speed(iter/s)": 0.062653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 268.0, "completions/min_length": 207.0, "epoch": 1.8064516129032258, "grad_norm": 0.18644042313098907, "kl": 0.16552734375, "learning_rate": 2.826305585083144e-07, "loss": 0.001656603068113327, "memory(GiB)": 38.05, "reward": 0.43664342164993286, "reward_std": 0.08400847017765045, "rewards/VisualizationJSONCombinedORM/mean": 0.43664342164993286, "rewards/VisualizationJSONCombinedORM/std": 0.12873506546020508, "step": 2184, "train_speed(iter/s)": 0.062641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 242.5625, "completions/min_length": 213.0, "epoch": 1.8072787427626138, "grad_norm": 0.1787150353193283, "kl": 0.042724609375, "learning_rate": 2.802428493537157e-07, "loss": 0.0004274025559425354, "memory(GiB)": 38.05, "reward": 0.6807671785354614, "reward_std": 0.07367601245641708, "rewards/VisualizationJSONCombinedORM/mean": 0.6807671785354614, "rewards/VisualizationJSONCombinedORM/std": 0.07151491194963455, "step": 2185, "train_speed(iter/s)": 0.062628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 254.1875, "completions/min_length": 207.0, "epoch": 1.8081058726220016, "grad_norm": 0.20036661624908447, "kl": 0.04791259765625, "learning_rate": 2.778649780795739e-07, "loss": 0.00047938525676727295, "memory(GiB)": 38.05, "reward": 0.5052639245986938, "reward_std": 0.05076201260089874, "rewards/VisualizationJSONCombinedORM/mean": 0.5052639245986938, "rewards/VisualizationJSONCombinedORM/std": 0.22325822710990906, "step": 2186, "train_speed(iter/s)": 0.062612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 257.5, "completions/min_length": 213.0, "epoch": 1.8089330024813894, "grad_norm": 0.1953592300415039, "kl": 0.06005859375, "learning_rate": 2.754969496423343e-07, "loss": 0.0006002336740493774, "memory(GiB)": 38.05, "reward": 0.7125554084777832, "reward_std": 0.077239990234375, "rewards/VisualizationJSONCombinedORM/mean": 0.7125554084777832, "rewards/VisualizationJSONCombinedORM/std": 0.09134367108345032, "step": 2187, "train_speed(iter/s)": 0.062601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 282.125, "completions/min_length": 216.0, "epoch": 1.8097601323407775, "grad_norm": 0.20513713359832764, "kl": 0.1297607421875, "learning_rate": 2.7313876897792304e-07, "loss": 0.0012989304959774017, "memory(GiB)": 38.05, "reward": 0.5973222255706787, "reward_std": 0.0684472918510437, "rewards/VisualizationJSONCombinedORM/mean": 0.5973222255706787, "rewards/VisualizationJSONCombinedORM/std": 0.14993825554847717, "step": 2188, "train_speed(iter/s)": 0.062581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 269.25, "completions/min_length": 208.0, "epoch": 1.8105872622001655, "grad_norm": 0.2800453305244446, "kl": 0.04669189453125, "learning_rate": 2.707904410017426e-07, "loss": 0.00046654045581817627, "memory(GiB)": 38.05, "reward": 0.5461143255233765, "reward_std": 0.08322364091873169, "rewards/VisualizationJSONCombinedORM/mean": 0.5461143255233765, "rewards/VisualizationJSONCombinedORM/std": 0.11118002980947495, "step": 2189, "train_speed(iter/s)": 0.062561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/mean_length": 257.625, "completions/min_length": 223.0, "epoch": 1.8114143920595533, "grad_norm": 0.16364189982414246, "kl": 0.093505859375, "learning_rate": 2.684519706086558e-07, "loss": 0.0009344629943370819, "memory(GiB)": 38.05, "reward": 0.277774453163147, "reward_std": 0.07067995518445969, "rewards/VisualizationJSONCombinedORM/mean": 0.277774453163147, "rewards/VisualizationJSONCombinedORM/std": 0.08525431156158447, "step": 2190, "train_speed(iter/s)": 0.062538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 279.875, "completions/min_length": 220.0, "epoch": 1.8122415219189412, "grad_norm": 0.21351687610149384, "kl": 0.05169677734375, "learning_rate": 2.661233626729809e-07, "loss": 0.0005191750824451447, "memory(GiB)": 38.05, "reward": 0.4515253007411957, "reward_std": 0.05089660361409187, "rewards/VisualizationJSONCombinedORM/mean": 0.4515253007411957, "rewards/VisualizationJSONCombinedORM/std": 0.10515337437391281, "step": 2191, "train_speed(iter/s)": 0.062522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 267.0, "completions/min_length": 200.0, "epoch": 1.8130686517783292, "grad_norm": 0.18190400302410126, "kl": 0.08935546875, "learning_rate": 2.6380462204847633e-07, "loss": 0.0008944347500801086, "memory(GiB)": 38.05, "reward": 0.4846891164779663, "reward_std": 0.0651334896683693, "rewards/VisualizationJSONCombinedORM/mean": 0.4846891164779663, "rewards/VisualizationJSONCombinedORM/std": 0.07478946447372437, "step": 2192, "train_speed(iter/s)": 0.062502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 273.5625, "completions/min_length": 200.0, "epoch": 1.8138957816377173, "grad_norm": 0.18249216675758362, "kl": 0.10107421875, "learning_rate": 2.614957535683349e-07, "loss": 0.0010100491344928741, "memory(GiB)": 38.05, "reward": 0.49013620615005493, "reward_std": 0.09432581067085266, "rewards/VisualizationJSONCombinedORM/mean": 0.49013620615005493, "rewards/VisualizationJSONCombinedORM/std": 0.16969719529151917, "step": 2193, "train_speed(iter/s)": 0.062476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 288.0, "completions/min_length": 247.0, "epoch": 1.814722911497105, "grad_norm": 0.17200295627117157, "kl": 0.082275390625, "learning_rate": 2.5919676204517073e-07, "loss": 0.0008222609758377075, "memory(GiB)": 38.05, "reward": 0.41400033235549927, "reward_std": 0.055223435163497925, "rewards/VisualizationJSONCombinedORM/mean": 0.41400033235549927, "rewards/VisualizationJSONCombinedORM/std": 0.07191161811351776, "step": 2194, "train_speed(iter/s)": 0.062467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 258.9375, "completions/min_length": 214.0, "epoch": 1.815550041356493, "grad_norm": 0.20034095644950867, "kl": 0.1416015625, "learning_rate": 2.569076522710118e-07, "loss": 0.0014097802340984344, "memory(GiB)": 38.05, "reward": 0.5186225175857544, "reward_std": 0.08778096735477448, "rewards/VisualizationJSONCombinedORM/mean": 0.5186225175857544, "rewards/VisualizationJSONCombinedORM/std": 0.1746925264596939, "step": 2195, "train_speed(iter/s)": 0.06245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 271.8125, "completions/min_length": 212.0, "epoch": 1.816377171215881, "grad_norm": 0.1874234676361084, "kl": 0.108642578125, "learning_rate": 2.546284290172862e-07, "loss": 0.0010859607718884945, "memory(GiB)": 38.05, "reward": 0.4020238518714905, "reward_std": 0.04965972155332565, "rewards/VisualizationJSONCombinedORM/mean": 0.4020238518714905, "rewards/VisualizationJSONCombinedORM/std": 0.08216232061386108, "step": 2196, "train_speed(iter/s)": 0.062435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 259.8125, "completions/min_length": 231.0, "epoch": 1.817204301075269, "grad_norm": 0.1968688815832138, "kl": 0.0859375, "learning_rate": 2.523590970348166e-07, "loss": 0.0008604023605585098, "memory(GiB)": 38.05, "reward": 0.5405089259147644, "reward_std": 0.08602890372276306, "rewards/VisualizationJSONCombinedORM/mean": 0.5405089259147644, "rewards/VisualizationJSONCombinedORM/std": 0.170456200838089, "step": 2197, "train_speed(iter/s)": 0.062427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 267.9375, "completions/min_length": 210.0, "epoch": 1.8180314309346568, "grad_norm": 0.19577530026435852, "kl": 0.0814208984375, "learning_rate": 2.500996610538081e-07, "loss": 0.000814586877822876, "memory(GiB)": 38.05, "reward": 0.5728603601455688, "reward_std": 0.10062052309513092, "rewards/VisualizationJSONCombinedORM/mean": 0.5728603601455688, "rewards/VisualizationJSONCombinedORM/std": 0.10810214281082153, "step": 2198, "train_speed(iter/s)": 0.062412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 283.125, "completions/min_length": 200.0, "epoch": 1.8188585607940446, "grad_norm": 0.16272751986980438, "kl": 0.05389404296875, "learning_rate": 2.4785012578383673e-07, "loss": 0.0005394518375396729, "memory(GiB)": 38.05, "reward": 0.42174285650253296, "reward_std": 0.0316319465637207, "rewards/VisualizationJSONCombinedORM/mean": 0.42174285650253296, "rewards/VisualizationJSONCombinedORM/std": 0.1888856142759323, "step": 2199, "train_speed(iter/s)": 0.062394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 265.5, "completions/min_length": 217.0, "epoch": 1.8196856906534324, "grad_norm": 0.1743164360523224, "kl": 0.0992431640625, "learning_rate": 2.4561049591384387e-07, "loss": 0.0009928569197654724, "memory(GiB)": 38.05, "reward": 0.6045565605163574, "reward_std": 0.09120481461286545, "rewards/VisualizationJSONCombinedORM/mean": 0.6045565605163574, "rewards/VisualizationJSONCombinedORM/std": 0.26227039098739624, "step": 2200, "train_speed(iter/s)": 0.06238 }, { "epoch": 1.8196856906534324, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 313.5, "eval_completions/mean_length": 266.609375, "eval_completions/min_length": 227.5, "eval_kl": 0.07310994466145833, "eval_loss": 0.000729534775018692, "eval_reward": 0.46669595316052437, "eval_reward_std": 0.06564661193018158, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46669595316052437, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06564661441370845, "eval_runtime": 280.3417, "eval_samples_per_second": 0.086, "eval_steps_per_second": 0.011, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 252.625, "completions/min_length": 210.0, "epoch": 1.8205128205128205, "grad_norm": 0.16866819560527802, "kl": 0.05877685546875, "learning_rate": 2.433807761121221e-07, "loss": 0.000585898756980896, "memory(GiB)": 38.05, "reward": 0.45459216833114624, "reward_std": 0.0459321103990078, "rewards/VisualizationJSONCombinedORM/mean": 0.45459216833114624, "rewards/VisualizationJSONCombinedORM/std": 0.3006109595298767, "step": 2201, "train_speed(iter/s)": 0.061873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 250.625, "completions/min_length": 210.0, "epoch": 1.8213399503722085, "grad_norm": 0.21017952263355255, "kl": 0.07183837890625, "learning_rate": 2.411609710263091e-07, "loss": 0.0007189810276031494, "memory(GiB)": 38.05, "reward": 0.6759318113327026, "reward_std": 0.10458958148956299, "rewards/VisualizationJSONCombinedORM/mean": 0.6759318113327026, "rewards/VisualizationJSONCombinedORM/std": 0.17437858879566193, "step": 2202, "train_speed(iter/s)": 0.061858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 246.875, "completions/min_length": 217.0, "epoch": 1.8221670802315963, "grad_norm": 0.21402062475681305, "kl": 0.05181884765625, "learning_rate": 2.3895108528337373e-07, "loss": 0.0005192868411540985, "memory(GiB)": 38.05, "reward": 0.37435102462768555, "reward_std": 0.04648906737565994, "rewards/VisualizationJSONCombinedORM/mean": 0.37435102462768555, "rewards/VisualizationJSONCombinedORM/std": 0.050472963601350784, "step": 2203, "train_speed(iter/s)": 0.061845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 266.125, "completions/min_length": 232.0, "epoch": 1.8229942100909842, "grad_norm": 0.13383358716964722, "kl": 0.044921875, "learning_rate": 2.367511234896125e-07, "loss": 0.0004494190216064453, "memory(GiB)": 38.05, "reward": 0.6814532279968262, "reward_std": 0.07118196040391922, "rewards/VisualizationJSONCombinedORM/mean": 0.6814532279968262, "rewards/VisualizationJSONCombinedORM/std": 0.13197918236255646, "step": 2204, "train_speed(iter/s)": 0.061831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 259.1875, "completions/min_length": 216.0, "epoch": 1.8238213399503722, "grad_norm": 0.19344739615917206, "kl": 0.03411865234375, "learning_rate": 2.345610902306328e-07, "loss": 0.00034038349986076355, "memory(GiB)": 38.05, "reward": 0.609154224395752, "reward_std": 0.044522132724523544, "rewards/VisualizationJSONCombinedORM/mean": 0.609154224395752, "rewards/VisualizationJSONCombinedORM/std": 0.0654720738530159, "step": 2205, "train_speed(iter/s)": 0.061815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 278.625, "completions/min_length": 234.0, "epoch": 1.8246484698097603, "grad_norm": 0.1589978188276291, "kl": 0.14697265625, "learning_rate": 2.3238099007134973e-07, "loss": 0.001468375325202942, "memory(GiB)": 38.05, "reward": 0.680446982383728, "reward_std": 0.09706118702888489, "rewards/VisualizationJSONCombinedORM/mean": 0.680446982383728, "rewards/VisualizationJSONCombinedORM/std": 0.10864447802305222, "step": 2206, "train_speed(iter/s)": 0.061792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 272.875, "completions/min_length": 212.0, "epoch": 1.825475599669148, "grad_norm": 0.2583636939525604, "kl": 0.0633544921875, "learning_rate": 2.3021082755597223e-07, "loss": 0.0006349384784698486, "memory(GiB)": 38.05, "reward": 0.672278881072998, "reward_std": 0.0824417695403099, "rewards/VisualizationJSONCombinedORM/mean": 0.672278881072998, "rewards/VisualizationJSONCombinedORM/std": 0.1238962784409523, "step": 2207, "train_speed(iter/s)": 0.061775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 249.375, "completions/min_length": 207.0, "epoch": 1.826302729528536, "grad_norm": 0.16907677054405212, "kl": 0.074462890625, "learning_rate": 2.280506072079963e-07, "loss": 0.0007462352514266968, "memory(GiB)": 38.05, "reward": 0.25707101821899414, "reward_std": 0.03548218309879303, "rewards/VisualizationJSONCombinedORM/mean": 0.25707101821899414, "rewards/VisualizationJSONCombinedORM/std": 0.04274679720401764, "step": 2208, "train_speed(iter/s)": 0.061765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 268.6875, "completions/min_length": 232.0, "epoch": 1.827129859387924, "grad_norm": 0.17088426649570465, "kl": 0.1121826171875, "learning_rate": 2.2590033353019235e-07, "loss": 0.0011186972260475159, "memory(GiB)": 38.05, "reward": 0.461277037858963, "reward_std": 0.09452099353075027, "rewards/VisualizationJSONCombinedORM/mean": 0.461277037858963, "rewards/VisualizationJSONCombinedORM/std": 0.1190117672085762, "step": 2209, "train_speed(iter/s)": 0.061758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 281.375, "completions/min_length": 245.0, "epoch": 1.827956989247312, "grad_norm": 0.20647282898426056, "kl": 0.07080078125, "learning_rate": 2.237600110046001e-07, "loss": 0.0007077455520629883, "memory(GiB)": 38.05, "reward": 0.26925498247146606, "reward_std": 0.038149915635585785, "rewards/VisualizationJSONCombinedORM/mean": 0.26925498247146606, "rewards/VisualizationJSONCombinedORM/std": 0.061204034835100174, "step": 2210, "train_speed(iter/s)": 0.061747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 252.6875, "completions/min_length": 222.0, "epoch": 1.8287841191066998, "grad_norm": 0.17368687689304352, "kl": 0.0460205078125, "learning_rate": 2.2162964409251697e-07, "loss": 0.00046040862798690796, "memory(GiB)": 38.05, "reward": 0.5860590934753418, "reward_std": 0.05659131705760956, "rewards/VisualizationJSONCombinedORM/mean": 0.5860590934753418, "rewards/VisualizationJSONCombinedORM/std": 0.19704188406467438, "step": 2211, "train_speed(iter/s)": 0.061739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 262.125, "completions/min_length": 208.0, "epoch": 1.8296112489660876, "grad_norm": 0.1927589476108551, "kl": 0.07562255859375, "learning_rate": 2.1950923723448704e-07, "loss": 0.0007561743259429932, "memory(GiB)": 38.05, "reward": 0.5077954530715942, "reward_std": 0.06947344541549683, "rewards/VisualizationJSONCombinedORM/mean": 0.5077954530715942, "rewards/VisualizationJSONCombinedORM/std": 0.0972425788640976, "step": 2212, "train_speed(iter/s)": 0.061722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 287.0625, "completions/min_length": 241.0, "epoch": 1.8304383788254754, "grad_norm": 0.22150897979736328, "kl": 0.078857421875, "learning_rate": 2.1739879485029537e-07, "loss": 0.0007878802716732025, "memory(GiB)": 38.05, "reward": 0.49761366844177246, "reward_std": 0.06524437665939331, "rewards/VisualizationJSONCombinedORM/mean": 0.49761366844177246, "rewards/VisualizationJSONCombinedORM/std": 0.08627961575984955, "step": 2213, "train_speed(iter/s)": 0.06171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 297.375, "completions/min_length": 251.0, "epoch": 1.8312655086848635, "grad_norm": 0.16825899481773376, "kl": 0.123291015625, "learning_rate": 2.152983213389559e-07, "loss": 0.0012334603816270828, "memory(GiB)": 38.05, "reward": 0.3969697058200836, "reward_std": 0.02639719285070896, "rewards/VisualizationJSONCombinedORM/mean": 0.3969697058200836, "rewards/VisualizationJSONCombinedORM/std": 0.13622726500034332, "step": 2214, "train_speed(iter/s)": 0.061687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 282.125, "completions/min_length": 251.0, "epoch": 1.8320926385442515, "grad_norm": 0.17778682708740234, "kl": 0.05352783203125, "learning_rate": 2.1320782107870474e-07, "loss": 0.0005348995327949524, "memory(GiB)": 38.05, "reward": 0.5192127227783203, "reward_std": 0.13565057516098022, "rewards/VisualizationJSONCombinedORM/mean": 0.5192127227783203, "rewards/VisualizationJSONCombinedORM/std": 0.20235632359981537, "step": 2215, "train_speed(iter/s)": 0.061672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 265.875, "completions/min_length": 222.0, "epoch": 1.8329197684036393, "grad_norm": 0.2369738668203354, "kl": 0.08062744140625, "learning_rate": 2.11127298426988e-07, "loss": 0.0008048191666603088, "memory(GiB)": 38.05, "reward": 0.49112316966056824, "reward_std": 0.08659093081951141, "rewards/VisualizationJSONCombinedORM/mean": 0.49112316966056824, "rewards/VisualizationJSONCombinedORM/std": 0.1598891019821167, "step": 2216, "train_speed(iter/s)": 0.061653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 260.125, "completions/min_length": 205.0, "epoch": 1.8337468982630272, "grad_norm": 0.21739129722118378, "kl": 0.031768798828125, "learning_rate": 2.0905675772045608e-07, "loss": 0.00031829625368118286, "memory(GiB)": 38.05, "reward": 0.5760958194732666, "reward_std": 0.06493935734033585, "rewards/VisualizationJSONCombinedORM/mean": 0.5760958194732666, "rewards/VisualizationJSONCombinedORM/std": 0.14688508212566376, "step": 2217, "train_speed(iter/s)": 0.061641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 257.1875, "completions/min_length": 210.0, "epoch": 1.8345740281224152, "grad_norm": 0.20494423806667328, "kl": 0.06207275390625, "learning_rate": 2.0699620327495174e-07, "loss": 0.0006198771297931671, "memory(GiB)": 38.05, "reward": 0.3913351595401764, "reward_std": 0.03853441774845123, "rewards/VisualizationJSONCombinedORM/mean": 0.3913351595401764, "rewards/VisualizationJSONCombinedORM/std": 0.11763971298933029, "step": 2218, "train_speed(iter/s)": 0.061635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 269.3125, "completions/min_length": 228.0, "epoch": 1.8354011579818033, "grad_norm": 0.28257396817207336, "kl": 0.06817626953125, "learning_rate": 2.0494563938550262e-07, "loss": 0.0006802529096603394, "memory(GiB)": 38.05, "reward": 0.40386325120925903, "reward_std": 0.08628495037555695, "rewards/VisualizationJSONCombinedORM/mean": 0.40386325120925903, "rewards/VisualizationJSONCombinedORM/std": 0.11925913393497467, "step": 2219, "train_speed(iter/s)": 0.061619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 262.375, "completions/min_length": 204.0, "epoch": 1.836228287841191, "grad_norm": 0.20794889330863953, "kl": 0.035369873046875, "learning_rate": 2.0290507032631356e-07, "loss": 0.00035445764660835266, "memory(GiB)": 38.05, "reward": 0.6957353353500366, "reward_std": 0.08976374566555023, "rewards/VisualizationJSONCombinedORM/mean": 0.6957353353500366, "rewards/VisualizationJSONCombinedORM/std": 0.09667034447193146, "step": 2220, "train_speed(iter/s)": 0.061599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 301.875, "completions/min_length": 210.0, "epoch": 1.837055417700579, "grad_norm": 0.23104636371135712, "kl": 0.1063232421875, "learning_rate": 2.008745003507534e-07, "loss": 0.0010638050734996796, "memory(GiB)": 38.05, "reward": 0.6143841743469238, "reward_std": 0.09731046855449677, "rewards/VisualizationJSONCombinedORM/mean": 0.6143841743469238, "rewards/VisualizationJSONCombinedORM/std": 0.10367272794246674, "step": 2221, "train_speed(iter/s)": 0.061586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 249.625, "completions/min_length": 193.0, "epoch": 1.837882547559967, "grad_norm": 0.2542349100112915, "kl": 0.13873291015625, "learning_rate": 1.9885393369134976e-07, "loss": 0.0013890787959098816, "memory(GiB)": 38.05, "reward": 0.5229530334472656, "reward_std": 0.09076064825057983, "rewards/VisualizationJSONCombinedORM/mean": 0.5229530334472656, "rewards/VisualizationJSONCombinedORM/std": 0.1129799485206604, "step": 2222, "train_speed(iter/s)": 0.061569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 275.5625, "completions/min_length": 206.0, "epoch": 1.838709677419355, "grad_norm": 0.160517618060112, "kl": 0.03240966796875, "learning_rate": 1.9684337455978085e-07, "loss": 0.00032451748847961426, "memory(GiB)": 38.05, "reward": 0.3686025142669678, "reward_std": 0.04777069389820099, "rewards/VisualizationJSONCombinedORM/mean": 0.3686025142669678, "rewards/VisualizationJSONCombinedORM/std": 0.1526394933462143, "step": 2223, "train_speed(iter/s)": 0.061555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 255.6875, "completions/min_length": 209.0, "epoch": 1.8395368072787428, "grad_norm": 0.19325679540634155, "kl": 0.11456298828125, "learning_rate": 1.9484282714686442e-07, "loss": 0.0011454150080680847, "memory(GiB)": 38.05, "reward": 0.4198353886604309, "reward_std": 0.04754021763801575, "rewards/VisualizationJSONCombinedORM/mean": 0.4198353886604309, "rewards/VisualizationJSONCombinedORM/std": 0.15528389811515808, "step": 2224, "train_speed(iter/s)": 0.061539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 279.1875, "completions/min_length": 225.0, "epoch": 1.8403639371381306, "grad_norm": 0.2284521907567978, "kl": 0.06146240234375, "learning_rate": 1.928522956225487e-07, "loss": 0.0006136223673820496, "memory(GiB)": 38.05, "reward": 0.39382120966911316, "reward_std": 0.06413796544075012, "rewards/VisualizationJSONCombinedORM/mean": 0.39382120966911316, "rewards/VisualizationJSONCombinedORM/std": 0.07319977879524231, "step": 2225, "train_speed(iter/s)": 0.061524 }, { "epoch": 1.8403639371381306, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 323.7916666666667, "eval_completions/mean_length": 267.9947916666667, "eval_completions/min_length": 226.20833333333334, "eval_kl": 0.06496175130208333, "eval_loss": 0.0006556262378580868, "eval_reward": 0.47069465182721615, "eval_reward_std": 0.05918991282427063, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47069465182721615, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05918991377499575, "eval_runtime": 286.6634, "eval_samples_per_second": 0.084, "eval_steps_per_second": 0.01, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 286.375, "completions/min_length": 221.0, "epoch": 1.8411910669975184, "grad_norm": 0.2277468889951706, "kl": 0.0728759765625, "learning_rate": 1.908717841359048e-07, "loss": 0.0007293485105037689, "memory(GiB)": 38.05, "reward": 0.40178829431533813, "reward_std": 0.050865791738033295, "rewards/VisualizationJSONCombinedORM/mean": 0.40178829431533813, "rewards/VisualizationJSONCombinedORM/std": 0.09017889946699142, "step": 2226, "train_speed(iter/s)": 0.061026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 283.0, "completions/min_length": 228.0, "epoch": 1.8420181968569065, "grad_norm": 0.15968815982341766, "kl": 0.074951171875, "learning_rate": 1.8890129681511992e-07, "loss": 0.0007493309676647186, "memory(GiB)": 38.05, "reward": 0.6798827648162842, "reward_std": 0.06739852577447891, "rewards/VisualizationJSONCombinedORM/mean": 0.6798827648162842, "rewards/VisualizationJSONCombinedORM/std": 0.0984639972448349, "step": 2227, "train_speed(iter/s)": 0.061007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 270.6875, "completions/min_length": 220.0, "epoch": 1.8428453267162945, "grad_norm": 0.19358137249946594, "kl": 0.187744140625, "learning_rate": 1.8694083776748472e-07, "loss": 0.0018786638975143433, "memory(GiB)": 38.05, "reward": 0.568968653678894, "reward_std": 0.09951925277709961, "rewards/VisualizationJSONCombinedORM/mean": 0.568968653678894, "rewards/VisualizationJSONCombinedORM/std": 0.1932188868522644, "step": 2228, "train_speed(iter/s)": 0.060997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 261.125, "completions/min_length": 204.0, "epoch": 1.8436724565756824, "grad_norm": 0.16844560205936432, "kl": 0.0362548828125, "learning_rate": 1.849904110793893e-07, "loss": 0.00036276131868362427, "memory(GiB)": 38.05, "reward": 0.44902974367141724, "reward_std": 0.04362597316503525, "rewards/VisualizationJSONCombinedORM/mean": 0.44902974367141724, "rewards/VisualizationJSONCombinedORM/std": 0.15520356595516205, "step": 2229, "train_speed(iter/s)": 0.060989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 255.125, "completions/min_length": 195.0, "epoch": 1.8444995864350702, "grad_norm": 0.18581627309322357, "kl": 0.035400390625, "learning_rate": 1.8305002081630885e-07, "loss": 0.0003539174795150757, "memory(GiB)": 38.05, "reward": 0.7548621892929077, "reward_std": 0.07124552875757217, "rewards/VisualizationJSONCombinedORM/mean": 0.7548621892929077, "rewards/VisualizationJSONCombinedORM/std": 0.10888580977916718, "step": 2230, "train_speed(iter/s)": 0.060974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 284.25, "completions/min_length": 228.0, "epoch": 1.8453267162944582, "grad_norm": 0.13759173452854156, "kl": 0.072021484375, "learning_rate": 1.8111967102280082e-07, "loss": 0.0007198750972747803, "memory(GiB)": 38.05, "reward": 0.3296951651573181, "reward_std": 0.03723347187042236, "rewards/VisualizationJSONCombinedORM/mean": 0.3296951651573181, "rewards/VisualizationJSONCombinedORM/std": 0.10654368251562119, "step": 2231, "train_speed(iter/s)": 0.06096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 284.875, "completions/min_length": 236.0, "epoch": 1.8461538461538463, "grad_norm": 0.1904783397912979, "kl": 0.065673828125, "learning_rate": 1.7919936572249442e-07, "loss": 0.0006568431854248047, "memory(GiB)": 38.05, "reward": 0.6202201843261719, "reward_std": 0.04610806703567505, "rewards/VisualizationJSONCombinedORM/mean": 0.6202201843261719, "rewards/VisualizationJSONCombinedORM/std": 0.23601724207401276, "step": 2232, "train_speed(iter/s)": 0.060946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 243.125, "completions/min_length": 212.0, "epoch": 1.846980976013234, "grad_norm": 0.20274631679058075, "kl": 0.059326171875, "learning_rate": 1.7728910891808283e-07, "loss": 0.0005935728549957275, "memory(GiB)": 38.05, "reward": 0.6456728577613831, "reward_std": 0.10657583922147751, "rewards/VisualizationJSONCombinedORM/mean": 0.6456728577613831, "rewards/VisualizationJSONCombinedORM/std": 0.15199780464172363, "step": 2233, "train_speed(iter/s)": 0.060934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 281.1875, "completions/min_length": 226.0, "epoch": 1.847808105872622, "grad_norm": 0.18127015233039856, "kl": 0.06298828125, "learning_rate": 1.7538890459131098e-07, "loss": 0.0006292164325714111, "memory(GiB)": 38.05, "reward": 0.5858919620513916, "reward_std": 0.0958469808101654, "rewards/VisualizationJSONCombinedORM/mean": 0.5858919620513916, "rewards/VisualizationJSONCombinedORM/std": 0.09414970874786377, "step": 2234, "train_speed(iter/s)": 0.060919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 268.625, "completions/min_length": 210.0, "epoch": 1.84863523573201, "grad_norm": 0.1703117936849594, "kl": 0.0526123046875, "learning_rate": 1.7349875670297279e-07, "loss": 0.0005262084305286407, "memory(GiB)": 38.05, "reward": 0.4184831380844116, "reward_std": 0.05968456342816353, "rewards/VisualizationJSONCombinedORM/mean": 0.4184831380844116, "rewards/VisualizationJSONCombinedORM/std": 0.10552117228507996, "step": 2235, "train_speed(iter/s)": 0.060907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 261.875, "completions/min_length": 216.0, "epoch": 1.849462365591398, "grad_norm": 0.1678425818681717, "kl": 0.05712890625, "learning_rate": 1.7161866919290004e-07, "loss": 0.0005711782723665237, "memory(GiB)": 38.05, "reward": 0.5820192098617554, "reward_std": 0.08462627232074738, "rewards/VisualizationJSONCombinedORM/mean": 0.5820192098617554, "rewards/VisualizationJSONCombinedORM/std": 0.1343500167131424, "step": 2236, "train_speed(iter/s)": 0.0609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 278.1875, "completions/min_length": 220.0, "epoch": 1.8502894954507858, "grad_norm": 0.17398899793624878, "kl": 0.0665283203125, "learning_rate": 1.6974864597995465e-07, "loss": 0.0006647147238254547, "memory(GiB)": 38.05, "reward": 0.6508504748344421, "reward_std": 0.08079026639461517, "rewards/VisualizationJSONCombinedORM/mean": 0.6508504748344421, "rewards/VisualizationJSONCombinedORM/std": 0.09930317848920822, "step": 2237, "train_speed(iter/s)": 0.060891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 298.375, "completions/min_length": 246.0, "epoch": 1.8511166253101736, "grad_norm": 0.17905494570732117, "kl": 0.077392578125, "learning_rate": 1.6788869096202197e-07, "loss": 0.0007739663124084473, "memory(GiB)": 38.05, "reward": 0.5474324822425842, "reward_std": 0.05115745589137077, "rewards/VisualizationJSONCombinedORM/mean": 0.5474324822425842, "rewards/VisualizationJSONCombinedORM/std": 0.14385627210140228, "step": 2238, "train_speed(iter/s)": 0.060873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 280.3125, "completions/min_length": 213.0, "epoch": 1.8519437551695617, "grad_norm": 0.19565832614898682, "kl": 0.052978515625, "learning_rate": 1.6603880801599636e-07, "loss": 0.0005286596715450287, "memory(GiB)": 38.05, "reward": 0.5645193457603455, "reward_std": 0.112640880048275, "rewards/VisualizationJSONCombinedORM/mean": 0.5645193457603455, "rewards/VisualizationJSONCombinedORM/std": 0.23615682125091553, "step": 2239, "train_speed(iter/s)": 0.06086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 292.875, "completions/min_length": 214.0, "epoch": 1.8527708850289497, "grad_norm": 0.2172870934009552, "kl": 0.1138916015625, "learning_rate": 1.641990009977834e-07, "loss": 0.0011363029479980469, "memory(GiB)": 38.05, "reward": 0.7227950096130371, "reward_std": 0.09636066854000092, "rewards/VisualizationJSONCombinedORM/mean": 0.7227950096130371, "rewards/VisualizationJSONCombinedORM/std": 0.0961068719625473, "step": 2240, "train_speed(iter/s)": 0.060839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 287.3125, "completions/min_length": 235.0, "epoch": 1.8535980148883375, "grad_norm": 0.2400372326374054, "kl": 0.0504150390625, "learning_rate": 1.6236927374228274e-07, "loss": 0.0005045868456363678, "memory(GiB)": 38.05, "reward": 0.3623437285423279, "reward_std": 0.057775042951107025, "rewards/VisualizationJSONCombinedORM/mean": 0.3623437285423279, "rewards/VisualizationJSONCombinedORM/std": 0.06728789955377579, "step": 2241, "train_speed(iter/s)": 0.060828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 285.1875, "completions/min_length": 230.0, "epoch": 1.8544251447477254, "grad_norm": 0.22097881138324738, "kl": 0.06707763671875, "learning_rate": 1.6054963006338742e-07, "loss": 0.0006703119724988937, "memory(GiB)": 38.05, "reward": 0.599342405796051, "reward_std": 0.0781494528055191, "rewards/VisualizationJSONCombinedORM/mean": 0.599342405796051, "rewards/VisualizationJSONCombinedORM/std": 0.0797368511557579, "step": 2242, "train_speed(iter/s)": 0.060818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 264.3125, "completions/min_length": 213.0, "epoch": 1.8552522746071132, "grad_norm": 0.18788352608680725, "kl": 0.0325927734375, "learning_rate": 1.5874007375396793e-07, "loss": 0.0003265589475631714, "memory(GiB)": 38.05, "reward": 0.4190034568309784, "reward_std": 0.0607609748840332, "rewards/VisualizationJSONCombinedORM/mean": 0.4190034568309784, "rewards/VisualizationJSONCombinedORM/std": 0.17613396048545837, "step": 2243, "train_speed(iter/s)": 0.060806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 295.0, "completions/min_length": 219.0, "epoch": 1.8560794044665012, "grad_norm": 0.197090744972229, "kl": 0.04571533203125, "learning_rate": 1.5694060858587046e-07, "loss": 0.0004569580778479576, "memory(GiB)": 38.05, "reward": 0.3900632858276367, "reward_std": 0.04481574892997742, "rewards/VisualizationJSONCombinedORM/mean": 0.3900632858276367, "rewards/VisualizationJSONCombinedORM/std": 0.11131976544857025, "step": 2244, "train_speed(iter/s)": 0.060793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 256.375, "completions/min_length": 209.0, "epoch": 1.8569065343258893, "grad_norm": 0.22110076248645782, "kl": 0.06304931640625, "learning_rate": 1.5515123830990797e-07, "loss": 0.0006296411156654358, "memory(GiB)": 38.05, "reward": 0.43009430170059204, "reward_std": 0.06489633023738861, "rewards/VisualizationJSONCombinedORM/mean": 0.43009430170059204, "rewards/VisualizationJSONCombinedORM/std": 0.09321320801973343, "step": 2245, "train_speed(iter/s)": 0.060784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 253.4375, "completions/min_length": 218.0, "epoch": 1.857733664185277, "grad_norm": 0.18839745223522186, "kl": 0.0693359375, "learning_rate": 1.533719666558514e-07, "loss": 0.0006956644356250763, "memory(GiB)": 38.05, "reward": 0.6772950291633606, "reward_std": 0.07759460806846619, "rewards/VisualizationJSONCombinedORM/mean": 0.6772950291633606, "rewards/VisualizationJSONCombinedORM/std": 0.07817187905311584, "step": 2246, "train_speed(iter/s)": 0.060777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 270.5, "completions/min_length": 211.0, "epoch": 1.858560794044665, "grad_norm": 0.1751425415277481, "kl": 0.11334228515625, "learning_rate": 1.5160279733242133e-07, "loss": 0.0011363662779331207, "memory(GiB)": 38.05, "reward": 0.4980331063270569, "reward_std": 0.08503696322441101, "rewards/VisualizationJSONCombinedORM/mean": 0.4980331063270569, "rewards/VisualizationJSONCombinedORM/std": 0.11138036102056503, "step": 2247, "train_speed(iter/s)": 0.060757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 256.6875, "completions/min_length": 219.0, "epoch": 1.859387923904053, "grad_norm": 0.19962112605571747, "kl": 0.0943603515625, "learning_rate": 1.4984373402728014e-07, "loss": 0.000943424180150032, "memory(GiB)": 38.05, "reward": 0.7117875218391418, "reward_std": 0.07824008911848068, "rewards/VisualizationJSONCombinedORM/mean": 0.7117875218391418, "rewards/VisualizationJSONCombinedORM/std": 0.09197964519262314, "step": 2248, "train_speed(iter/s)": 0.06075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 260.3125, "completions/min_length": 208.0, "epoch": 1.860215053763441, "grad_norm": 0.16654044389724731, "kl": 0.0540771484375, "learning_rate": 1.4809478040702763e-07, "loss": 0.0005406923592090607, "memory(GiB)": 38.05, "reward": 0.4898870289325714, "reward_std": 0.0771799385547638, "rewards/VisualizationJSONCombinedORM/mean": 0.4898870289325714, "rewards/VisualizationJSONCombinedORM/std": 0.20641915500164032, "step": 2249, "train_speed(iter/s)": 0.060728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 284.125, "completions/min_length": 231.0, "epoch": 1.8610421836228288, "grad_norm": 0.17383679747581482, "kl": 0.05609130859375, "learning_rate": 1.4635594011718935e-07, "loss": 0.0005597174167633057, "memory(GiB)": 38.05, "reward": 0.6404058337211609, "reward_std": 0.04812163859605789, "rewards/VisualizationJSONCombinedORM/mean": 0.6404058337211609, "rewards/VisualizationJSONCombinedORM/std": 0.19570763409137726, "step": 2250, "train_speed(iter/s)": 0.060715 }, { "epoch": 1.8610421836228288, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 319.75, "eval_completions/mean_length": 270.8802083333333, "eval_completions/min_length": 229.33333333333334, "eval_kl": 0.07392374674479167, "eval_loss": 0.0007403381168842316, "eval_reward": 0.4586419401069482, "eval_reward_std": 0.0709614703276505, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4586419401069482, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07096147141419351, "eval_runtime": 283.9302, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.011, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 255.5, "completions/min_length": 210.0, "epoch": 1.8618693134822166, "grad_norm": 0.189181849360466, "kl": 0.0826416015625, "learning_rate": 1.4462721678221103e-07, "loss": 0.0008264258503913879, "memory(GiB)": 38.05, "reward": 0.3890233635902405, "reward_std": 0.06035054475069046, "rewards/VisualizationJSONCombinedORM/mean": 0.3890233635902405, "rewards/VisualizationJSONCombinedORM/std": 0.1262645423412323, "step": 2251, "train_speed(iter/s)": 0.060241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 285.625, "completions/min_length": 239.0, "epoch": 1.8626964433416047, "grad_norm": 0.1958315670490265, "kl": 0.09759521484375, "learning_rate": 1.4290861400545031e-07, "loss": 0.0009743534028530121, "memory(GiB)": 38.05, "reward": 0.43929630517959595, "reward_std": 0.04502567648887634, "rewards/VisualizationJSONCombinedORM/mean": 0.43929630517959595, "rewards/VisualizationJSONCombinedORM/std": 0.0865287333726883, "step": 2252, "train_speed(iter/s)": 0.060228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 276.3125, "completions/min_length": 236.0, "epoch": 1.8635235732009927, "grad_norm": 0.22385947406291962, "kl": 0.0626220703125, "learning_rate": 1.412001353691689e-07, "loss": 0.0006252676248550415, "memory(GiB)": 38.05, "reward": 0.5947912931442261, "reward_std": 0.08641058206558228, "rewards/VisualizationJSONCombinedORM/mean": 0.5947912931442261, "rewards/VisualizationJSONCombinedORM/std": 0.16724498569965363, "step": 2253, "train_speed(iter/s)": 0.060223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 283.6875, "completions/min_length": 238.0, "epoch": 1.8643507030603805, "grad_norm": 0.1957932859659195, "kl": 0.066162109375, "learning_rate": 1.39501784434527e-07, "loss": 0.0006614699959754944, "memory(GiB)": 38.05, "reward": 0.696805477142334, "reward_std": 0.07939012348651886, "rewards/VisualizationJSONCombinedORM/mean": 0.696805477142334, "rewards/VisualizationJSONCombinedORM/std": 0.09275619685649872, "step": 2254, "train_speed(iter/s)": 0.060213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 276.1875, "completions/min_length": 230.0, "epoch": 1.8651778329197684, "grad_norm": 0.23966380953788757, "kl": 0.07452392578125, "learning_rate": 1.378135647415746e-07, "loss": 0.0007445439696311951, "memory(GiB)": 38.05, "reward": 0.4982792139053345, "reward_std": 0.06668878346681595, "rewards/VisualizationJSONCombinedORM/mean": 0.4982792139053345, "rewards/VisualizationJSONCombinedORM/std": 0.09370885044336319, "step": 2255, "train_speed(iter/s)": 0.060205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 259.3125, "completions/min_length": 203.0, "epoch": 1.8660049627791562, "grad_norm": 0.17381995916366577, "kl": 0.083984375, "learning_rate": 1.361354798092429e-07, "loss": 0.0008384063839912415, "memory(GiB)": 38.05, "reward": 0.6361473798751831, "reward_std": 0.09005890041589737, "rewards/VisualizationJSONCombinedORM/mean": 0.6361473798751831, "rewards/VisualizationJSONCombinedORM/std": 0.11834166944026947, "step": 2256, "train_speed(iter/s)": 0.060197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 244.0, "completions/min_length": 210.0, "epoch": 1.8668320926385442, "grad_norm": 0.1973959356546402, "kl": 0.06927490234375, "learning_rate": 1.3446753313533846e-07, "loss": 0.0006936788558959961, "memory(GiB)": 38.05, "reward": 0.4519690275192261, "reward_std": 0.040016431361436844, "rewards/VisualizationJSONCombinedORM/mean": 0.4519690275192261, "rewards/VisualizationJSONCombinedORM/std": 0.2674277722835541, "step": 2257, "train_speed(iter/s)": 0.060187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 262.5625, "completions/min_length": 231.0, "epoch": 1.8676592224979323, "grad_norm": 0.18173231184482574, "kl": 0.047607421875, "learning_rate": 1.328097281965357e-07, "loss": 0.0004764646291732788, "memory(GiB)": 38.05, "reward": 0.37393057346343994, "reward_std": 0.06285598874092102, "rewards/VisualizationJSONCombinedORM/mean": 0.37393057346343994, "rewards/VisualizationJSONCombinedORM/std": 0.1802433580160141, "step": 2258, "train_speed(iter/s)": 0.060173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 269.25, "completions/min_length": 199.0, "epoch": 1.86848635235732, "grad_norm": 0.1938035488128662, "kl": 0.076416015625, "learning_rate": 1.311620684483711e-07, "loss": 0.0007638335227966309, "memory(GiB)": 38.05, "reward": 0.5834864974021912, "reward_std": 0.0872059166431427, "rewards/VisualizationJSONCombinedORM/mean": 0.5834864974021912, "rewards/VisualizationJSONCombinedORM/std": 0.08856550604104996, "step": 2259, "train_speed(iter/s)": 0.060162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 255.3125, "completions/min_length": 202.0, "epoch": 1.869313482216708, "grad_norm": 0.19158238172531128, "kl": 0.052734375, "learning_rate": 1.2952455732523238e-07, "loss": 0.000527799129486084, "memory(GiB)": 38.05, "reward": 0.34206026792526245, "reward_std": 0.034393906593322754, "rewards/VisualizationJSONCombinedORM/mean": 0.34206026792526245, "rewards/VisualizationJSONCombinedORM/std": 0.03411738947033882, "step": 2260, "train_speed(iter/s)": 0.060145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 249.4375, "completions/min_length": 200.0, "epoch": 1.870140612076096, "grad_norm": 0.1775321662425995, "kl": 0.0521240234375, "learning_rate": 1.2789719824035373e-07, "loss": 0.0005216319113969803, "memory(GiB)": 38.05, "reward": 0.5555534362792969, "reward_std": 0.049858905375003815, "rewards/VisualizationJSONCombinedORM/mean": 0.5555534362792969, "rewards/VisualizationJSONCombinedORM/std": 0.21372461318969727, "step": 2261, "train_speed(iter/s)": 0.060143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 249.3125, "completions/min_length": 191.0, "epoch": 1.870967741935484, "grad_norm": 0.20571903884410858, "kl": 0.0919189453125, "learning_rate": 1.2627999458580952e-07, "loss": 0.0009196028113365173, "memory(GiB)": 38.05, "reward": 0.43273061513900757, "reward_std": 0.11123374104499817, "rewards/VisualizationJSONCombinedORM/mean": 0.43273061513900757, "rewards/VisualizationJSONCombinedORM/std": 0.1303277462720871, "step": 2262, "train_speed(iter/s)": 0.060129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 265.5625, "completions/min_length": 222.0, "epoch": 1.8717948717948718, "grad_norm": 0.18519428372383118, "kl": 0.05224609375, "learning_rate": 1.2467294973250554e-07, "loss": 0.0005232356488704681, "memory(GiB)": 38.05, "reward": 0.43132084608078003, "reward_std": 0.06085435301065445, "rewards/VisualizationJSONCombinedORM/mean": 0.43132084608078003, "rewards/VisualizationJSONCombinedORM/std": 0.1254313439130783, "step": 2263, "train_speed(iter/s)": 0.060119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 266.5625, "completions/min_length": 206.0, "epoch": 1.8726220016542596, "grad_norm": 0.16308225691318512, "kl": 0.1123046875, "learning_rate": 1.2307606703017173e-07, "loss": 0.001119311898946762, "memory(GiB)": 38.05, "reward": 0.5823022723197937, "reward_std": 0.08651651442050934, "rewards/VisualizationJSONCombinedORM/mean": 0.5823022723197937, "rewards/VisualizationJSONCombinedORM/std": 0.15160933136940002, "step": 2264, "train_speed(iter/s)": 0.060102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 289.0625, "completions/min_length": 230.0, "epoch": 1.8734491315136477, "grad_norm": 0.1907249242067337, "kl": 0.0479736328125, "learning_rate": 1.2148934980735772e-07, "loss": 0.00047948211431503296, "memory(GiB)": 38.05, "reward": 0.5307152271270752, "reward_std": 0.06538674235343933, "rewards/VisualizationJSONCombinedORM/mean": 0.5307152271270752, "rewards/VisualizationJSONCombinedORM/std": 0.10323086380958557, "step": 2265, "train_speed(iter/s)": 0.060088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 254.4375, "completions/min_length": 207.0, "epoch": 1.8742762613730357, "grad_norm": 0.17090904712677002, "kl": 0.036224365234375, "learning_rate": 1.199128013714218e-07, "loss": 0.0003617517650127411, "memory(GiB)": 38.05, "reward": 0.5402181148529053, "reward_std": 0.05991438776254654, "rewards/VisualizationJSONCombinedORM/mean": 0.5402181148529053, "rewards/VisualizationJSONCombinedORM/std": 0.08701292425394058, "step": 2266, "train_speed(iter/s)": 0.060077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 276.4375, "completions/min_length": 195.0, "epoch": 1.8751033912324235, "grad_norm": 0.18055909872055054, "kl": 0.072998046875, "learning_rate": 1.1834642500852867e-07, "loss": 0.0007284115999937057, "memory(GiB)": 38.05, "reward": 0.5496773719787598, "reward_std": 0.029561253264546394, "rewards/VisualizationJSONCombinedORM/mean": 0.5496773719787598, "rewards/VisualizationJSONCombinedORM/std": 0.07495911419391632, "step": 2267, "train_speed(iter/s)": 0.06006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 262.1875, "completions/min_length": 220.0, "epoch": 1.8759305210918114, "grad_norm": 0.16001670062541962, "kl": 0.03582763671875, "learning_rate": 1.1679022398363937e-07, "loss": 0.0003580451011657715, "memory(GiB)": 38.05, "reward": 0.606752872467041, "reward_std": 0.059757307171821594, "rewards/VisualizationJSONCombinedORM/mean": 0.606752872467041, "rewards/VisualizationJSONCombinedORM/std": 0.2169458419084549, "step": 2268, "train_speed(iter/s)": 0.060053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 275.375, "completions/min_length": 216.0, "epoch": 1.8767576509511992, "grad_norm": 0.21032291650772095, "kl": 0.03936767578125, "learning_rate": 1.1524420154050586e-07, "loss": 0.00039330869913101196, "memory(GiB)": 38.05, "reward": 0.6798622608184814, "reward_std": 0.09266163408756256, "rewards/VisualizationJSONCombinedORM/mean": 0.6798622608184814, "rewards/VisualizationJSONCombinedORM/std": 0.10284445434808731, "step": 2269, "train_speed(iter/s)": 0.060042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 257.75, "completions/min_length": 223.0, "epoch": 1.8775847808105872, "grad_norm": 0.17887091636657715, "kl": 0.0623779296875, "learning_rate": 1.1370836090166204e-07, "loss": 0.000623771920800209, "memory(GiB)": 38.05, "reward": 0.7378284931182861, "reward_std": 0.07232920825481415, "rewards/VisualizationJSONCombinedORM/mean": 0.7378284931182861, "rewards/VisualizationJSONCombinedORM/std": 0.08061560988426208, "step": 2270, "train_speed(iter/s)": 0.060031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 276.25, "completions/min_length": 204.0, "epoch": 1.8784119106699753, "grad_norm": 0.1887574940919876, "kl": 0.0496826171875, "learning_rate": 1.1218270526842102e-07, "loss": 0.0004969239234924316, "memory(GiB)": 38.05, "reward": 0.24287766218185425, "reward_std": 0.03204599767923355, "rewards/VisualizationJSONCombinedORM/mean": 0.24287766218185425, "rewards/VisualizationJSONCombinedORM/std": 0.10178761929273605, "step": 2271, "train_speed(iter/s)": 0.060018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 258.8125, "completions/min_length": 187.0, "epoch": 1.879239040529363, "grad_norm": 0.16692744195461273, "kl": 0.03460693359375, "learning_rate": 1.1066723782086619e-07, "loss": 0.00034638121724128723, "memory(GiB)": 38.05, "reward": 0.47813814878463745, "reward_std": 0.045944541692733765, "rewards/VisualizationJSONCombinedORM/mean": 0.47813814878463745, "rewards/VisualizationJSONCombinedORM/std": 0.1690061092376709, "step": 2272, "train_speed(iter/s)": 0.060007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 312.3125, "completions/min_length": 273.0, "epoch": 1.880066170388751, "grad_norm": 0.2149229496717453, "kl": 0.085693359375, "learning_rate": 1.0916196171784299e-07, "loss": 0.000856718048453331, "memory(GiB)": 38.05, "reward": 0.2848227620124817, "reward_std": 0.05284491181373596, "rewards/VisualizationJSONCombinedORM/mean": 0.2848227620124817, "rewards/VisualizationJSONCombinedORM/std": 0.0953560620546341, "step": 2273, "train_speed(iter/s)": 0.059992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 286.625, "completions/min_length": 222.0, "epoch": 1.880893300248139, "grad_norm": 0.17841848731040955, "kl": 0.04998779296875, "learning_rate": 1.0766688009695548e-07, "loss": 0.0005003660917282104, "memory(GiB)": 38.05, "reward": 0.6036558151245117, "reward_std": 0.10519315302371979, "rewards/VisualizationJSONCombinedORM/mean": 0.6036558151245117, "rewards/VisualizationJSONCombinedORM/std": 0.12729734182357788, "step": 2274, "train_speed(iter/s)": 0.059971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 264.375, "completions/min_length": 204.0, "epoch": 1.881720430107527, "grad_norm": 0.17191408574581146, "kl": 0.04534912109375, "learning_rate": 1.0618199607455637e-07, "loss": 0.0004529803991317749, "memory(GiB)": 38.05, "reward": 0.2425576001405716, "reward_std": 0.028622113168239594, "rewards/VisualizationJSONCombinedORM/mean": 0.2425576001405716, "rewards/VisualizationJSONCombinedORM/std": 0.0648711770772934, "step": 2275, "train_speed(iter/s)": 0.05996 }, { "epoch": 1.881720430107527, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 332.5, "eval_completions/mean_length": 270.390625, "eval_completions/min_length": 227.83333333333334, "eval_kl": 0.07127888997395833, "eval_loss": 0.0007222853600978851, "eval_reward": 0.4629281734426816, "eval_reward_std": 0.07151102367788553, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4629281734426816, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07151102523008983, "eval_runtime": 291.2224, "eval_samples_per_second": 0.082, "eval_steps_per_second": 0.01, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 283.5, "completions/min_length": 227.0, "epoch": 1.8825475599669148, "grad_norm": 0.2401735633611679, "kl": 0.06146240234375, "learning_rate": 1.0470731274574542e-07, "loss": 0.0006143748760223389, "memory(GiB)": 38.05, "reward": 0.44781458377838135, "reward_std": 0.08177820593118668, "rewards/VisualizationJSONCombinedORM/mean": 0.44781458377838135, "rewards/VisualizationJSONCombinedORM/std": 0.20064584910869598, "step": 2276, "train_speed(iter/s)": 0.059493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 274.25, "completions/min_length": 223.0, "epoch": 1.8833746898263026, "grad_norm": 0.16256077587604523, "kl": 0.0484619140625, "learning_rate": 1.0324283318435713e-07, "loss": 0.00048534199595451355, "memory(GiB)": 38.05, "reward": 0.7195348739624023, "reward_std": 0.06557919830083847, "rewards/VisualizationJSONCombinedORM/mean": 0.7195348739624023, "rewards/VisualizationJSONCombinedORM/std": 0.08029957115650177, "step": 2277, "train_speed(iter/s)": 0.059475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 253.5625, "completions/min_length": 227.0, "epoch": 1.8842018196856907, "grad_norm": 0.21121369302272797, "kl": 0.03875732421875, "learning_rate": 1.0178856044295971e-07, "loss": 0.00038792937994003296, "memory(GiB)": 38.05, "reward": 0.3622583746910095, "reward_std": 0.0699990838766098, "rewards/VisualizationJSONCombinedORM/mean": 0.3622583746910095, "rewards/VisualizationJSONCombinedORM/std": 0.17420904338359833, "step": 2278, "train_speed(iter/s)": 0.059464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 242.5, "completions/min_length": 222.0, "epoch": 1.8850289495450787, "grad_norm": 0.15182672441005707, "kl": 0.05853271484375, "learning_rate": 1.0034449755284392e-07, "loss": 0.0005870945751667023, "memory(GiB)": 38.05, "reward": 0.5835927724838257, "reward_std": 0.05533764138817787, "rewards/VisualizationJSONCombinedORM/mean": 0.5835927724838257, "rewards/VisualizationJSONCombinedORM/std": 0.13321734964847565, "step": 2279, "train_speed(iter/s)": 0.059453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 266.5625, "completions/min_length": 223.0, "epoch": 1.8858560794044665, "grad_norm": 0.13916318118572235, "kl": 0.075927734375, "learning_rate": 9.891064752402091e-08, "loss": 0.000759270042181015, "memory(GiB)": 38.05, "reward": 0.5578948259353638, "reward_std": 0.06822869926691055, "rewards/VisualizationJSONCombinedORM/mean": 0.5578948259353638, "rewards/VisualizationJSONCombinedORM/std": 0.1603642851114273, "step": 2280, "train_speed(iter/s)": 0.059442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 280.6875, "completions/min_length": 242.0, "epoch": 1.8866832092638544, "grad_norm": 0.20549151301383972, "kl": 0.1368408203125, "learning_rate": 9.748701334521215e-08, "loss": 0.00136638805270195, "memory(GiB)": 38.05, "reward": 0.48514989018440247, "reward_std": 0.0537065826356411, "rewards/VisualizationJSONCombinedORM/mean": 0.48514989018440247, "rewards/VisualizationJSONCombinedORM/std": 0.23971813917160034, "step": 2281, "train_speed(iter/s)": 0.059432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 272.8125, "completions/min_length": 241.0, "epoch": 1.8875103391232424, "grad_norm": 0.1771368533372879, "kl": 0.03863525390625, "learning_rate": 9.607359798384785e-08, "loss": 0.0003862902522087097, "memory(GiB)": 38.05, "reward": 0.5070356130599976, "reward_std": 0.05549589544534683, "rewards/VisualizationJSONCombinedORM/mean": 0.5070356130599976, "rewards/VisualizationJSONCombinedORM/std": 0.11006447672843933, "step": 2282, "train_speed(iter/s)": 0.059419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 276.375, "completions/min_length": 221.0, "epoch": 1.8883374689826302, "grad_norm": 0.17857810854911804, "kl": 0.05560302734375, "learning_rate": 9.467040438605579e-08, "loss": 0.0005565956234931946, "memory(GiB)": 38.05, "reward": 0.628253698348999, "reward_std": 0.13416951894760132, "rewards/VisualizationJSONCombinedORM/mean": 0.628253698348999, "rewards/VisualizationJSONCombinedORM/std": 0.13022395968437195, "step": 2283, "train_speed(iter/s)": 0.059405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 266.625, "completions/min_length": 215.0, "epoch": 1.8891645988420183, "grad_norm": 0.2066521793603897, "kl": 0.1715087890625, "learning_rate": 9.327743547665858e-08, "loss": 0.0017146840691566467, "memory(GiB)": 38.05, "reward": 0.40622639656066895, "reward_std": 0.07041267305612564, "rewards/VisualizationJSONCombinedORM/mean": 0.40622639656066895, "rewards/VisualizationJSONCombinedORM/std": 0.1847519725561142, "step": 2284, "train_speed(iter/s)": 0.059394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/mean_length": 264.625, "completions/min_length": 213.0, "epoch": 1.889991728701406, "grad_norm": 0.2898901402950287, "kl": 0.0927734375, "learning_rate": 9.189469415916586e-08, "loss": 0.0009296871721744537, "memory(GiB)": 38.05, "reward": 0.4514607787132263, "reward_std": 0.06631234288215637, "rewards/VisualizationJSONCombinedORM/mean": 0.4514607787132263, "rewards/VisualizationJSONCombinedORM/std": 0.1386719048023224, "step": 2285, "train_speed(iter/s)": 0.059386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 284.125, "completions/min_length": 235.0, "epoch": 1.890818858560794, "grad_norm": 0.17866118252277374, "kl": 0.0830078125, "learning_rate": 9.052218331576878e-08, "loss": 0.0008309651166200638, "memory(GiB)": 38.05, "reward": 0.4601213037967682, "reward_std": 0.06954004615545273, "rewards/VisualizationJSONCombinedORM/mean": 0.4601213037967682, "rewards/VisualizationJSONCombinedORM/std": 0.0707465186715126, "step": 2286, "train_speed(iter/s)": 0.059374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 280.3125, "completions/min_length": 228.0, "epoch": 1.891645988420182, "grad_norm": 0.2046826332807541, "kl": 0.1055908203125, "learning_rate": 8.915990580733558e-08, "loss": 0.0010580383241176605, "memory(GiB)": 38.05, "reward": 0.3111162483692169, "reward_std": 0.03017704375088215, "rewards/VisualizationJSONCombinedORM/mean": 0.3111162483692169, "rewards/VisualizationJSONCombinedORM/std": 0.0659201368689537, "step": 2287, "train_speed(iter/s)": 0.059358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 272.75, "completions/min_length": 227.0, "epoch": 1.89247311827957, "grad_norm": 0.2154131531715393, "kl": 0.12890625, "learning_rate": 8.780786447340095e-08, "loss": 0.0012922286987304688, "memory(GiB)": 38.05, "reward": 0.39133813977241516, "reward_std": 0.09329067915678024, "rewards/VisualizationJSONCombinedORM/mean": 0.39133813977241516, "rewards/VisualizationJSONCombinedORM/std": 0.1613660305738449, "step": 2288, "train_speed(iter/s)": 0.059348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 248.6875, "completions/min_length": 196.0, "epoch": 1.8933002481389578, "grad_norm": 0.20797069370746613, "kl": 0.1282958984375, "learning_rate": 8.646606213216724e-08, "loss": 0.0012836605310440063, "memory(GiB)": 38.05, "reward": 0.5408319234848022, "reward_std": 0.08260355144739151, "rewards/VisualizationJSONCombinedORM/mean": 0.5408319234848022, "rewards/VisualizationJSONCombinedORM/std": 0.20455574989318848, "step": 2289, "train_speed(iter/s)": 0.059338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 289.8125, "completions/min_length": 227.0, "epoch": 1.8941273779983456, "grad_norm": 0.20627151429653168, "kl": 0.0792236328125, "learning_rate": 8.513450158049109e-08, "loss": 0.000794287770986557, "memory(GiB)": 38.05, "reward": 0.4278106093406677, "reward_std": 0.07843253016471863, "rewards/VisualizationJSONCombinedORM/mean": 0.4278106093406677, "rewards/VisualizationJSONCombinedORM/std": 0.09190420061349869, "step": 2290, "train_speed(iter/s)": 0.059322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 255.0625, "completions/min_length": 203.0, "epoch": 1.8949545078577337, "grad_norm": 0.19388997554779053, "kl": 0.07086181640625, "learning_rate": 8.381318559388341e-08, "loss": 0.0007083714008331299, "memory(GiB)": 38.05, "reward": 0.4645489454269409, "reward_std": 0.07252275943756104, "rewards/VisualizationJSONCombinedORM/mean": 0.4645489454269409, "rewards/VisualizationJSONCombinedORM/std": 0.07187530398368835, "step": 2291, "train_speed(iter/s)": 0.059311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 278.3125, "completions/min_length": 222.0, "epoch": 1.8957816377171217, "grad_norm": 0.1838897168636322, "kl": 0.039520263671875, "learning_rate": 8.250211692650001e-08, "loss": 0.0003946647047996521, "memory(GiB)": 38.05, "reward": 0.7389494776725769, "reward_std": 0.0673259049654007, "rewards/VisualizationJSONCombinedORM/mean": 0.7389494776725769, "rewards/VisualizationJSONCombinedORM/std": 0.07929103821516037, "step": 2292, "train_speed(iter/s)": 0.059298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 261.0, "completions/min_length": 199.0, "epoch": 1.8966087675765095, "grad_norm": 0.22397184371948242, "kl": 0.05322265625, "learning_rate": 8.120129831113766e-08, "loss": 0.0005325861275196075, "memory(GiB)": 38.05, "reward": 0.45643967390060425, "reward_std": 0.050154443830251694, "rewards/VisualizationJSONCombinedORM/mean": 0.45643967390060425, "rewards/VisualizationJSONCombinedORM/std": 0.14501529932022095, "step": 2293, "train_speed(iter/s)": 0.059285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 268.875, "completions/min_length": 223.0, "epoch": 1.8974358974358974, "grad_norm": 0.16571128368377686, "kl": 0.077392578125, "learning_rate": 7.991073245922798e-08, "loss": 0.0007714703679084778, "memory(GiB)": 38.05, "reward": 0.6341631412506104, "reward_std": 0.11484123766422272, "rewards/VisualizationJSONCombinedORM/mean": 0.6341631412506104, "rewards/VisualizationJSONCombinedORM/std": 0.1511552929878235, "step": 2294, "train_speed(iter/s)": 0.059279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 268.9375, "completions/min_length": 212.0, "epoch": 1.8982630272952854, "grad_norm": 0.1805010885000229, "kl": 0.036224365234375, "learning_rate": 7.863042206083138e-08, "loss": 0.0003623887896537781, "memory(GiB)": 38.05, "reward": 0.4798411428928375, "reward_std": 0.0625496581196785, "rewards/VisualizationJSONCombinedORM/mean": 0.4798411428928375, "rewards/VisualizationJSONCombinedORM/std": 0.0982777401804924, "step": 2295, "train_speed(iter/s)": 0.059267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 298.8125, "completions/min_length": 249.0, "epoch": 1.8990901571546734, "grad_norm": 0.22729359567165375, "kl": 0.05364990234375, "learning_rate": 7.736036978463202e-08, "loss": 0.0005360748618841171, "memory(GiB)": 38.05, "reward": 0.6055132150650024, "reward_std": 0.07528592646121979, "rewards/VisualizationJSONCombinedORM/mean": 0.6055132150650024, "rewards/VisualizationJSONCombinedORM/std": 0.1789688766002655, "step": 2296, "train_speed(iter/s)": 0.059255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 276.5, "completions/min_length": 237.0, "epoch": 1.8999172870140613, "grad_norm": 0.16403736174106598, "kl": 0.13372802734375, "learning_rate": 7.610057827793227e-08, "loss": 0.001337248831987381, "memory(GiB)": 38.05, "reward": 0.5148436427116394, "reward_std": 0.07088533043861389, "rewards/VisualizationJSONCombinedORM/mean": 0.5148436427116394, "rewards/VisualizationJSONCombinedORM/std": 0.12815400958061218, "step": 2297, "train_speed(iter/s)": 0.059246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 264.8125, "completions/min_length": 212.0, "epoch": 1.900744416873449, "grad_norm": 0.16912879049777985, "kl": 0.0675048828125, "learning_rate": 7.485105016664551e-08, "loss": 0.0006754044443368912, "memory(GiB)": 38.05, "reward": 0.3419354557991028, "reward_std": 0.031178541481494904, "rewards/VisualizationJSONCombinedORM/mean": 0.3419354557991028, "rewards/VisualizationJSONCombinedORM/std": 0.03645795211195946, "step": 2298, "train_speed(iter/s)": 0.059235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 267.375, "completions/min_length": 214.0, "epoch": 1.901571546732837, "grad_norm": 0.19990956783294678, "kl": 0.0645751953125, "learning_rate": 7.36117880552939e-08, "loss": 0.0006446130573749542, "memory(GiB)": 38.05, "reward": 0.3651916980743408, "reward_std": 0.027042077854275703, "rewards/VisualizationJSONCombinedORM/mean": 0.3651916980743408, "rewards/VisualizationJSONCombinedORM/std": 0.061008308082818985, "step": 2299, "train_speed(iter/s)": 0.059214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 254.875, "completions/min_length": 204.0, "epoch": 1.902398676592225, "grad_norm": 0.15265300869941711, "kl": 0.05792236328125, "learning_rate": 7.238279452700004e-08, "loss": 0.0005802586674690247, "memory(GiB)": 38.05, "reward": 0.5868304967880249, "reward_std": 0.08165772259235382, "rewards/VisualizationJSONCombinedORM/mean": 0.5868304967880249, "rewards/VisualizationJSONCombinedORM/std": 0.1431538164615631, "step": 2300, "train_speed(iter/s)": 0.059206 }, { "epoch": 1.902398676592225, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 325.9166666666667, "eval_completions/mean_length": 268.6875, "eval_completions/min_length": 230.41666666666666, "eval_kl": 0.06993611653645833, "eval_loss": 0.0007053737645037472, "eval_reward": 0.46083726299305755, "eval_reward_std": 0.06604069125993799, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46083726299305755, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06604069052264094, "eval_runtime": 287.7056, "eval_samples_per_second": 0.083, "eval_steps_per_second": 0.01, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 301.4375, "completions/min_length": 227.0, "epoch": 1.903225806451613, "grad_norm": 0.20444129407405853, "kl": 0.05535888671875, "learning_rate": 7.116407214348253e-08, "loss": 0.0005531832575798035, "memory(GiB)": 38.05, "reward": 0.6320116519927979, "reward_std": 0.06354403495788574, "rewards/VisualizationJSONCombinedORM/mean": 0.6320116519927979, "rewards/VisualizationJSONCombinedORM/std": 0.06899935752153397, "step": 2301, "train_speed(iter/s)": 0.058757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 272.0625, "completions/min_length": 221.0, "epoch": 1.9040529363110008, "grad_norm": 0.1871708184480667, "kl": 0.0640869140625, "learning_rate": 6.995562344505213e-08, "loss": 0.0006417557597160339, "memory(GiB)": 38.05, "reward": 0.6978557705879211, "reward_std": 0.07577942311763763, "rewards/VisualizationJSONCombinedORM/mean": 0.6978557705879211, "rewards/VisualizationJSONCombinedORM/std": 0.08835356682538986, "step": 2302, "train_speed(iter/s)": 0.058745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 282.875, "completions/min_length": 224.0, "epoch": 1.9048800661703886, "grad_norm": 0.20200130343437195, "kl": 0.09619140625, "learning_rate": 6.875745095060337e-08, "loss": 0.0009631738066673279, "memory(GiB)": 38.05, "reward": 0.4206031262874603, "reward_std": 0.07887721806764603, "rewards/VisualizationJSONCombinedORM/mean": 0.4206031262874603, "rewards/VisualizationJSONCombinedORM/std": 0.13375574350357056, "step": 2303, "train_speed(iter/s)": 0.058729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 296.0625, "completions/min_length": 206.0, "epoch": 1.9057071960297767, "grad_norm": 0.1962035447359085, "kl": 0.0550537109375, "learning_rate": 6.756955715761127e-08, "loss": 0.0005512498319149017, "memory(GiB)": 38.05, "reward": 0.6206499338150024, "reward_std": 0.08471578359603882, "rewards/VisualizationJSONCombinedORM/mean": 0.6206499338150024, "rewards/VisualizationJSONCombinedORM/std": 0.10277038812637329, "step": 2304, "train_speed(iter/s)": 0.05871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 281.4375, "completions/min_length": 213.0, "epoch": 1.9065343258891647, "grad_norm": 0.18422649800777435, "kl": 0.06787109375, "learning_rate": 6.639194454212738e-08, "loss": 0.0006787516176700592, "memory(GiB)": 38.05, "reward": 0.5879354476928711, "reward_std": 0.08836311101913452, "rewards/VisualizationJSONCombinedORM/mean": 0.5879354476928711, "rewards/VisualizationJSONCombinedORM/std": 0.17071059346199036, "step": 2305, "train_speed(iter/s)": 0.058696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 257.5625, "completions/min_length": 216.0, "epoch": 1.9073614557485525, "grad_norm": 0.18031857907772064, "kl": 0.04937744140625, "learning_rate": 6.522461555877213e-08, "loss": 0.0004930831491947174, "memory(GiB)": 38.05, "reward": 0.3729536235332489, "reward_std": 0.04929642379283905, "rewards/VisualizationJSONCombinedORM/mean": 0.3729536235332489, "rewards/VisualizationJSONCombinedORM/std": 0.11019322276115417, "step": 2306, "train_speed(iter/s)": 0.058685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 271.0, "completions/min_length": 218.0, "epoch": 1.9081885856079404, "grad_norm": 0.3018609881401062, "kl": 0.04669189453125, "learning_rate": 6.406757264072916e-08, "loss": 0.00046739354729652405, "memory(GiB)": 38.05, "reward": 0.42602792382240295, "reward_std": 0.054471455514431, "rewards/VisualizationJSONCombinedORM/mean": 0.42602792382240295, "rewards/VisualizationJSONCombinedORM/std": 0.14383359253406525, "step": 2307, "train_speed(iter/s)": 0.058667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 279.125, "completions/min_length": 215.0, "epoch": 1.9090157154673284, "grad_norm": 0.17195692658424377, "kl": 0.06378173828125, "learning_rate": 6.292081819974427e-08, "loss": 0.0006392211653292179, "memory(GiB)": 38.05, "reward": 0.5562386512756348, "reward_std": 0.18962910771369934, "rewards/VisualizationJSONCombinedORM/mean": 0.5562386512756348, "rewards/VisualizationJSONCombinedORM/std": 0.2277889996767044, "step": 2308, "train_speed(iter/s)": 0.058646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 262.875, "completions/min_length": 217.0, "epoch": 1.9098428453267164, "grad_norm": 0.2092069834470749, "kl": 0.039794921875, "learning_rate": 6.178435462611764e-08, "loss": 0.00039830058813095093, "memory(GiB)": 38.05, "reward": 0.547630786895752, "reward_std": 0.07695372402667999, "rewards/VisualizationJSONCombinedORM/mean": 0.547630786895752, "rewards/VisualizationJSONCombinedORM/std": 0.17668280005455017, "step": 2309, "train_speed(iter/s)": 0.058636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 256.625, "completions/min_length": 197.0, "epoch": 1.9106699751861043, "grad_norm": 0.1818424016237259, "kl": 0.043853759765625, "learning_rate": 6.065818428869774e-08, "loss": 0.00043912604451179504, "memory(GiB)": 38.05, "reward": 0.5901970863342285, "reward_std": 0.06600572913885117, "rewards/VisualizationJSONCombinedORM/mean": 0.5901970863342285, "rewards/VisualizationJSONCombinedORM/std": 0.1494053155183792, "step": 2310, "train_speed(iter/s)": 0.058625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 280.125, "completions/min_length": 237.0, "epoch": 1.911497105045492, "grad_norm": 0.22641195356845856, "kl": 0.1263427734375, "learning_rate": 5.954230953487794e-08, "loss": 0.0012630671262741089, "memory(GiB)": 38.05, "reward": 0.5252863168716431, "reward_std": 0.07067497074604034, "rewards/VisualizationJSONCombinedORM/mean": 0.5252863168716431, "rewards/VisualizationJSONCombinedORM/std": 0.19143901765346527, "step": 2311, "train_speed(iter/s)": 0.058613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 265.4375, "completions/min_length": 234.0, "epoch": 1.91232423490488, "grad_norm": 0.1601228266954422, "kl": 0.05877685546875, "learning_rate": 5.843673269059269e-08, "loss": 0.0005879774689674377, "memory(GiB)": 38.05, "reward": 0.5854312181472778, "reward_std": 0.06156807392835617, "rewards/VisualizationJSONCombinedORM/mean": 0.5854312181472778, "rewards/VisualizationJSONCombinedORM/std": 0.24892295897006989, "step": 2312, "train_speed(iter/s)": 0.058604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 286.25, "completions/min_length": 227.0, "epoch": 1.913151364764268, "grad_norm": 0.16620156168937683, "kl": 0.042724609375, "learning_rate": 5.734145606031083e-08, "loss": 0.0004266202449798584, "memory(GiB)": 38.05, "reward": 0.513319194316864, "reward_std": 0.03538822382688522, "rewards/VisualizationJSONCombinedORM/mean": 0.513319194316864, "rewards/VisualizationJSONCombinedORM/std": 0.2687546908855438, "step": 2313, "train_speed(iter/s)": 0.058594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 275.4375, "completions/min_length": 197.0, "epoch": 1.913978494623656, "grad_norm": 0.16579477488994598, "kl": 0.0423583984375, "learning_rate": 5.625648192703115e-08, "loss": 0.0004244968295097351, "memory(GiB)": 38.05, "reward": 0.5130294561386108, "reward_std": 0.06448326259851456, "rewards/VisualizationJSONCombinedORM/mean": 0.5130294561386108, "rewards/VisualizationJSONCombinedORM/std": 0.21324731409549713, "step": 2314, "train_speed(iter/s)": 0.058581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 277.25, "completions/min_length": 209.0, "epoch": 1.9148056244830438, "grad_norm": 0.1578015834093094, "kl": 0.0284423828125, "learning_rate": 5.518181255227739e-08, "loss": 0.00028528645634651184, "memory(GiB)": 38.05, "reward": 0.527019202709198, "reward_std": 0.06643792986869812, "rewards/VisualizationJSONCombinedORM/mean": 0.527019202709198, "rewards/VisualizationJSONCombinedORM/std": 0.11688212305307388, "step": 2315, "train_speed(iter/s)": 0.058566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 287.9375, "completions/min_length": 219.0, "epoch": 1.9156327543424316, "grad_norm": 0.17426306009292603, "kl": 0.06451416015625, "learning_rate": 5.411745017609493e-08, "loss": 0.000646248459815979, "memory(GiB)": 38.05, "reward": 0.42680245637893677, "reward_std": 0.04760066047310829, "rewards/VisualizationJSONCombinedORM/mean": 0.42680245637893677, "rewards/VisualizationJSONCombinedORM/std": 0.2337435930967331, "step": 2316, "train_speed(iter/s)": 0.058556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 278.5625, "completions/min_length": 209.0, "epoch": 1.9164598842018197, "grad_norm": 0.19964386522769928, "kl": 0.15899658203125, "learning_rate": 5.306339701704577e-08, "loss": 0.001589260995388031, "memory(GiB)": 38.05, "reward": 0.47046294808387756, "reward_std": 0.10750484466552734, "rewards/VisualizationJSONCombinedORM/mean": 0.47046294808387756, "rewards/VisualizationJSONCombinedORM/std": 0.15684795379638672, "step": 2317, "train_speed(iter/s)": 0.058538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 254.4375, "completions/min_length": 221.0, "epoch": 1.9172870140612077, "grad_norm": 0.15297462046146393, "kl": 0.0574951171875, "learning_rate": 5.201965527220188e-08, "loss": 0.0005748001858592033, "memory(GiB)": 38.05, "reward": 0.5835774540901184, "reward_std": 0.08055290579795837, "rewards/VisualizationJSONCombinedORM/mean": 0.5835774540901184, "rewards/VisualizationJSONCombinedORM/std": 0.1349021941423416, "step": 2318, "train_speed(iter/s)": 0.058526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 277.5625, "completions/min_length": 254.0, "epoch": 1.9181141439205955, "grad_norm": 0.14238564670085907, "kl": 0.05029296875, "learning_rate": 5.098622711714241e-08, "loss": 0.0005028452724218369, "memory(GiB)": 38.05, "reward": 0.5072535872459412, "reward_std": 0.04878731817007065, "rewards/VisualizationJSONCombinedORM/mean": 0.5072535872459412, "rewards/VisualizationJSONCombinedORM/std": 0.11345630139112473, "step": 2319, "train_speed(iter/s)": 0.058509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 263.9375, "completions/min_length": 212.0, "epoch": 1.9189412737799834, "grad_norm": 0.16265852749347687, "kl": 0.04351806640625, "learning_rate": 4.996311470594928e-08, "loss": 0.0004344061017036438, "memory(GiB)": 38.05, "reward": 0.7370494604110718, "reward_std": 0.08897724002599716, "rewards/VisualizationJSONCombinedORM/mean": 0.7370494604110718, "rewards/VisualizationJSONCombinedORM/std": 0.19075611233711243, "step": 2320, "train_speed(iter/s)": 0.058497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 277.3125, "completions/min_length": 216.0, "epoch": 1.9197684036393714, "grad_norm": 0.17493540048599243, "kl": 0.0799560546875, "learning_rate": 4.895032017120216e-08, "loss": 0.0007993653416633606, "memory(GiB)": 38.05, "reward": 0.4051096439361572, "reward_std": 0.035574741661548615, "rewards/VisualizationJSONCombinedORM/mean": 0.4051096439361572, "rewards/VisualizationJSONCombinedORM/std": 0.12244455516338348, "step": 2321, "train_speed(iter/s)": 0.058479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 277.6875, "completions/min_length": 228.0, "epoch": 1.9205955334987594, "grad_norm": 0.1396241933107376, "kl": 0.04302978515625, "learning_rate": 4.794784562397459e-08, "loss": 0.000430254265666008, "memory(GiB)": 38.05, "reward": 0.6084734797477722, "reward_std": 0.07963144779205322, "rewards/VisualizationJSONCombinedORM/mean": 0.6084734797477722, "rewards/VisualizationJSONCombinedORM/std": 0.08959221839904785, "step": 2322, "train_speed(iter/s)": 0.058469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 259.25, "completions/min_length": 201.0, "epoch": 1.9214226633581473, "grad_norm": 0.17582738399505615, "kl": 0.04180908203125, "learning_rate": 4.695569315382731e-08, "loss": 0.0004189368337392807, "memory(GiB)": 38.05, "reward": 0.6107686758041382, "reward_std": 0.06736908107995987, "rewards/VisualizationJSONCombinedORM/mean": 0.6107686758041382, "rewards/VisualizationJSONCombinedORM/std": 0.1302998960018158, "step": 2323, "train_speed(iter/s)": 0.058462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 269.25, "completions/min_length": 211.0, "epoch": 1.922249793217535, "grad_norm": 0.19651374220848083, "kl": 0.080078125, "learning_rate": 4.597386482880717e-08, "loss": 0.0008008256554603577, "memory(GiB)": 38.05, "reward": 0.5804041624069214, "reward_std": 0.07806967198848724, "rewards/VisualizationJSONCombinedORM/mean": 0.5804041624069214, "rewards/VisualizationJSONCombinedORM/std": 0.1770758181810379, "step": 2324, "train_speed(iter/s)": 0.058448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 274.5, "completions/min_length": 222.0, "epoch": 1.9230769230769231, "grad_norm": 0.1681668907403946, "kl": 0.052001953125, "learning_rate": 4.500236269544101e-08, "loss": 0.0005199611186981201, "memory(GiB)": 38.05, "reward": 0.38971462845802307, "reward_std": 0.04424891620874405, "rewards/VisualizationJSONCombinedORM/mean": 0.38971462845802307, "rewards/VisualizationJSONCombinedORM/std": 0.04319370910525322, "step": 2325, "train_speed(iter/s)": 0.058436 }, { "epoch": 1.9230769230769231, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 322.2083333333333, "eval_completions/mean_length": 272.8541666666667, "eval_completions/min_length": 230.08333333333334, "eval_kl": 0.06830851236979167, "eval_loss": 0.0006887565250508487, "eval_reward": 0.4594261373082797, "eval_reward_std": 0.07761226400422554, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4594261373082797, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07761226322812338, "eval_runtime": 285.2117, "eval_samples_per_second": 0.084, "eval_steps_per_second": 0.011, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 303.3125, "completions/min_length": 244.0, "epoch": 1.923904052936311, "grad_norm": 0.21060949563980103, "kl": 0.08026123046875, "learning_rate": 4.404118877873176e-08, "loss": 0.0008015371859073639, "memory(GiB)": 38.05, "reward": 0.44048237800598145, "reward_std": 0.04724888503551483, "rewards/VisualizationJSONCombinedORM/mean": 0.44048237800598145, "rewards/VisualizationJSONCombinedORM/std": 0.1833341121673584, "step": 2326, "train_speed(iter/s)": 0.058008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 263.9375, "completions/min_length": 196.0, "epoch": 1.924731182795699, "grad_norm": 0.2060248702764511, "kl": 0.05133056640625, "learning_rate": 4.3090345082155146e-08, "loss": 0.0005131140351295471, "memory(GiB)": 38.05, "reward": 0.6077296733856201, "reward_std": 0.08866263180971146, "rewards/VisualizationJSONCombinedORM/mean": 0.6077296733856201, "rewards/VisualizationJSONCombinedORM/std": 0.09446515887975693, "step": 2327, "train_speed(iter/s)": 0.057991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 275.75, "completions/min_length": 234.0, "epoch": 1.9255583126550868, "grad_norm": 0.1813632845878601, "kl": 0.08624267578125, "learning_rate": 4.21498335876519e-08, "loss": 0.0008624857291579247, "memory(GiB)": 38.05, "reward": 0.6762663125991821, "reward_std": 0.07247208803892136, "rewards/VisualizationJSONCombinedORM/mean": 0.6762663125991821, "rewards/VisualizationJSONCombinedORM/std": 0.12784728407859802, "step": 2328, "train_speed(iter/s)": 0.057978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 284.5625, "completions/min_length": 244.0, "epoch": 1.9263854425144746, "grad_norm": 0.20274744927883148, "kl": 0.09991455078125, "learning_rate": 4.121965625562885e-08, "loss": 0.0009998492896556854, "memory(GiB)": 38.05, "reward": 0.3080975413322449, "reward_std": 0.049413420259952545, "rewards/VisualizationJSONCombinedORM/mean": 0.3080975413322449, "rewards/VisualizationJSONCombinedORM/std": 0.08176877349615097, "step": 2329, "train_speed(iter/s)": 0.057964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 260.875, "completions/min_length": 212.0, "epoch": 1.9272125723738627, "grad_norm": 0.20656634867191315, "kl": 0.07635498046875, "learning_rate": 4.029981502495117e-08, "loss": 0.0007649026811122894, "memory(GiB)": 38.05, "reward": 0.21807733178138733, "reward_std": 0.02191436104476452, "rewards/VisualizationJSONCombinedORM/mean": 0.21807733178138733, "rewards/VisualizationJSONCombinedORM/std": 0.024954749271273613, "step": 2330, "train_speed(iter/s)": 0.057956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 278.25, "completions/min_length": 244.0, "epoch": 1.9280397022332507, "grad_norm": 0.20810407400131226, "kl": 0.078857421875, "learning_rate": 3.939031181293962e-08, "loss": 0.0007878094911575317, "memory(GiB)": 38.05, "reward": 0.5803648829460144, "reward_std": 0.09588223695755005, "rewards/VisualizationJSONCombinedORM/mean": 0.5803648829460144, "rewards/VisualizationJSONCombinedORM/std": 0.09525765478610992, "step": 2331, "train_speed(iter/s)": 0.057946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 249.25, "completions/min_length": 200.0, "epoch": 1.9288668320926385, "grad_norm": 0.17244794964790344, "kl": 0.0384521484375, "learning_rate": 3.8491148515366064e-08, "loss": 0.0003845319151878357, "memory(GiB)": 38.05, "reward": 0.7160084247589111, "reward_std": 0.06898541748523712, "rewards/VisualizationJSONCombinedORM/mean": 0.7160084247589111, "rewards/VisualizationJSONCombinedORM/std": 0.07097220420837402, "step": 2332, "train_speed(iter/s)": 0.057935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 295.0625, "completions/min_length": 228.0, "epoch": 1.9296939619520264, "grad_norm": 0.16781815886497498, "kl": 0.045654296875, "learning_rate": 3.7602327006450166e-08, "loss": 0.0004570847377181053, "memory(GiB)": 38.05, "reward": 0.48865482211112976, "reward_std": 0.07357192039489746, "rewards/VisualizationJSONCombinedORM/mean": 0.48865482211112976, "rewards/VisualizationJSONCombinedORM/std": 0.2970629036426544, "step": 2333, "train_speed(iter/s)": 0.057924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 260.5625, "completions/min_length": 223.0, "epoch": 1.9305210918114144, "grad_norm": 0.17691443860530853, "kl": 0.04510498046875, "learning_rate": 3.672384913885441e-08, "loss": 0.0004510954022407532, "memory(GiB)": 38.05, "reward": 0.4666062593460083, "reward_std": 0.08066616207361221, "rewards/VisualizationJSONCombinedORM/mean": 0.4666062593460083, "rewards/VisualizationJSONCombinedORM/std": 0.07948435842990875, "step": 2334, "train_speed(iter/s)": 0.057909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 271.625, "completions/min_length": 217.0, "epoch": 1.9313482216708024, "grad_norm": 0.15252900123596191, "kl": 0.094482421875, "learning_rate": 3.585571674368238e-08, "loss": 0.0009450465440750122, "memory(GiB)": 38.05, "reward": 0.5144599676132202, "reward_std": 0.06357399374246597, "rewards/VisualizationJSONCombinedORM/mean": 0.5144599676132202, "rewards/VisualizationJSONCombinedORM/std": 0.14239831268787384, "step": 2335, "train_speed(iter/s)": 0.057902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 253.4375, "completions/min_length": 202.0, "epoch": 1.9321753515301903, "grad_norm": 0.16450610756874084, "kl": 0.030487060546875, "learning_rate": 3.499793163047327e-08, "loss": 0.0003060959279537201, "memory(GiB)": 38.05, "reward": 0.4975462555885315, "reward_std": 0.05682092905044556, "rewards/VisualizationJSONCombinedORM/mean": 0.4975462555885315, "rewards/VisualizationJSONCombinedORM/std": 0.09790799766778946, "step": 2336, "train_speed(iter/s)": 0.057888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 288.875, "completions/min_length": 239.0, "epoch": 1.933002481389578, "grad_norm": 0.17083433270454407, "kl": 0.06365966796875, "learning_rate": 3.4150495587195744e-08, "loss": 0.0006359666585922241, "memory(GiB)": 38.05, "reward": 0.375095009803772, "reward_std": 0.04059286788105965, "rewards/VisualizationJSONCombinedORM/mean": 0.375095009803772, "rewards/VisualizationJSONCombinedORM/std": 0.1521555483341217, "step": 2337, "train_speed(iter/s)": 0.057878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 269.5625, "completions/min_length": 207.0, "epoch": 1.9338296112489661, "grad_norm": 0.20126891136169434, "kl": 0.1085205078125, "learning_rate": 3.3313410380250157e-08, "loss": 0.001085132360458374, "memory(GiB)": 38.05, "reward": 0.7178294658660889, "reward_std": 0.09799878299236298, "rewards/VisualizationJSONCombinedORM/mean": 0.7178294658660889, "rewards/VisualizationJSONCombinedORM/std": 0.09952395409345627, "step": 2338, "train_speed(iter/s)": 0.057862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 273.75, "completions/min_length": 221.0, "epoch": 1.9346567411083542, "grad_norm": 0.14134176075458527, "kl": 0.053955078125, "learning_rate": 3.248667775446024e-08, "loss": 0.0005390159785747528, "memory(GiB)": 38.05, "reward": 0.5191289186477661, "reward_std": 0.07453469187021255, "rewards/VisualizationJSONCombinedORM/mean": 0.5191289186477661, "rewards/VisualizationJSONCombinedORM/std": 0.09855175763368607, "step": 2339, "train_speed(iter/s)": 0.057852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 251.75, "completions/min_length": 223.0, "epoch": 1.935483870967742, "grad_norm": 0.2190822958946228, "kl": 0.0467529296875, "learning_rate": 3.1670299433070315e-08, "loss": 0.00046771392226219177, "memory(GiB)": 38.05, "reward": 0.5171604156494141, "reward_std": 0.10043346881866455, "rewards/VisualizationJSONCombinedORM/mean": 0.5171604156494141, "rewards/VisualizationJSONCombinedORM/std": 0.2074173390865326, "step": 2340, "train_speed(iter/s)": 0.05785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/mean_length": 241.8125, "completions/min_length": 216.0, "epoch": 1.9363110008271298, "grad_norm": 0.14925365149974823, "kl": 0.0341796875, "learning_rate": 3.086427711774309e-08, "loss": 0.00034118443727493286, "memory(GiB)": 38.05, "reward": 0.5988193154335022, "reward_std": 0.08528740704059601, "rewards/VisualizationJSONCombinedORM/mean": 0.5988193154335022, "rewards/VisualizationJSONCombinedORM/std": 0.22590932250022888, "step": 2341, "train_speed(iter/s)": 0.057842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 268.3125, "completions/min_length": 226.0, "epoch": 1.9371381306865176, "grad_norm": 0.1250004768371582, "kl": 0.026824951171875, "learning_rate": 3.0068612488554084e-08, "loss": 0.0002683475613594055, "memory(GiB)": 38.05, "reward": 0.5688409805297852, "reward_std": 0.04615149274468422, "rewards/VisualizationJSONCombinedORM/mean": 0.5688409805297852, "rewards/VisualizationJSONCombinedORM/std": 0.16759242117404938, "step": 2342, "train_speed(iter/s)": 0.05783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 266.75, "completions/min_length": 208.0, "epoch": 1.9379652605459057, "grad_norm": 0.20313400030136108, "kl": 0.0400390625, "learning_rate": 2.9283307203989975e-08, "loss": 0.00039937347173690796, "memory(GiB)": 38.05, "reward": 0.5544989109039307, "reward_std": 0.046793483197689056, "rewards/VisualizationJSONCombinedORM/mean": 0.5544989109039307, "rewards/VisualizationJSONCombinedORM/std": 0.2473195642232895, "step": 2343, "train_speed(iter/s)": 0.057812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 270.875, "completions/min_length": 228.0, "epoch": 1.9387923904052937, "grad_norm": 0.15768983960151672, "kl": 0.0650634765625, "learning_rate": 2.850836290094472e-08, "loss": 0.0006492435932159424, "memory(GiB)": 38.05, "reward": 0.6547577977180481, "reward_std": 0.07943964004516602, "rewards/VisualizationJSONCombinedORM/mean": 0.6547577977180481, "rewards/VisualizationJSONCombinedORM/std": 0.08322588354349136, "step": 2344, "train_speed(iter/s)": 0.057796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 267.75, "completions/min_length": 200.0, "epoch": 1.9396195202646815, "grad_norm": 0.1889035701751709, "kl": 0.04901123046875, "learning_rate": 2.7743781194714547e-08, "loss": 0.0004894472658634186, "memory(GiB)": 38.05, "reward": 0.3668251037597656, "reward_std": 0.037075649946928024, "rewards/VisualizationJSONCombinedORM/mean": 0.3668251037597656, "rewards/VisualizationJSONCombinedORM/std": 0.20976772904396057, "step": 2345, "train_speed(iter/s)": 0.057785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 269.8125, "completions/min_length": 195.0, "epoch": 1.9404466501240694, "grad_norm": 0.16260096430778503, "kl": 0.038177490234375, "learning_rate": 2.6989563678996856e-08, "loss": 0.0003815963864326477, "memory(GiB)": 38.05, "reward": 0.7015897631645203, "reward_std": 0.08943898975849152, "rewards/VisualizationJSONCombinedORM/mean": 0.7015897631645203, "rewards/VisualizationJSONCombinedORM/std": 0.08856548368930817, "step": 2346, "train_speed(iter/s)": 0.057774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 262.0625, "completions/min_length": 188.0, "epoch": 1.9412737799834574, "grad_norm": 0.1985551118850708, "kl": 0.057861328125, "learning_rate": 2.6245711925885765e-08, "loss": 0.0005782544612884521, "memory(GiB)": 38.05, "reward": 0.2878842353820801, "reward_std": 0.03142567723989487, "rewards/VisualizationJSONCombinedORM/mean": 0.2878842353820801, "rewards/VisualizationJSONCombinedORM/std": 0.08168967068195343, "step": 2347, "train_speed(iter/s)": 0.057758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 257.75, "completions/min_length": 207.0, "epoch": 1.9421009098428454, "grad_norm": 0.17662166059017181, "kl": 0.117431640625, "learning_rate": 2.551222748586879e-08, "loss": 0.0011722743511199951, "memory(GiB)": 38.05, "reward": 0.525178849697113, "reward_std": 0.08643540740013123, "rewards/VisualizationJSONCombinedORM/mean": 0.525178849697113, "rewards/VisualizationJSONCombinedORM/std": 0.13258428871631622, "step": 2348, "train_speed(iter/s)": 0.057753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 274.5625, "completions/min_length": 210.0, "epoch": 1.9429280397022333, "grad_norm": 0.2274446189403534, "kl": 0.061279296875, "learning_rate": 2.4789111887823513e-08, "loss": 0.0006130263209342957, "memory(GiB)": 38.05, "reward": 0.5438551902770996, "reward_std": 0.07701513171195984, "rewards/VisualizationJSONCombinedORM/mean": 0.5438551902770996, "rewards/VisualizationJSONCombinedORM/std": 0.13756249845027924, "step": 2349, "train_speed(iter/s)": 0.057744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 262.625, "completions/min_length": 214.0, "epoch": 1.943755169561621, "grad_norm": 0.18563099205493927, "kl": 0.083984375, "learning_rate": 2.4076366639015914e-08, "loss": 0.0008392520248889923, "memory(GiB)": 38.05, "reward": 0.4544029235839844, "reward_std": 0.08110496401786804, "rewards/VisualizationJSONCombinedORM/mean": 0.4544029235839844, "rewards/VisualizationJSONCombinedORM/std": 0.14079542458057404, "step": 2350, "train_speed(iter/s)": 0.057729 }, { "epoch": 1.943755169561621, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 324.9166666666667, "eval_completions/mean_length": 269.0208333333333, "eval_completions/min_length": 229.08333333333334, "eval_kl": 0.071075439453125, "eval_loss": 0.000708719075191766, "eval_reward": 0.4685132609059413, "eval_reward_std": 0.06669424392748624, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4685132609059413, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06669424745875101, "eval_runtime": 287.0285, "eval_samples_per_second": 0.084, "eval_steps_per_second": 0.01, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 273.5625, "completions/min_length": 199.0, "epoch": 1.9445822994210091, "grad_norm": 0.17445781826972961, "kl": 0.105224609375, "learning_rate": 2.337399322509315e-08, "loss": 0.0010527148842811584, "memory(GiB)": 38.05, "reward": 0.4366169571876526, "reward_std": 0.05607353150844574, "rewards/VisualizationJSONCombinedORM/mean": 0.4366169571876526, "rewards/VisualizationJSONCombinedORM/std": 0.17123889923095703, "step": 2351, "train_speed(iter/s)": 0.057314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 256.5, "completions/min_length": 229.0, "epoch": 1.9454094292803972, "grad_norm": 0.18736609816551208, "kl": 0.0526123046875, "learning_rate": 2.26819931100869e-08, "loss": 0.0005263723433017731, "memory(GiB)": 38.05, "reward": 0.4743180274963379, "reward_std": 0.08173859119415283, "rewards/VisualizationJSONCombinedORM/mean": 0.4743180274963379, "rewards/VisualizationJSONCombinedORM/std": 0.08395560830831528, "step": 2352, "train_speed(iter/s)": 0.057309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 296.375, "completions/min_length": 238.0, "epoch": 1.946236559139785, "grad_norm": 0.17433317005634308, "kl": 0.062744140625, "learning_rate": 2.2000367736403906e-08, "loss": 0.0006284192204475403, "memory(GiB)": 38.05, "reward": 0.5108214020729065, "reward_std": 0.0619797557592392, "rewards/VisualizationJSONCombinedORM/mean": 0.5108214020729065, "rewards/VisualizationJSONCombinedORM/std": 0.21918199956417084, "step": 2353, "train_speed(iter/s)": 0.057295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 256.5625, "completions/min_length": 203.0, "epoch": 1.9470636889991728, "grad_norm": 0.21785572171211243, "kl": 0.06378173828125, "learning_rate": 2.1329118524827662e-08, "loss": 0.0006383880972862244, "memory(GiB)": 38.05, "reward": 0.4914397597312927, "reward_std": 0.08989237248897552, "rewards/VisualizationJSONCombinedORM/mean": 0.4914397597312927, "rewards/VisualizationJSONCombinedORM/std": 0.20421814918518066, "step": 2354, "train_speed(iter/s)": 0.057288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 249.5, "completions/min_length": 192.0, "epoch": 1.9478908188585606, "grad_norm": 0.20829814672470093, "kl": 0.06640625, "learning_rate": 2.0668246874511744e-08, "loss": 0.000663071870803833, "memory(GiB)": 38.05, "reward": 0.43573182821273804, "reward_std": 0.0641949474811554, "rewards/VisualizationJSONCombinedORM/mean": 0.43573182821273804, "rewards/VisualizationJSONCombinedORM/std": 0.15409667789936066, "step": 2355, "train_speed(iter/s)": 0.057279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 268.6875, "completions/min_length": 206.0, "epoch": 1.9487179487179487, "grad_norm": 0.18319271504878998, "kl": 0.0576171875, "learning_rate": 2.0017754162979795e-08, "loss": 0.0005775578320026398, "memory(GiB)": 38.05, "reward": 0.6392574310302734, "reward_std": 0.055759966373443604, "rewards/VisualizationJSONCombinedORM/mean": 0.6392574310302734, "rewards/VisualizationJSONCombinedORM/std": 0.14739395678043365, "step": 2356, "train_speed(iter/s)": 0.057268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 248.9375, "completions/min_length": 187.0, "epoch": 1.9495450785773367, "grad_norm": 0.20237913727760315, "kl": 0.04638671875, "learning_rate": 1.937764174612222e-08, "loss": 0.0004630088806152344, "memory(GiB)": 38.05, "reward": 0.5874282121658325, "reward_std": 0.05684936046600342, "rewards/VisualizationJSONCombinedORM/mean": 0.5874282121658325, "rewards/VisualizationJSONCombinedORM/std": 0.1972658336162567, "step": 2357, "train_speed(iter/s)": 0.057262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 273.875, "completions/min_length": 228.0, "epoch": 1.9503722084367245, "grad_norm": 0.15317881107330322, "kl": 0.031402587890625, "learning_rate": 1.8747910958191173e-08, "loss": 0.00031402893364429474, "memory(GiB)": 38.05, "reward": 0.6578925251960754, "reward_std": 0.06195998936891556, "rewards/VisualizationJSONCombinedORM/mean": 0.6578925251960754, "rewards/VisualizationJSONCombinedORM/std": 0.13028044998645782, "step": 2358, "train_speed(iter/s)": 0.057252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 255.5625, "completions/min_length": 219.0, "epoch": 1.9511993382961124, "grad_norm": 0.18398714065551758, "kl": 0.0557861328125, "learning_rate": 1.81285631118e-08, "loss": 0.000559050589799881, "memory(GiB)": 38.05, "reward": 0.47842997312545776, "reward_std": 0.060048073530197144, "rewards/VisualizationJSONCombinedORM/mean": 0.47842997312545776, "rewards/VisualizationJSONCombinedORM/std": 0.05820869281888008, "step": 2359, "train_speed(iter/s)": 0.057248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 269.625, "completions/min_length": 199.0, "epoch": 1.9520264681555004, "grad_norm": 0.1411578208208084, "kl": 0.05340576171875, "learning_rate": 1.7519599497919926e-08, "loss": 0.0005343165248632431, "memory(GiB)": 38.05, "reward": 0.45664340257644653, "reward_std": 0.06448192894458771, "rewards/VisualizationJSONCombinedORM/mean": 0.45664340257644653, "rewards/VisualizationJSONCombinedORM/std": 0.07369408756494522, "step": 2360, "train_speed(iter/s)": 0.057236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 266.0625, "completions/min_length": 199.0, "epoch": 1.9528535980148884, "grad_norm": 0.1711176484823227, "kl": 0.04443359375, "learning_rate": 1.6921021385877258e-08, "loss": 0.0004448499530553818, "memory(GiB)": 38.05, "reward": 0.7527689933776855, "reward_std": 0.048588477075099945, "rewards/VisualizationJSONCombinedORM/mean": 0.7527689933776855, "rewards/VisualizationJSONCombinedORM/std": 0.053961366415023804, "step": 2361, "train_speed(iter/s)": 0.057228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 265.25, "completions/min_length": 227.0, "epoch": 1.9536807278742763, "grad_norm": 0.18326522409915924, "kl": 0.07196044921875, "learning_rate": 1.6332830023350065e-08, "loss": 0.000720662996172905, "memory(GiB)": 38.05, "reward": 0.4908476173877716, "reward_std": 0.062053412199020386, "rewards/VisualizationJSONCombinedORM/mean": 0.4908476173877716, "rewards/VisualizationJSONCombinedORM/std": 0.23778851330280304, "step": 2362, "train_speed(iter/s)": 0.057225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 256.25, "completions/min_length": 224.0, "epoch": 1.954507857733664, "grad_norm": 0.16476315259933472, "kl": 0.0592041015625, "learning_rate": 1.575502663636763e-08, "loss": 0.0005914643406867981, "memory(GiB)": 38.05, "reward": 0.5272408127784729, "reward_std": 0.08212242275476456, "rewards/VisualizationJSONCombinedORM/mean": 0.5272408127784729, "rewards/VisualizationJSONCombinedORM/std": 0.21969550848007202, "step": 2363, "train_speed(iter/s)": 0.05721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 265.1875, "completions/min_length": 224.0, "epoch": 1.9553349875930521, "grad_norm": 0.1998927891254425, "kl": 0.078369140625, "learning_rate": 1.5187612429304887e-08, "loss": 0.0007848069071769714, "memory(GiB)": 38.05, "reward": 0.5825997591018677, "reward_std": 0.04851383715867996, "rewards/VisualizationJSONCombinedORM/mean": 0.5825997591018677, "rewards/VisualizationJSONCombinedORM/std": 0.1628238707780838, "step": 2364, "train_speed(iter/s)": 0.057206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 294.1875, "completions/min_length": 216.0, "epoch": 1.9561621174524402, "grad_norm": 0.22910092771053314, "kl": 0.0872802734375, "learning_rate": 1.463058858488242e-08, "loss": 0.0008724629878997803, "memory(GiB)": 38.05, "reward": 0.7119636535644531, "reward_std": 0.1043279618024826, "rewards/VisualizationJSONCombinedORM/mean": 0.7119636535644531, "rewards/VisualizationJSONCombinedORM/std": 0.10271265357732773, "step": 2365, "train_speed(iter/s)": 0.057198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 253.6875, "completions/min_length": 216.0, "epoch": 1.956989247311828, "grad_norm": 0.19690175354480743, "kl": 0.14111328125, "learning_rate": 1.408395626416259e-08, "loss": 0.0014118999242782593, "memory(GiB)": 38.05, "reward": 0.551193356513977, "reward_std": 0.06045833230018616, "rewards/VisualizationJSONCombinedORM/mean": 0.551193356513977, "rewards/VisualizationJSONCombinedORM/std": 0.21321086585521698, "step": 2366, "train_speed(iter/s)": 0.057185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 260.875, "completions/min_length": 233.0, "epoch": 1.9578163771712158, "grad_norm": 0.17011259496212006, "kl": 0.144775390625, "learning_rate": 1.3547716606548967e-08, "loss": 0.0014483407139778137, "memory(GiB)": 38.05, "reward": 0.5078009366989136, "reward_std": 0.05283939838409424, "rewards/VisualizationJSONCombinedORM/mean": 0.5078009366989136, "rewards/VisualizationJSONCombinedORM/std": 0.2241009622812271, "step": 2367, "train_speed(iter/s)": 0.057175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 269.25, "completions/min_length": 212.0, "epoch": 1.9586435070306039, "grad_norm": 0.16991941630840302, "kl": 0.04229736328125, "learning_rate": 1.3021870729780783e-08, "loss": 0.00042219460010528564, "memory(GiB)": 38.05, "reward": 0.5897572040557861, "reward_std": 0.08903220295906067, "rewards/VisualizationJSONCombinedORM/mean": 0.5897572040557861, "rewards/VisualizationJSONCombinedORM/std": 0.10186570137739182, "step": 2368, "train_speed(iter/s)": 0.057164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 273.4375, "completions/min_length": 208.0, "epoch": 1.9594706368899917, "grad_norm": 0.22475259006023407, "kl": 0.135986328125, "learning_rate": 1.2506419729933494e-08, "loss": 0.0013602972030639648, "memory(GiB)": 38.05, "reward": 0.19900372624397278, "reward_std": 0.025324033573269844, "rewards/VisualizationJSONCombinedORM/mean": 0.19900372624397278, "rewards/VisualizationJSONCombinedORM/std": 0.036284416913986206, "step": 2369, "train_speed(iter/s)": 0.057155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 286.0625, "completions/min_length": 206.0, "epoch": 1.9602977667493797, "grad_norm": 0.18909405171871185, "kl": 0.0848388671875, "learning_rate": 1.200136468141544e-08, "loss": 0.000848330557346344, "memory(GiB)": 38.05, "reward": 0.5813637971878052, "reward_std": 0.08239474147558212, "rewards/VisualizationJSONCombinedORM/mean": 0.5813637971878052, "rewards/VisualizationJSONCombinedORM/std": 0.09999528527259827, "step": 2370, "train_speed(iter/s)": 0.05714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 294.1875, "completions/min_length": 243.0, "epoch": 1.9611248966087675, "grad_norm": 0.27384793758392334, "kl": 0.05206298828125, "learning_rate": 1.1506706636964515e-08, "loss": 0.0005216281861066818, "memory(GiB)": 38.05, "reward": 0.5391062498092651, "reward_std": 0.11343047022819519, "rewards/VisualizationJSONCombinedORM/mean": 0.5391062498092651, "rewards/VisualizationJSONCombinedORM/std": 0.15473490953445435, "step": 2371, "train_speed(iter/s)": 0.057135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 271.9375, "completions/min_length": 227.0, "epoch": 1.9619520264681554, "grad_norm": 0.18770888447761536, "kl": 0.06646728515625, "learning_rate": 1.1022446627649286e-08, "loss": 0.0006632674485445023, "memory(GiB)": 38.05, "reward": 0.4813085198402405, "reward_std": 0.07440851628780365, "rewards/VisualizationJSONCombinedORM/mean": 0.4813085198402405, "rewards/VisualizationJSONCombinedORM/std": 0.1369825154542923, "step": 2372, "train_speed(iter/s)": 0.057121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 274.625, "completions/min_length": 224.0, "epoch": 1.9627791563275434, "grad_norm": 0.15452846884727478, "kl": 0.03314208984375, "learning_rate": 1.0548585662861765e-08, "loss": 0.0003315173089504242, "memory(GiB)": 38.05, "reward": 0.4108208119869232, "reward_std": 0.03281707316637039, "rewards/VisualizationJSONCombinedORM/mean": 0.4108208119869232, "rewards/VisualizationJSONCombinedORM/std": 0.11886871606111526, "step": 2373, "train_speed(iter/s)": 0.05711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 275.25, "completions/min_length": 217.0, "epoch": 1.9636062861869314, "grad_norm": 0.16685383021831512, "kl": 0.0576171875, "learning_rate": 1.008512473032075e-08, "loss": 0.0005756691098213196, "memory(GiB)": 38.05, "reward": 0.5833747386932373, "reward_std": 0.0609164834022522, "rewards/VisualizationJSONCombinedORM/mean": 0.5833747386932373, "rewards/VisualizationJSONCombinedORM/std": 0.15251269936561584, "step": 2374, "train_speed(iter/s)": 0.057095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 294.125, "completions/min_length": 232.0, "epoch": 1.9644334160463193, "grad_norm": 0.18947233259677887, "kl": 0.0902099609375, "learning_rate": 9.632064796065155e-09, "loss": 0.0009010210633277893, "memory(GiB)": 38.05, "reward": 0.6473175883293152, "reward_std": 0.06082955002784729, "rewards/VisualizationJSONCombinedORM/mean": 0.6473175883293152, "rewards/VisualizationJSONCombinedORM/std": 0.08586341887712479, "step": 2375, "train_speed(iter/s)": 0.057082 }, { "epoch": 1.9644334160463193, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 318.2916666666667, "eval_completions/mean_length": 269.7395833333333, "eval_completions/min_length": 235.25, "eval_kl": 0.06747945149739583, "eval_loss": 0.0006793935899622738, "eval_reward": 0.46338904028137523, "eval_reward_std": 0.06706458423286676, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46338904028137523, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0670645876089111, "eval_runtime": 283.7011, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.011, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 252.9375, "completions/min_length": 211.0, "epoch": 1.965260545905707, "grad_norm": 0.19747516512870789, "kl": 0.05242919921875, "learning_rate": 9.18940680445568e-09, "loss": 0.0005229730159044266, "memory(GiB)": 38.05, "reward": 0.4561063349246979, "reward_std": 0.07799287140369415, "rewards/VisualizationJSONCombinedORM/mean": 0.4561063349246979, "rewards/VisualizationJSONCombinedORM/std": 0.12089159339666367, "step": 2376, "train_speed(iter/s)": 0.056688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 256.5, "completions/min_length": 215.0, "epoch": 1.9660876757650951, "grad_norm": 0.1894332766532898, "kl": 0.2572021484375, "learning_rate": 8.757151678169818e-09, "loss": 0.0025643855333328247, "memory(GiB)": 38.05, "reward": 0.44847798347473145, "reward_std": 0.06705427169799805, "rewards/VisualizationJSONCombinedORM/mean": 0.44847798347473145, "rewards/VisualizationJSONCombinedORM/std": 0.12226265668869019, "step": 2377, "train_speed(iter/s)": 0.056671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 245.625, "completions/min_length": 210.0, "epoch": 1.9669148056244832, "grad_norm": 0.17243582010269165, "kl": 0.0914306640625, "learning_rate": 8.335300318201844e-09, "loss": 0.0009149378165602684, "memory(GiB)": 38.05, "reward": 0.3763636350631714, "reward_std": 0.05967964977025986, "rewards/VisualizationJSONCombinedORM/mean": 0.3763636350631714, "rewards/VisualizationJSONCombinedORM/std": 0.0970020443201065, "step": 2378, "train_speed(iter/s)": 0.056655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 286.0625, "completions/min_length": 218.0, "epoch": 1.967741935483871, "grad_norm": 0.23691608011722565, "kl": 0.0538330078125, "learning_rate": 7.923853603861165e-09, "loss": 0.0005381181836128235, "memory(GiB)": 38.05, "reward": 0.294289767742157, "reward_std": 0.04362455755472183, "rewards/VisualizationJSONCombinedORM/mean": 0.294289767742157, "rewards/VisualizationJSONCombinedORM/std": 0.11644890904426575, "step": 2379, "train_speed(iter/s)": 0.056646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 266.625, "completions/min_length": 198.0, "epoch": 1.9685690653432588, "grad_norm": 0.1890048086643219, "kl": 0.031341552734375, "learning_rate": 7.52281239276842e-09, "loss": 0.00031340355053544044, "memory(GiB)": 38.05, "reward": 0.7085884809494019, "reward_std": 0.054101213812828064, "rewards/VisualizationJSONCombinedORM/mean": 0.7085884809494019, "rewards/VisualizationJSONCombinedORM/std": 0.07214237004518509, "step": 2380, "train_speed(iter/s)": 0.056634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 249.5625, "completions/min_length": 220.0, "epoch": 1.9693961952026469, "grad_norm": 0.1710379421710968, "kl": 0.0845947265625, "learning_rate": 7.132177520854932e-09, "loss": 0.0008459873497486115, "memory(GiB)": 38.05, "reward": 0.5098147392272949, "reward_std": 0.08616974949836731, "rewards/VisualizationJSONCombinedORM/mean": 0.5098147392272949, "rewards/VisualizationJSONCombinedORM/std": 0.1015697792172432, "step": 2381, "train_speed(iter/s)": 0.056627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 268.5625, "completions/min_length": 210.0, "epoch": 1.970223325062035, "grad_norm": 0.22187602519989014, "kl": 0.0963134765625, "learning_rate": 6.751949802362712e-09, "loss": 0.0009612515568733215, "memory(GiB)": 38.05, "reward": 0.49201977252960205, "reward_std": 0.07065688073635101, "rewards/VisualizationJSONCombinedORM/mean": 0.49201977252960205, "rewards/VisualizationJSONCombinedORM/std": 0.20752058923244476, "step": 2382, "train_speed(iter/s)": 0.056617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 277.4375, "completions/min_length": 204.0, "epoch": 1.9710504549214227, "grad_norm": 0.1837093085050583, "kl": 0.09429931640625, "learning_rate": 6.382130029838896e-09, "loss": 0.0009439587593078613, "memory(GiB)": 38.05, "reward": 0.41977739334106445, "reward_std": 0.0378243662416935, "rewards/VisualizationJSONCombinedORM/mean": 0.41977739334106445, "rewards/VisualizationJSONCombinedORM/std": 0.047033000737428665, "step": 2383, "train_speed(iter/s)": 0.05661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 279.625, "completions/min_length": 192.0, "epoch": 1.9718775847808105, "grad_norm": 0.17311114072799683, "kl": 0.04876708984375, "learning_rate": 6.022718974137976e-09, "loss": 0.0004885271191596985, "memory(GiB)": 38.05, "reward": 0.624542772769928, "reward_std": 0.08052563667297363, "rewards/VisualizationJSONCombinedORM/mean": 0.624542772769928, "rewards/VisualizationJSONCombinedORM/std": 0.10584989935159683, "step": 2384, "train_speed(iter/s)": 0.0566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 286.75, "completions/min_length": 232.0, "epoch": 1.9727047146401984, "grad_norm": 0.26458194851875305, "kl": 0.08251953125, "learning_rate": 5.673717384417354e-09, "loss": 0.0008249059319496155, "memory(GiB)": 38.05, "reward": 0.5198169946670532, "reward_std": 0.07377108931541443, "rewards/VisualizationJSONCombinedORM/mean": 0.5198169946670532, "rewards/VisualizationJSONCombinedORM/std": 0.1945219486951828, "step": 2385, "train_speed(iter/s)": 0.056586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/mean_length": 267.1875, "completions/min_length": 199.0, "epoch": 1.9735318444995864, "grad_norm": 0.2083185613155365, "kl": 0.10260009765625, "learning_rate": 5.3351259881379016e-09, "loss": 0.0010275356471538544, "memory(GiB)": 38.05, "reward": 0.4631926715373993, "reward_std": 0.06626191735267639, "rewards/VisualizationJSONCombinedORM/mean": 0.4631926715373993, "rewards/VisualizationJSONCombinedORM/std": 0.07531581819057465, "step": 2386, "train_speed(iter/s)": 0.056579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 268.75, "completions/min_length": 234.0, "epoch": 1.9743589743589745, "grad_norm": 0.2319803088903427, "kl": 0.0537109375, "learning_rate": 5.006945491060067e-09, "loss": 0.0005380995571613312, "memory(GiB)": 38.05, "reward": 0.5131397247314453, "reward_std": 0.06243719160556793, "rewards/VisualizationJSONCombinedORM/mean": 0.5131397247314453, "rewards/VisualizationJSONCombinedORM/std": 0.11909753829240799, "step": 2387, "train_speed(iter/s)": 0.056568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 288.5625, "completions/min_length": 231.0, "epoch": 1.9751861042183623, "grad_norm": 0.21364323794841766, "kl": 0.06121826171875, "learning_rate": 4.689176577244992e-09, "loss": 0.0006121024489402771, "memory(GiB)": 38.05, "reward": 0.3349017798900604, "reward_std": 0.03786837309598923, "rewards/VisualizationJSONCombinedORM/mean": 0.3349017798900604, "rewards/VisualizationJSONCombinedORM/std": 0.09128471463918686, "step": 2388, "train_speed(iter/s)": 0.056559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 263.5, "completions/min_length": 220.0, "epoch": 1.97601323407775, "grad_norm": 0.1468871682882309, "kl": 0.033294677734375, "learning_rate": 4.381819909051732e-09, "loss": 0.00033288076519966125, "memory(GiB)": 38.05, "reward": 0.5051094889640808, "reward_std": 0.057022176682949066, "rewards/VisualizationJSONCombinedORM/mean": 0.5051094889640808, "rewards/VisualizationJSONCombinedORM/std": 0.1785355657339096, "step": 2389, "train_speed(iter/s)": 0.056548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 259.5625, "completions/min_length": 217.0, "epoch": 1.9768403639371381, "grad_norm": 0.17614588141441345, "kl": 0.05352783203125, "learning_rate": 4.0848761271350405e-09, "loss": 0.0005347281694412231, "memory(GiB)": 38.05, "reward": 0.7312610745429993, "reward_std": 0.13556692004203796, "rewards/VisualizationJSONCombinedORM/mean": 0.7312610745429993, "rewards/VisualizationJSONCombinedORM/std": 0.15106932818889618, "step": 2390, "train_speed(iter/s)": 0.056543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 275.0, "completions/min_length": 202.0, "epoch": 1.9776674937965262, "grad_norm": 0.17904488742351532, "kl": 0.1143798828125, "learning_rate": 3.798345850445917e-09, "loss": 0.001140819862484932, "memory(GiB)": 38.05, "reward": 0.6480370759963989, "reward_std": 0.11869694292545319, "rewards/VisualizationJSONCombinedORM/mean": 0.6480370759963989, "rewards/VisualizationJSONCombinedORM/std": 0.15414461493492126, "step": 2391, "train_speed(iter/s)": 0.056532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 258.0625, "completions/min_length": 213.0, "epoch": 1.978494623655914, "grad_norm": 0.259337842464447, "kl": 0.1123046875, "learning_rate": 3.522229676229949e-09, "loss": 0.0011243298649787903, "memory(GiB)": 38.05, "reward": 0.38906511664390564, "reward_std": 0.059516534209251404, "rewards/VisualizationJSONCombinedORM/mean": 0.38906511664390564, "rewards/VisualizationJSONCombinedORM/std": 0.14473934471607208, "step": 2392, "train_speed(iter/s)": 0.05652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 271.75, "completions/min_length": 220.0, "epoch": 1.9793217535153018, "grad_norm": 0.19809377193450928, "kl": 0.04498291015625, "learning_rate": 3.256528180023977e-09, "loss": 0.00044958293437957764, "memory(GiB)": 38.05, "reward": 0.6280705332756042, "reward_std": 0.08053787052631378, "rewards/VisualizationJSONCombinedORM/mean": 0.6280705332756042, "rewards/VisualizationJSONCombinedORM/std": 0.08022571355104446, "step": 2393, "train_speed(iter/s)": 0.056512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 278.3125, "completions/min_length": 218.0, "epoch": 1.9801488833746899, "grad_norm": 0.1799592822790146, "kl": 0.05609130859375, "learning_rate": 3.0012419156572047e-09, "loss": 0.0005618017166852951, "memory(GiB)": 38.05, "reward": 0.46198025345802307, "reward_std": 0.04525540769100189, "rewards/VisualizationJSONCombinedORM/mean": 0.46198025345802307, "rewards/VisualizationJSONCombinedORM/std": 0.2738693058490753, "step": 2394, "train_speed(iter/s)": 0.056497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 281.8125, "completions/min_length": 208.0, "epoch": 1.980976013234078, "grad_norm": 0.1937924027442932, "kl": 0.126953125, "learning_rate": 2.756371415250092e-09, "loss": 0.0012676455080509186, "memory(GiB)": 38.05, "reward": 0.49844181537628174, "reward_std": 0.09288778901100159, "rewards/VisualizationJSONCombinedORM/mean": 0.49844181537628174, "rewards/VisualizationJSONCombinedORM/std": 0.09175020456314087, "step": 2395, "train_speed(iter/s)": 0.056484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 277.1875, "completions/min_length": 220.0, "epoch": 1.9818031430934657, "grad_norm": 0.19066056609153748, "kl": 0.0863037109375, "learning_rate": 2.5219171892110207e-09, "loss": 0.0008632205426692963, "memory(GiB)": 38.05, "reward": 0.4429149031639099, "reward_std": 0.07109975814819336, "rewards/VisualizationJSONCombinedORM/mean": 0.4429149031639099, "rewards/VisualizationJSONCombinedORM/std": 0.17908039689064026, "step": 2396, "train_speed(iter/s)": 0.056477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 259.25, "completions/min_length": 218.0, "epoch": 1.9826302729528535, "grad_norm": 0.2055949568748474, "kl": 0.0758056640625, "learning_rate": 2.297879726237406e-09, "loss": 0.0007567033171653748, "memory(GiB)": 38.05, "reward": 0.5369873046875, "reward_std": 0.058104511350393295, "rewards/VisualizationJSONCombinedORM/mean": 0.5369873046875, "rewards/VisualizationJSONCombinedORM/std": 0.18925075232982635, "step": 2397, "train_speed(iter/s)": 0.056473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 257.4375, "completions/min_length": 198.0, "epoch": 1.9834574028122414, "grad_norm": 0.17316214740276337, "kl": 0.0772705078125, "learning_rate": 2.0842594933140338e-09, "loss": 0.0007765330374240875, "memory(GiB)": 38.05, "reward": 0.495139479637146, "reward_std": 0.03947757929563522, "rewards/VisualizationJSONCombinedORM/mean": 0.495139479637146, "rewards/VisualizationJSONCombinedORM/std": 0.21762360632419586, "step": 2398, "train_speed(iter/s)": 0.056463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 258.0, "completions/min_length": 190.0, "epoch": 1.9842845326716294, "grad_norm": 0.17297089099884033, "kl": 0.0411376953125, "learning_rate": 1.8810569357113896e-09, "loss": 0.00041061267256736755, "memory(GiB)": 38.05, "reward": 0.49180710315704346, "reward_std": 0.04229477792978287, "rewards/VisualizationJSONCombinedORM/mean": 0.49180710315704346, "rewards/VisualizationJSONCombinedORM/std": 0.13548395037651062, "step": 2399, "train_speed(iter/s)": 0.056451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 291.9375, "completions/min_length": 230.0, "epoch": 1.9851116625310175, "grad_norm": 0.1801607459783554, "kl": 0.0975341796875, "learning_rate": 1.688272476986219e-09, "loss": 0.0009777992963790894, "memory(GiB)": 38.05, "reward": 0.39457499980926514, "reward_std": 0.034860819578170776, "rewards/VisualizationJSONCombinedORM/mean": 0.39457499980926514, "rewards/VisualizationJSONCombinedORM/std": 0.12311672419309616, "step": 2400, "train_speed(iter/s)": 0.056436 }, { "epoch": 1.9851116625310175, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 332.3333333333333, "eval_completions/mean_length": 275.640625, "eval_completions/min_length": 230.58333333333334, "eval_kl": 0.06833902994791667, "eval_loss": 0.0006890526856295764, "eval_reward": 0.4689354617148638, "eval_reward_std": 0.06370575958862901, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4689354617148638, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06370575811403494, "eval_runtime": 291.0486, "eval_samples_per_second": 0.082, "eval_steps_per_second": 0.01, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 263.3125, "completions/min_length": 213.0, "epoch": 1.9859387923904053, "grad_norm": 0.17544302344322205, "kl": 0.15869140625, "learning_rate": 1.5059065189787502e-09, "loss": 0.001584760844707489, "memory(GiB)": 38.05, "reward": 0.48218223452568054, "reward_std": 0.0812561959028244, "rewards/VisualizationJSONCombinedORM/mean": 0.48218223452568054, "rewards/VisualizationJSONCombinedORM/std": 0.16094174981117249, "step": 2401, "train_speed(iter/s)": 0.056047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 275.5625, "completions/min_length": 213.0, "epoch": 1.986765922249793, "grad_norm": 0.16055862605571747, "kl": 0.05267333984375, "learning_rate": 1.3339594418138036e-09, "loss": 0.0005275271832942963, "memory(GiB)": 38.05, "reward": 0.47004106640815735, "reward_std": 0.059053778648376465, "rewards/VisualizationJSONCombinedORM/mean": 0.47004106640815735, "rewards/VisualizationJSONCombinedORM/std": 0.15313369035720825, "step": 2402, "train_speed(iter/s)": 0.056033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 264.75, "completions/min_length": 209.0, "epoch": 1.9875930521091811, "grad_norm": 0.2096538096666336, "kl": 0.057861328125, "learning_rate": 1.1724316038980166e-09, "loss": 0.0005780868232250214, "memory(GiB)": 38.05, "reward": 0.6499322652816772, "reward_std": 0.09209561347961426, "rewards/VisualizationJSONCombinedORM/mean": 0.6499322652816772, "rewards/VisualizationJSONCombinedORM/std": 0.09029009938240051, "step": 2403, "train_speed(iter/s)": 0.056025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 272.9375, "completions/min_length": 232.0, "epoch": 1.9884201819685692, "grad_norm": 0.18083630502223969, "kl": 0.1141357421875, "learning_rate": 1.0213233419203994e-09, "loss": 0.0011416226625442505, "memory(GiB)": 38.05, "reward": 0.6271140575408936, "reward_std": 0.1041141152381897, "rewards/VisualizationJSONCombinedORM/mean": 0.6271140575408936, "rewards/VisualizationJSONCombinedORM/std": 0.1355993151664734, "step": 2404, "train_speed(iter/s)": 0.056015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 259.6875, "completions/min_length": 205.0, "epoch": 1.989247311827957, "grad_norm": 0.1825801134109497, "kl": 0.085693359375, "learning_rate": 8.806349708528894e-10, "loss": 0.0008578002452850342, "memory(GiB)": 38.05, "reward": 0.5887919068336487, "reward_std": 0.09322191029787064, "rewards/VisualizationJSONCombinedORM/mean": 0.5887919068336487, "rewards/VisualizationJSONCombinedORM/std": 0.09424660354852676, "step": 2405, "train_speed(iter/s)": 0.056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 262.0, "completions/min_length": 227.0, "epoch": 1.9900744416873448, "grad_norm": 0.15920689702033997, "kl": 0.035858154296875, "learning_rate": 7.503667839453555e-10, "loss": 0.0003582984209060669, "memory(GiB)": 38.05, "reward": 0.5275036096572876, "reward_std": 0.10091113299131393, "rewards/VisualizationJSONCombinedORM/mean": 0.5275036096572876, "rewards/VisualizationJSONCombinedORM/std": 0.2584072947502136, "step": 2406, "train_speed(iter/s)": 0.055992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 291.5, "completions/min_length": 239.0, "epoch": 1.9909015715467329, "grad_norm": 0.21503743529319763, "kl": 0.0732421875, "learning_rate": 6.305190527305937e-10, "loss": 0.0007330290973186493, "memory(GiB)": 38.05, "reward": 0.36765146255493164, "reward_std": 0.04294292628765106, "rewards/VisualizationJSONCombinedORM/mean": 0.36765146255493164, "rewards/VisualizationJSONCombinedORM/std": 0.14569084346294403, "step": 2407, "train_speed(iter/s)": 0.055975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 253.25, "completions/min_length": 205.0, "epoch": 1.991728701406121, "grad_norm": 0.21277236938476562, "kl": 0.04718017578125, "learning_rate": 5.210920270187769e-10, "loss": 0.0004716813564300537, "memory(GiB)": 38.05, "reward": 0.7386527061462402, "reward_std": 0.0985867828130722, "rewards/VisualizationJSONCombinedORM/mean": 0.7386527061462402, "rewards/VisualizationJSONCombinedORM/std": 0.09587008506059647, "step": 2408, "train_speed(iter/s)": 0.055966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 259.75, "completions/min_length": 233.0, "epoch": 1.9925558312655087, "grad_norm": 0.20649249851703644, "kl": 0.073486328125, "learning_rate": 4.220859349002293e-10, "loss": 0.0007330477237701416, "memory(GiB)": 38.05, "reward": 0.5172789096832275, "reward_std": 0.06025183945894241, "rewards/VisualizationJSONCombinedORM/mean": 0.5172789096832275, "rewards/VisualizationJSONCombinedORM/std": 0.14197896420955658, "step": 2409, "train_speed(iter/s)": 0.055957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 272.5625, "completions/min_length": 231.0, "epoch": 1.9933829611248965, "grad_norm": 0.16506259143352509, "kl": 0.04083251953125, "learning_rate": 3.335009827437619e-10, "loss": 0.0004075244069099426, "memory(GiB)": 38.05, "reward": 0.3829517960548401, "reward_std": 0.06095583736896515, "rewards/VisualizationJSONCombinedORM/mean": 0.3829517960548401, "rewards/VisualizationJSONCombinedORM/std": 0.10730647295713425, "step": 2410, "train_speed(iter/s)": 0.055951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 283.0, "completions/min_length": 215.0, "epoch": 1.9942100909842844, "grad_norm": 0.14883075654506683, "kl": 0.0355224609375, "learning_rate": 2.5533735519667204e-10, "loss": 0.00035559386014938354, "memory(GiB)": 38.05, "reward": 0.6755223870277405, "reward_std": 0.06222330778837204, "rewards/VisualizationJSONCombinedORM/mean": 0.6755223870277405, "rewards/VisualizationJSONCombinedORM/std": 0.11831079423427582, "step": 2411, "train_speed(iter/s)": 0.05594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 291.5625, "completions/min_length": 217.0, "epoch": 1.9950372208436724, "grad_norm": 0.2242506891489029, "kl": 0.06719970703125, "learning_rate": 1.8759521518307845e-10, "loss": 0.000672563910484314, "memory(GiB)": 38.05, "reward": 0.35670995712280273, "reward_std": 0.05059521645307541, "rewards/VisualizationJSONCombinedORM/mean": 0.35670995712280273, "rewards/VisualizationJSONCombinedORM/std": 0.049461353570222855, "step": 2412, "train_speed(iter/s)": 0.055934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 266.25, "completions/min_length": 220.0, "epoch": 1.9958643507030605, "grad_norm": 0.16945531964302063, "kl": 0.04620361328125, "learning_rate": 1.3027470390447605e-10, "loss": 0.00046259164810180664, "memory(GiB)": 38.05, "reward": 0.7269760966300964, "reward_std": 0.10351815074682236, "rewards/VisualizationJSONCombinedORM/mean": 0.7269760966300964, "rewards/VisualizationJSONCombinedORM/std": 0.11232880502939224, "step": 2413, "train_speed(iter/s)": 0.055928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 262.75, "completions/min_length": 189.0, "epoch": 1.9966914805624483, "grad_norm": 0.15951070189476013, "kl": 0.124755859375, "learning_rate": 8.337594084084633e-11, "loss": 0.0012453235685825348, "memory(GiB)": 38.05, "reward": 0.5896207094192505, "reward_std": 0.060583166778087616, "rewards/VisualizationJSONCombinedORM/mean": 0.5896207094192505, "rewards/VisualizationJSONCombinedORM/std": 0.261643648147583, "step": 2414, "train_speed(iter/s)": 0.055921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 260.4375, "completions/min_length": 220.0, "epoch": 1.997518610421836, "grad_norm": 0.1637076735496521, "kl": 0.06524658203125, "learning_rate": 4.689902374732658e-11, "loss": 0.0006534978747367859, "memory(GiB)": 38.05, "reward": 0.5413919687271118, "reward_std": 0.0679037794470787, "rewards/VisualizationJSONCombinedORM/mean": 0.5413919687271118, "rewards/VisualizationJSONCombinedORM/std": 0.19339211285114288, "step": 2415, "train_speed(iter/s)": 0.055913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 241.8125, "completions/min_length": 208.0, "epoch": 1.9983457402812241, "grad_norm": 0.16759873926639557, "kl": 0.03497314453125, "learning_rate": 2.084402865754065e-11, "loss": 0.00035068392753601074, "memory(GiB)": 38.05, "reward": 0.2541761100292206, "reward_std": 0.03837994113564491, "rewards/VisualizationJSONCombinedORM/mean": 0.2541761100292206, "rewards/VisualizationJSONCombinedORM/std": 0.10085668414831161, "step": 2416, "train_speed(iter/s)": 0.055906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 280.5625, "completions/min_length": 245.0, "epoch": 1.9991728701406122, "grad_norm": 0.19452543556690216, "kl": 0.1287841796875, "learning_rate": 5.2110098797131244e-12, "loss": 0.0012871623039245605, "memory(GiB)": 38.05, "reward": 0.5469875931739807, "reward_std": 0.08546123653650284, "rewards/VisualizationJSONCombinedORM/mean": 0.5469875931739807, "rewards/VisualizationJSONCombinedORM/std": 0.13107888400554657, "step": 2417, "train_speed(iter/s)": 0.055902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 302.25, "completions/min_length": 251.0, "epoch": 2.0, "grad_norm": 0.20794081687927246, "kl": 0.06231689453125, "learning_rate": 0.0, "loss": 0.000622924417257309, "memory(GiB)": 38.05, "reward": 0.5485554933547974, "reward_std": 0.053238824009895325, "rewards/VisualizationJSONCombinedORM/mean": 0.5485554933547974, "rewards/VisualizationJSONCombinedORM/std": 0.1798371970653534, "step": 2418, "train_speed(iter/s)": 0.05589 }, { "epoch": 2.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 324.5833333333333, "eval_completions/mean_length": 273.0729166666667, "eval_completions/min_length": 232.79166666666666, "eval_kl": 0.067901611328125, "eval_loss": 0.0006845171446911991, "eval_reward": 0.4598940958579381, "eval_reward_std": 0.06740887979200731, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4598940958579381, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06740888254716992, "eval_runtime": 287.1022, "eval_samples_per_second": 0.084, "eval_steps_per_second": 0.01, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 267.0625, "completions/min_length": 205.0, "epoch": 2.000827129859388, "grad_norm": 0.15885832905769348, "kl": 0.05908203125, "learning_rate": 3.015673979809457e-06, "loss": 0.0005919747054576874, "memory(GiB)": 36.7, "reward": 0.4801689684391022, "reward_std": 0.06863737851381302, "rewards/VisualizationJSONCombinedORM/mean": 0.4801689684391022, "rewards/VisualizationJSONCombinedORM/std": 0.0710761770606041, "step": 2419, "train_speed(iter/s)": 20.052128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 272.375, "completions/min_length": 209.0, "epoch": 2.0016542597187756, "grad_norm": 0.17562001943588257, "kl": 0.040557861328125, "learning_rate": 3.011257627364945e-06, "loss": 0.00040566548705101013, "memory(GiB)": 37.19, "reward": 0.5258758068084717, "reward_std": 0.06542334705591202, "rewards/VisualizationJSONCombinedORM/mean": 0.5258758068084717, "rewards/VisualizationJSONCombinedORM/std": 0.07789511233568192, "step": 2420, "train_speed(iter/s)": 16.806068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 273.1875, "completions/min_length": 211.0, "epoch": 2.002481389578164, "grad_norm": 0.10552844405174255, "kl": 0.0267333984375, "learning_rate": 3.0068431172950387e-06, "loss": 0.0002672536647878587, "memory(GiB)": 37.41, "reward": 0.4924417734146118, "reward_std": 0.009458310902118683, "rewards/VisualizationJSONCombinedORM/mean": 0.4924417734146118, "rewards/VisualizationJSONCombinedORM/std": 0.21757732331752777, "step": 2421, "train_speed(iter/s)": 14.283473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 269.0, "completions/min_length": 212.0, "epoch": 2.0033085194375517, "grad_norm": 0.16909159719944, "kl": 0.05242919921875, "learning_rate": 3.002430453689347e-06, "loss": 0.0005236193537712097, "memory(GiB)": 37.41, "reward": 0.6363325119018555, "reward_std": 0.05249562859535217, "rewards/VisualizationJSONCombinedORM/mean": 0.6363325119018555, "rewards/VisualizationJSONCombinedORM/std": 0.14950209856033325, "step": 2422, "train_speed(iter/s)": 12.498511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 271.0625, "completions/min_length": 209.0, "epoch": 2.0041356492969395, "grad_norm": 0.17748090624809265, "kl": 0.04388427734375, "learning_rate": 2.998019640635772e-06, "loss": 0.0004381798207759857, "memory(GiB)": 37.41, "reward": 0.5656371116638184, "reward_std": 0.06626701354980469, "rewards/VisualizationJSONCombinedORM/mean": 0.5656371116638184, "rewards/VisualizationJSONCombinedORM/std": 0.0641578957438469, "step": 2423, "train_speed(iter/s)": 11.22022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 263.25, "completions/min_length": 211.0, "epoch": 2.0049627791563274, "grad_norm": 0.16442081332206726, "kl": 0.1629638671875, "learning_rate": 2.9936106822204937e-06, "loss": 0.0016283020377159119, "memory(GiB)": 37.41, "reward": 0.38162463903427124, "reward_std": 0.08267152309417725, "rewards/VisualizationJSONCombinedORM/mean": 0.38162463903427124, "rewards/VisualizationJSONCombinedORM/std": 0.07990234345197678, "step": 2424, "train_speed(iter/s)": 10.311705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 259.375, "completions/min_length": 219.0, "epoch": 2.0057899090157156, "grad_norm": 0.1584974229335785, "kl": 0.104248046875, "learning_rate": 2.9892035825279863e-06, "loss": 0.0010418854653835297, "memory(GiB)": 37.41, "reward": 0.717332124710083, "reward_std": 0.06708096712827682, "rewards/VisualizationJSONCombinedORM/mean": 0.717332124710083, "rewards/VisualizationJSONCombinedORM/std": 0.07381399720907211, "step": 2425, "train_speed(iter/s)": 9.377896 }, { "epoch": 2.0057899090157156, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 337.2083333333333, "eval_completions/mean_length": 275.8125, "eval_completions/min_length": 232.125, "eval_kl": 0.07332356770833333, "eval_loss": 0.000729131221305579, "eval_reward": 0.45098768795530003, "eval_reward_std": 0.066105374135077, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45098768795530003, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06610537646338344, "eval_runtime": 294.3071, "eval_samples_per_second": 0.082, "eval_steps_per_second": 0.01, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 271.25, "completions/min_length": 213.0, "epoch": 2.0066170388751035, "grad_norm": 0.1699235439300537, "kl": 0.0645751953125, "learning_rate": 2.9847983456409897e-06, "loss": 0.0006453897804021835, "memory(GiB)": 37.41, "reward": 0.3500407636165619, "reward_std": 0.037181321531534195, "rewards/VisualizationJSONCombinedORM/mean": 0.3500407636165619, "rewards/VisualizationJSONCombinedORM/std": 0.03647932410240173, "step": 2426, "train_speed(iter/s)": 4.222099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 277.3125, "completions/min_length": 213.0, "epoch": 2.0074441687344913, "grad_norm": 0.2234882414340973, "kl": 0.0518798828125, "learning_rate": 2.980394975640526e-06, "loss": 0.0005191303789615631, "memory(GiB)": 37.66, "reward": 0.5473743677139282, "reward_std": 0.10435327142477036, "rewards/VisualizationJSONCombinedORM/mean": 0.5473743677139282, "rewards/VisualizationJSONCombinedORM/std": 0.17222706973552704, "step": 2427, "train_speed(iter/s)": 4.035778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 276.625, "completions/min_length": 242.0, "epoch": 2.008271298593879, "grad_norm": 0.1720900982618332, "kl": 0.038116455078125, "learning_rate": 2.975993476605884e-06, "loss": 0.00038113445043563843, "memory(GiB)": 37.66, "reward": 0.432941198348999, "reward_std": 0.05539543926715851, "rewards/VisualizationJSONCombinedORM/mean": 0.432941198348999, "rewards/VisualizationJSONCombinedORM/std": 0.07642052322626114, "step": 2428, "train_speed(iter/s)": 3.901151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 263.5, "completions/min_length": 220.0, "epoch": 2.0090984284532674, "grad_norm": 0.177483931183815, "kl": 0.1263427734375, "learning_rate": 2.971593852614622e-06, "loss": 0.001261778175830841, "memory(GiB)": 37.66, "reward": 0.5168554782867432, "reward_std": 0.0780818834900856, "rewards/VisualizationJSONCombinedORM/mean": 0.5168554782867432, "rewards/VisualizationJSONCombinedORM/std": 0.23250527679920197, "step": 2429, "train_speed(iter/s)": 3.779664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 302.375, "completions/min_length": 238.0, "epoch": 2.009925558312655, "grad_norm": 0.1900358945131302, "kl": 0.05584716796875, "learning_rate": 2.9671961077425583e-06, "loss": 0.0005584917962551117, "memory(GiB)": 37.66, "reward": 0.40001380443573, "reward_std": 0.06630091369152069, "rewards/VisualizationJSONCombinedORM/mean": 0.40001380443573, "rewards/VisualizationJSONCombinedORM/std": 0.1188466027379036, "step": 2430, "train_speed(iter/s)": 3.622094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 281.625, "completions/min_length": 245.0, "epoch": 2.010752688172043, "grad_norm": 0.20330004394054413, "kl": 0.1002197265625, "learning_rate": 2.962800246063774e-06, "loss": 0.0010014139115810394, "memory(GiB)": 37.66, "reward": 0.4826512336730957, "reward_std": 0.11039452999830246, "rewards/VisualizationJSONCombinedORM/mean": 0.4826512336730957, "rewards/VisualizationJSONCombinedORM/std": 0.1377459168434143, "step": 2431, "train_speed(iter/s)": 3.497135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 281.0625, "completions/min_length": 224.0, "epoch": 2.011579818031431, "grad_norm": 0.16561417281627655, "kl": 0.05572509765625, "learning_rate": 2.9584062716505996e-06, "loss": 0.0005564317107200623, "memory(GiB)": 37.66, "reward": 0.5016152858734131, "reward_std": 0.06775525212287903, "rewards/VisualizationJSONCombinedORM/mean": 0.5016152858734131, "rewards/VisualizationJSONCombinedORM/std": 0.17142963409423828, "step": 2432, "train_speed(iter/s)": 3.370874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 266.1875, "completions/min_length": 209.0, "epoch": 2.0124069478908186, "grad_norm": 0.2006237953901291, "kl": 0.06048583984375, "learning_rate": 2.954014188573626e-06, "loss": 0.0006050374358892441, "memory(GiB)": 37.66, "reward": 0.5932536721229553, "reward_std": 0.09044967591762543, "rewards/VisualizationJSONCombinedORM/mean": 0.5932536721229553, "rewards/VisualizationJSONCombinedORM/std": 0.1123274490237236, "step": 2433, "train_speed(iter/s)": 3.262757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 260.5, "completions/min_length": 217.0, "epoch": 2.013234077750207, "grad_norm": 0.24267728626728058, "kl": 0.0701904296875, "learning_rate": 2.949624000901683e-06, "loss": 0.0007018186151981354, "memory(GiB)": 37.92, "reward": 0.6933331489562988, "reward_std": 0.12276878952980042, "rewards/VisualizationJSONCombinedORM/mean": 0.6933331489562988, "rewards/VisualizationJSONCombinedORM/std": 0.12371105700731277, "step": 2434, "train_speed(iter/s)": 3.136728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 278.75, "completions/min_length": 238.0, "epoch": 2.0140612076095947, "grad_norm": 0.1838821917772293, "kl": 0.05120849609375, "learning_rate": 2.9452357127018516e-06, "loss": 0.0005124751478433609, "memory(GiB)": 37.92, "reward": 0.6539673209190369, "reward_std": 0.08994415402412415, "rewards/VisualizationJSONCombinedORM/mean": 0.6539673209190369, "rewards/VisualizationJSONCombinedORM/std": 0.09115259349346161, "step": 2435, "train_speed(iter/s)": 3.033668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 256.75, "completions/min_length": 207.0, "epoch": 2.0148883374689825, "grad_norm": 0.44831541180610657, "kl": 0.146240234375, "learning_rate": 2.940849328039447e-06, "loss": 0.001461610198020935, "memory(GiB)": 37.92, "reward": 0.6581172347068787, "reward_std": 0.08388600498437881, "rewards/VisualizationJSONCombinedORM/mean": 0.6581172347068787, "rewards/VisualizationJSONCombinedORM/std": 0.09699376672506332, "step": 2436, "train_speed(iter/s)": 2.951453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 264.8125, "completions/min_length": 223.0, "epoch": 2.0157154673283704, "grad_norm": 0.16724203526973724, "kl": 0.09765625, "learning_rate": 2.936464850978027e-06, "loss": 0.0009756162762641907, "memory(GiB)": 37.92, "reward": 0.7632238268852234, "reward_std": 0.058549851179122925, "rewards/VisualizationJSONCombinedORM/mean": 0.7632238268852234, "rewards/VisualizationJSONCombinedORM/std": 0.05848447605967522, "step": 2437, "train_speed(iter/s)": 2.865135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 296.125, "completions/min_length": 241.0, "epoch": 2.0165425971877586, "grad_norm": 0.2168441265821457, "kl": 0.04962158203125, "learning_rate": 2.932082285579377e-06, "loss": 0.0004957839846611023, "memory(GiB)": 37.92, "reward": 0.543663740158081, "reward_std": 0.06446179747581482, "rewards/VisualizationJSONCombinedORM/mean": 0.543663740158081, "rewards/VisualizationJSONCombinedORM/std": 0.20749732851982117, "step": 2438, "train_speed(iter/s)": 2.778073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 288.125, "completions/min_length": 228.0, "epoch": 2.0173697270471465, "grad_norm": 0.20591023564338684, "kl": 0.08685302734375, "learning_rate": 2.9277016359035165e-06, "loss": 0.000867258757352829, "memory(GiB)": 37.92, "reward": 0.49601760506629944, "reward_std": 0.07791194319725037, "rewards/VisualizationJSONCombinedORM/mean": 0.49601760506629944, "rewards/VisualizationJSONCombinedORM/std": 0.12838760018348694, "step": 2439, "train_speed(iter/s)": 2.702879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 249.875, "completions/min_length": 205.0, "epoch": 2.0181968569065343, "grad_norm": 0.19111433625221252, "kl": 0.062255859375, "learning_rate": 2.923322906008683e-06, "loss": 0.0006209881976246834, "memory(GiB)": 37.92, "reward": 0.45574426651000977, "reward_std": 0.07635989785194397, "rewards/VisualizationJSONCombinedORM/mean": 0.45574426651000977, "rewards/VisualizationJSONCombinedORM/std": 0.08036326617002487, "step": 2440, "train_speed(iter/s)": 2.629113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 271.9375, "completions/min_length": 210.0, "epoch": 2.019023986765922, "grad_norm": 0.2098403424024582, "kl": 0.0709228515625, "learning_rate": 2.918946099951345e-06, "loss": 0.0007096566259860992, "memory(GiB)": 37.92, "reward": 0.6044207811355591, "reward_std": 0.13381235301494598, "rewards/VisualizationJSONCombinedORM/mean": 0.6044207811355591, "rewards/VisualizationJSONCombinedORM/std": 0.14076551795005798, "step": 2441, "train_speed(iter/s)": 2.560545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 284.75, "completions/min_length": 214.0, "epoch": 2.0198511166253104, "grad_norm": 0.18779535591602325, "kl": 0.078857421875, "learning_rate": 2.914571221786179e-06, "loss": 0.000787004828453064, "memory(GiB)": 37.92, "reward": 0.6792901754379272, "reward_std": 0.09750179946422577, "rewards/VisualizationJSONCombinedORM/mean": 0.6792901754379272, "rewards/VisualizationJSONCombinedORM/std": 0.10075022280216217, "step": 2442, "train_speed(iter/s)": 2.485593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 277.125, "completions/min_length": 213.0, "epoch": 2.020678246484698, "grad_norm": 0.1566038280725479, "kl": 0.07861328125, "learning_rate": 2.910198275566085e-06, "loss": 0.000786948949098587, "memory(GiB)": 37.92, "reward": 0.5018869042396545, "reward_std": 0.06995044648647308, "rewards/VisualizationJSONCombinedORM/mean": 0.5018869042396545, "rewards/VisualizationJSONCombinedORM/std": 0.07269944995641708, "step": 2443, "train_speed(iter/s)": 2.414564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 285.25, "completions/min_length": 229.0, "epoch": 2.021505376344086, "grad_norm": 0.21808014810085297, "kl": 0.0745849609375, "learning_rate": 2.9058272653421614e-06, "loss": 0.000745914876461029, "memory(GiB)": 37.92, "reward": 0.6458413004875183, "reward_std": 0.07397480309009552, "rewards/VisualizationJSONCombinedORM/mean": 0.6458413004875183, "rewards/VisualizationJSONCombinedORM/std": 0.1740744560956955, "step": 2444, "train_speed(iter/s)": 2.366175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 287.75, "completions/min_length": 224.0, "epoch": 2.022332506203474, "grad_norm": 0.21389856934547424, "kl": 0.083984375, "learning_rate": 2.9014581951637295e-06, "loss": 0.0008387770503759384, "memory(GiB)": 37.92, "reward": 0.44624072313308716, "reward_std": 0.0686451643705368, "rewards/VisualizationJSONCombinedORM/mean": 0.44624072313308716, "rewards/VisualizationJSONCombinedORM/std": 0.18775294721126556, "step": 2445, "train_speed(iter/s)": 2.307191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 285.0, "completions/min_length": 229.0, "epoch": 2.023159636062862, "grad_norm": 0.19375596940517426, "kl": 0.098388671875, "learning_rate": 2.897091069078296e-06, "loss": 0.0009837746620178223, "memory(GiB)": 37.92, "reward": 0.22854173183441162, "reward_std": 0.019503872841596603, "rewards/VisualizationJSONCombinedORM/mean": 0.22854173183441162, "rewards/VisualizationJSONCombinedORM/std": 0.08564712852239609, "step": 2446, "train_speed(iter/s)": 2.25674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 264.75, "completions/min_length": 201.0, "epoch": 2.02398676592225, "grad_norm": 0.183974951505661, "kl": 0.09661865234375, "learning_rate": 2.892725891131581e-06, "loss": 0.0009670369327068329, "memory(GiB)": 37.92, "reward": 0.4609769880771637, "reward_std": 0.08393387496471405, "rewards/VisualizationJSONCombinedORM/mean": 0.4609769880771637, "rewards/VisualizationJSONCombinedORM/std": 0.11858763545751572, "step": 2447, "train_speed(iter/s)": 2.1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 307.875, "completions/min_length": 238.0, "epoch": 2.0248138957816377, "grad_norm": 0.14846859872341156, "kl": 0.035888671875, "learning_rate": 2.8883626653674867e-06, "loss": 0.00035925954580307007, "memory(GiB)": 37.92, "reward": 0.579052209854126, "reward_std": 0.0701383724808693, "rewards/VisualizationJSONCombinedORM/mean": 0.579052209854126, "rewards/VisualizationJSONCombinedORM/std": 0.12756557762622833, "step": 2448, "train_speed(iter/s)": 2.14648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 282.0, "completions/min_length": 198.0, "epoch": 2.0256410256410255, "grad_norm": 0.2317999005317688, "kl": 0.07293701171875, "learning_rate": 2.8840013958281178e-06, "loss": 0.0007289387285709381, "memory(GiB)": 37.92, "reward": 0.5398348569869995, "reward_std": 0.06348278373479843, "rewards/VisualizationJSONCombinedORM/mean": 0.5398348569869995, "rewards/VisualizationJSONCombinedORM/std": 0.20221783220767975, "step": 2449, "train_speed(iter/s)": 2.099957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 272.625, "completions/min_length": 221.0, "epoch": 2.0264681555004134, "grad_norm": 0.19306734204292297, "kl": 0.07275390625, "learning_rate": 2.8796420865537617e-06, "loss": 0.0007260292768478394, "memory(GiB)": 37.92, "reward": 0.6692391633987427, "reward_std": 0.077421173453331, "rewards/VisualizationJSONCombinedORM/mean": 0.6692391633987427, "rewards/VisualizationJSONCombinedORM/std": 0.07905558496713638, "step": 2450, "train_speed(iter/s)": 2.051801 }, { "epoch": 2.0264681555004134, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 327.9583333333333, "eval_completions/mean_length": 275.578125, "eval_completions/min_length": 236.375, "eval_kl": 0.058237711588541664, "eval_loss": 0.000586439564358443, "eval_reward": 0.43245549003283185, "eval_reward_std": 0.058372231743608914, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.43245549003283185, "eval_rewards/VisualizationJSONCombinedORM/std": 0.058372234382356204, "eval_runtime": 287.8674, "eval_samples_per_second": 0.083, "eval_steps_per_second": 0.01, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 299.0, "completions/min_length": 237.0, "epoch": 2.0272952853598016, "grad_norm": 0.18962903320789337, "kl": 0.117431640625, "learning_rate": 2.8752847415828923e-06, "loss": 0.0011742711067199707, "memory(GiB)": 37.92, "reward": 0.5687896013259888, "reward_std": 0.04939322918653488, "rewards/VisualizationJSONCombinedORM/mean": 0.5687896013259888, "rewards/VisualizationJSONCombinedORM/std": 0.17473238706588745, "step": 2451, "train_speed(iter/s)": 1.626881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 311.5625, "completions/min_length": 249.0, "epoch": 2.0281224152191895, "grad_norm": 0.1928662806749344, "kl": 0.0670166015625, "learning_rate": 2.8709293649521575e-06, "loss": 0.0006697587668895721, "memory(GiB)": 37.92, "reward": 0.4824115037918091, "reward_std": 0.059291355311870575, "rewards/VisualizationJSONCombinedORM/mean": 0.4824115037918091, "rewards/VisualizationJSONCombinedORM/std": 0.21639324724674225, "step": 2452, "train_speed(iter/s)": 1.601884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 264.9375, "completions/min_length": 206.0, "epoch": 2.0289495450785773, "grad_norm": 0.17522163689136505, "kl": 0.035400390625, "learning_rate": 2.866575960696391e-06, "loss": 0.00035434961318969727, "memory(GiB)": 37.92, "reward": 0.6641495227813721, "reward_std": 0.09834899008274078, "rewards/VisualizationJSONCombinedORM/mean": 0.6641495227813721, "rewards/VisualizationJSONCombinedORM/std": 0.15837591886520386, "step": 2453, "train_speed(iter/s)": 1.577289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 272.0625, "completions/min_length": 222.0, "epoch": 2.029776674937965, "grad_norm": 0.2161010056734085, "kl": 0.052978515625, "learning_rate": 2.862224532848591e-06, "loss": 0.0005282312631607056, "memory(GiB)": 37.92, "reward": 0.5303592681884766, "reward_std": 0.07885929942131042, "rewards/VisualizationJSONCombinedORM/mean": 0.5303592681884766, "rewards/VisualizationJSONCombinedORM/std": 0.2665097713470459, "step": 2454, "train_speed(iter/s)": 1.553529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 289.375, "completions/min_length": 239.0, "epoch": 2.0306038047973534, "grad_norm": 0.18153484165668488, "kl": 0.05596923828125, "learning_rate": 2.8578750854399294e-06, "loss": 0.000558905303478241, "memory(GiB)": 37.92, "reward": 0.407085657119751, "reward_std": 0.05387134850025177, "rewards/VisualizationJSONCombinedORM/mean": 0.407085657119751, "rewards/VisualizationJSONCombinedORM/std": 0.13010114431381226, "step": 2455, "train_speed(iter/s)": 1.529702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 281.5625, "completions/min_length": 236.0, "epoch": 2.031430934656741, "grad_norm": 0.1828954517841339, "kl": 0.1402587890625, "learning_rate": 2.853527622499742e-06, "loss": 0.0014029666781425476, "memory(GiB)": 37.92, "reward": 0.5080870389938354, "reward_std": 0.046013638377189636, "rewards/VisualizationJSONCombinedORM/mean": 0.5080870389938354, "rewards/VisualizationJSONCombinedORM/std": 0.25186094641685486, "step": 2456, "train_speed(iter/s)": 1.505957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 247.1875, "completions/min_length": 213.0, "epoch": 2.032258064516129, "grad_norm": 0.17155933380126953, "kl": 0.0684814453125, "learning_rate": 2.8491821480555283e-06, "loss": 0.000684790313243866, "memory(GiB)": 37.92, "reward": 0.7241165637969971, "reward_std": 0.07986392825841904, "rewards/VisualizationJSONCombinedORM/mean": 0.7241165637969971, "rewards/VisualizationJSONCombinedORM/std": 0.07881283760070801, "step": 2457, "train_speed(iter/s)": 1.487999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 258.0625, "completions/min_length": 215.0, "epoch": 2.033085194375517, "grad_norm": 0.15298986434936523, "kl": 0.06201171875, "learning_rate": 2.844838666132944e-06, "loss": 0.0006189793348312378, "memory(GiB)": 37.92, "reward": 0.3819977045059204, "reward_std": 0.04458867758512497, "rewards/VisualizationJSONCombinedORM/mean": 0.3819977045059204, "rewards/VisualizationJSONCombinedORM/std": 0.15979988873004913, "step": 2458, "train_speed(iter/s)": 1.465394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 297.25, "completions/min_length": 227.0, "epoch": 2.033912324234905, "grad_norm": 0.1928533911705017, "kl": 0.08319091796875, "learning_rate": 2.8404971807557957e-06, "loss": 0.0008316412568092346, "memory(GiB)": 37.92, "reward": 0.5405057668685913, "reward_std": 0.0748993456363678, "rewards/VisualizationJSONCombinedORM/mean": 0.5405057668685913, "rewards/VisualizationJSONCombinedORM/std": 0.09527090936899185, "step": 2459, "train_speed(iter/s)": 1.446501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 281.125, "completions/min_length": 242.0, "epoch": 2.034739454094293, "grad_norm": 0.20451396703720093, "kl": 0.04840087890625, "learning_rate": 2.836157695946047e-06, "loss": 0.0004838407039642334, "memory(GiB)": 37.92, "reward": 0.5409306883811951, "reward_std": 0.06312192976474762, "rewards/VisualizationJSONCombinedORM/mean": 0.5409306883811951, "rewards/VisualizationJSONCombinedORM/std": 0.20622053742408752, "step": 2460, "train_speed(iter/s)": 1.425012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 283.6875, "completions/min_length": 212.0, "epoch": 2.0355665839536807, "grad_norm": 0.15871664881706238, "kl": 0.032073974609375, "learning_rate": 2.8318202157237984e-06, "loss": 0.0003202706575393677, "memory(GiB)": 37.92, "reward": 0.6404055953025818, "reward_std": 0.06484492868185043, "rewards/VisualizationJSONCombinedORM/mean": 0.6404055953025818, "rewards/VisualizationJSONCombinedORM/std": 0.1899627298116684, "step": 2461, "train_speed(iter/s)": 1.404685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 270.125, "completions/min_length": 200.0, "epoch": 2.0363937138130686, "grad_norm": 0.1950361728668213, "kl": 0.05828857421875, "learning_rate": 2.8274847441073082e-06, "loss": 0.0005830861628055573, "memory(GiB)": 37.92, "reward": 0.4592248201370239, "reward_std": 0.08259105682373047, "rewards/VisualizationJSONCombinedORM/mean": 0.4592248201370239, "rewards/VisualizationJSONCombinedORM/std": 0.2326062023639679, "step": 2462, "train_speed(iter/s)": 1.38334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 257.6875, "completions/min_length": 223.0, "epoch": 2.0372208436724564, "grad_norm": 0.18542207777500153, "kl": 0.094482421875, "learning_rate": 2.8231512851129596e-06, "loss": 0.0009439624845981598, "memory(GiB)": 37.92, "reward": 0.5285909175872803, "reward_std": 0.05127367004752159, "rewards/VisualizationJSONCombinedORM/mean": 0.5285909175872803, "rewards/VisualizationJSONCombinedORM/std": 0.16202975809574127, "step": 2463, "train_speed(iter/s)": 1.365753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 275.6875, "completions/min_length": 241.0, "epoch": 2.0380479735318446, "grad_norm": 0.16825297474861145, "kl": 0.0694580078125, "learning_rate": 2.8188198427552804e-06, "loss": 0.0006947889924049377, "memory(GiB)": 37.92, "reward": 0.6972507238388062, "reward_std": 0.08042393624782562, "rewards/VisualizationJSONCombinedORM/mean": 0.6972507238388062, "rewards/VisualizationJSONCombinedORM/std": 0.08089562505483627, "step": 2464, "train_speed(iter/s)": 1.348751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 272.0, "completions/min_length": 199.0, "epoch": 2.0388751033912325, "grad_norm": 0.1616704761981964, "kl": 0.0426025390625, "learning_rate": 2.8144904210469224e-06, "loss": 0.00042758136987686157, "memory(GiB)": 37.92, "reward": 0.7321649789810181, "reward_std": 0.03671599179506302, "rewards/VisualizationJSONCombinedORM/mean": 0.7321649789810181, "rewards/VisualizationJSONCombinedORM/std": 0.05694803223013878, "step": 2465, "train_speed(iter/s)": 1.330133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 280.5, "completions/min_length": 217.0, "epoch": 2.0397022332506203, "grad_norm": 0.15855807065963745, "kl": 0.05511474609375, "learning_rate": 2.810163023998673e-06, "loss": 0.000550558790564537, "memory(GiB)": 37.92, "reward": 0.6951885223388672, "reward_std": 0.05723964422941208, "rewards/VisualizationJSONCombinedORM/mean": 0.6951885223388672, "rewards/VisualizationJSONCombinedORM/std": 0.11395758390426636, "step": 2466, "train_speed(iter/s)": 1.313317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 268.4375, "completions/min_length": 221.0, "epoch": 2.040529363110008, "grad_norm": 0.2094094604253769, "kl": 0.08697509765625, "learning_rate": 2.80583765561944e-06, "loss": 0.0008701756596565247, "memory(GiB)": 37.92, "reward": 0.3078189790248871, "reward_std": 0.04998098313808441, "rewards/VisualizationJSONCombinedORM/mean": 0.3078189790248871, "rewards/VisualizationJSONCombinedORM/std": 0.057989057153463364, "step": 2467, "train_speed(iter/s)": 1.292337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 269.125, "completions/min_length": 213.0, "epoch": 2.0413564929693964, "grad_norm": 0.1710139811038971, "kl": 0.11279296875, "learning_rate": 2.8015143199162548e-06, "loss": 0.0011270642280578613, "memory(GiB)": 37.92, "reward": 0.4800291061401367, "reward_std": 0.05008404701948166, "rewards/VisualizationJSONCombinedORM/mean": 0.4800291061401367, "rewards/VisualizationJSONCombinedORM/std": 0.25761085748672485, "step": 2468, "train_speed(iter/s)": 1.273299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 263.25, "completions/min_length": 207.0, "epoch": 2.042183622828784, "grad_norm": 0.19070030748844147, "kl": 0.0540771484375, "learning_rate": 2.79719302089426e-06, "loss": 0.0005409866571426392, "memory(GiB)": 37.92, "reward": 0.31794947385787964, "reward_std": 0.041381433606147766, "rewards/VisualizationJSONCombinedORM/mean": 0.31794947385787964, "rewards/VisualizationJSONCombinedORM/std": 0.049838852137327194, "step": 2469, "train_speed(iter/s)": 1.257152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 289.3125, "completions/min_length": 216.0, "epoch": 2.043010752688172, "grad_norm": 0.22284451127052307, "kl": 0.07666015625, "learning_rate": 2.792873762556718e-06, "loss": 0.0007647573947906494, "memory(GiB)": 37.92, "reward": 0.6190376877784729, "reward_std": 0.09831127524375916, "rewards/VisualizationJSONCombinedORM/mean": 0.6190376877784729, "rewards/VisualizationJSONCombinedORM/std": 0.10911086946725845, "step": 2470, "train_speed(iter/s)": 1.242616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 260.5625, "completions/min_length": 203.0, "epoch": 2.04383788254756, "grad_norm": 0.23547644913196564, "kl": 0.045654296875, "learning_rate": 2.7885565489049948e-06, "loss": 0.00045593269169330597, "memory(GiB)": 37.92, "reward": 0.5331353545188904, "reward_std": 0.06592656672000885, "rewards/VisualizationJSONCombinedORM/mean": 0.5331353545188904, "rewards/VisualizationJSONCombinedORM/std": 0.06933468580245972, "step": 2471, "train_speed(iter/s)": 1.228598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 279.625, "completions/min_length": 225.0, "epoch": 2.044665012406948, "grad_norm": 0.17410580813884735, "kl": 0.05133056640625, "learning_rate": 2.784241383938566e-06, "loss": 0.0005139335989952087, "memory(GiB)": 37.92, "reward": 0.6708821058273315, "reward_std": 0.04600907489657402, "rewards/VisualizationJSONCombinedORM/mean": 0.6708821058273315, "rewards/VisualizationJSONCombinedORM/std": 0.12110153585672379, "step": 2472, "train_speed(iter/s)": 1.213887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 291.9375, "completions/min_length": 214.0, "epoch": 2.045492142266336, "grad_norm": 0.23324674367904663, "kl": 0.1458740234375, "learning_rate": 2.7799282716550093e-06, "loss": 0.001457810401916504, "memory(GiB)": 37.92, "reward": 0.49622637033462524, "reward_std": 0.050708454102277756, "rewards/VisualizationJSONCombinedORM/mean": 0.49622637033462524, "rewards/VisualizationJSONCombinedORM/std": 0.29800382256507874, "step": 2473, "train_speed(iter/s)": 1.197052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 282.8125, "completions/min_length": 245.0, "epoch": 2.0463192721257237, "grad_norm": 0.14346040785312653, "kl": 0.17218017578125, "learning_rate": 2.7756172160500016e-06, "loss": 0.0017165374010801315, "memory(GiB)": 37.92, "reward": 0.5796688795089722, "reward_std": 0.12615203857421875, "rewards/VisualizationJSONCombinedORM/mean": 0.5796688795089722, "rewards/VisualizationJSONCombinedORM/std": 0.1283298283815384, "step": 2474, "train_speed(iter/s)": 1.18409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 296.125, "completions/min_length": 253.0, "epoch": 2.0471464019851116, "grad_norm": 0.20429709553718567, "kl": 0.09130859375, "learning_rate": 2.771308221117309e-06, "loss": 0.0009113792330026627, "memory(GiB)": 38.0, "reward": 0.4405028820037842, "reward_std": 0.10063999891281128, "rewards/VisualizationJSONCombinedORM/mean": 0.4405028820037842, "rewards/VisualizationJSONCombinedORM/std": 0.1649741232395172, "step": 2475, "train_speed(iter/s)": 1.166925 }, { "epoch": 2.0471464019851116, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 321.0833333333333, "eval_completions/mean_length": 275.828125, "eval_completions/min_length": 233.91666666666666, "eval_kl": 0.1076507568359375, "eval_loss": 0.0010897926986217499, "eval_reward": 0.4747190438210964, "eval_reward_std": 0.06062715554920336, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4747190438210964, "eval_rewards/VisualizationJSONCombinedORM/std": 0.060627157256628074, "eval_runtime": 284.1942, "eval_samples_per_second": 0.084, "eval_steps_per_second": 0.011, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 300.375, "completions/min_length": 247.0, "epoch": 2.0479735318444994, "grad_norm": 0.17379778623580933, "kl": 0.1075439453125, "learning_rate": 2.7670012908487966e-06, "loss": 0.0010763220489025116, "memory(GiB)": 38.0, "reward": 0.7008803486824036, "reward_std": 0.07906471937894821, "rewards/VisualizationJSONCombinedORM/mean": 0.7008803486824036, "rewards/VisualizationJSONCombinedORM/std": 0.07773218303918839, "step": 2476, "train_speed(iter/s)": 1.019383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 299.625, "completions/min_length": 227.0, "epoch": 2.0488006617038876, "grad_norm": 0.17555485665798187, "kl": 0.0645751953125, "learning_rate": 2.762696429234405e-06, "loss": 0.0006453096866607666, "memory(GiB)": 38.0, "reward": 0.7200911045074463, "reward_std": 0.09843315184116364, "rewards/VisualizationJSONCombinedORM/mean": 0.7200911045074463, "rewards/VisualizationJSONCombinedORM/std": 0.1379464864730835, "step": 2477, "train_speed(iter/s)": 1.01017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 270.4375, "completions/min_length": 208.0, "epoch": 2.0496277915632755, "grad_norm": 0.19122126698493958, "kl": 0.0799560546875, "learning_rate": 2.7583936402621753e-06, "loss": 0.0008023828268051147, "memory(GiB)": 38.0, "reward": 0.5729717016220093, "reward_std": 0.08500310778617859, "rewards/VisualizationJSONCombinedORM/mean": 0.5729717016220093, "rewards/VisualizationJSONCombinedORM/std": 0.10825328528881073, "step": 2478, "train_speed(iter/s)": 1.000275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 279.5, "completions/min_length": 227.0, "epoch": 2.0504549214226633, "grad_norm": 0.16528883576393127, "kl": 0.096923828125, "learning_rate": 2.754092927918213e-06, "loss": 0.0009695813059806824, "memory(GiB)": 38.0, "reward": 0.47335460782051086, "reward_std": 0.051546696573495865, "rewards/VisualizationJSONCombinedORM/mean": 0.47335460782051086, "rewards/VisualizationJSONCombinedORM/std": 0.292098730802536, "step": 2479, "train_speed(iter/s)": 0.98921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 302.8125, "completions/min_length": 246.0, "epoch": 2.051282051282051, "grad_norm": 0.16221410036087036, "kl": 0.0723876953125, "learning_rate": 2.7497942961867098e-06, "loss": 0.00072435662150383, "memory(GiB)": 38.0, "reward": 0.6135824918746948, "reward_std": 0.07644607871770859, "rewards/VisualizationJSONCombinedORM/mean": 0.6135824918746948, "rewards/VisualizationJSONCombinedORM/std": 0.1959104686975479, "step": 2480, "train_speed(iter/s)": 0.980029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 287.0625, "completions/min_length": 254.0, "epoch": 2.0521091811414394, "grad_norm": 0.2443254590034485, "kl": 0.135986328125, "learning_rate": 2.745497749049922e-06, "loss": 0.0013597644865512848, "memory(GiB)": 38.0, "reward": 0.31198835372924805, "reward_std": 0.04590015858411789, "rewards/VisualizationJSONCombinedORM/mean": 0.31198835372924805, "rewards/VisualizationJSONCombinedORM/std": 0.11759645491838455, "step": 2481, "train_speed(iter/s)": 0.970331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 284.5625, "completions/min_length": 202.0, "epoch": 2.052936311000827, "grad_norm": 0.2039559930562973, "kl": 0.06817626953125, "learning_rate": 2.7412032904881806e-06, "loss": 0.0006799064576625824, "memory(GiB)": 38.0, "reward": 0.5305753946304321, "reward_std": 0.0830094963312149, "rewards/VisualizationJSONCombinedORM/mean": 0.5305753946304321, "rewards/VisualizationJSONCombinedORM/std": 0.08908189088106155, "step": 2482, "train_speed(iter/s)": 0.961666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 279.5, "completions/min_length": 212.0, "epoch": 2.053763440860215, "grad_norm": 0.23923727869987488, "kl": 0.0889892578125, "learning_rate": 2.736910924479881e-06, "loss": 0.0008890777826309204, "memory(GiB)": 38.0, "reward": 0.36430802941322327, "reward_std": 0.07356707006692886, "rewards/VisualizationJSONCombinedORM/mean": 0.36430802941322327, "rewards/VisualizationJSONCombinedORM/std": 0.2138400673866272, "step": 2483, "train_speed(iter/s)": 0.953245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 285.1875, "completions/min_length": 251.0, "epoch": 2.054590570719603, "grad_norm": 0.172642782330513, "kl": 0.04095458984375, "learning_rate": 2.7326206550014793e-06, "loss": 0.0004077106714248657, "memory(GiB)": 38.0, "reward": 0.41859060525894165, "reward_std": 0.02308458276093006, "rewards/VisualizationJSONCombinedORM/mean": 0.41859060525894165, "rewards/VisualizationJSONCombinedORM/std": 0.02885827235877514, "step": 2484, "train_speed(iter/s)": 0.943338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 278.1875, "completions/min_length": 210.0, "epoch": 2.055417700578991, "grad_norm": 0.17248143255710602, "kl": 0.05084228515625, "learning_rate": 2.7283324860274867e-06, "loss": 0.0005093924701213837, "memory(GiB)": 38.0, "reward": 0.679031491279602, "reward_std": 0.09022270888090134, "rewards/VisualizationJSONCombinedORM/mean": 0.679031491279602, "rewards/VisualizationJSONCombinedORM/std": 0.09101398289203644, "step": 2485, "train_speed(iter/s)": 0.934629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 314.9375, "completions/min_length": 250.0, "epoch": 2.056244830438379, "grad_norm": 0.15667875111103058, "kl": 0.0654296875, "learning_rate": 2.724046421530474e-06, "loss": 0.000654362142086029, "memory(GiB)": 38.0, "reward": 0.7154380083084106, "reward_std": 0.07800880074501038, "rewards/VisualizationJSONCombinedORM/mean": 0.7154380083084106, "rewards/VisualizationJSONCombinedORM/std": 0.11046352982521057, "step": 2486, "train_speed(iter/s)": 0.925294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 287.625, "completions/min_length": 229.0, "epoch": 2.0570719602977667, "grad_norm": 0.18785110116004944, "kl": 0.102783203125, "learning_rate": 2.719762465481055e-06, "loss": 0.0010298583656549454, "memory(GiB)": 38.0, "reward": 0.6925044059753418, "reward_std": 0.06051141023635864, "rewards/VisualizationJSONCombinedORM/mean": 0.6925044059753418, "rewards/VisualizationJSONCombinedORM/std": 0.06586483120918274, "step": 2487, "train_speed(iter/s)": 0.917375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 294.625, "completions/min_length": 238.0, "epoch": 2.0578990901571546, "grad_norm": 0.1704368144273758, "kl": 0.079833984375, "learning_rate": 2.715480621847897e-06, "loss": 0.0007980931550264359, "memory(GiB)": 38.0, "reward": 0.7340400218963623, "reward_std": 0.09956193715333939, "rewards/VisualizationJSONCombinedORM/mean": 0.7340400218963623, "rewards/VisualizationJSONCombinedORM/std": 0.09695329517126083, "step": 2488, "train_speed(iter/s)": 0.908608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 262.3125, "completions/min_length": 216.0, "epoch": 2.058726220016543, "grad_norm": 0.2021443396806717, "kl": 0.089111328125, "learning_rate": 2.7112008945977076e-06, "loss": 0.0008915774524211884, "memory(GiB)": 38.0, "reward": 0.6808828711509705, "reward_std": 0.1108463779091835, "rewards/VisualizationJSONCombinedORM/mean": 0.6808828711509705, "rewards/VisualizationJSONCombinedORM/std": 0.11661390960216522, "step": 2489, "train_speed(iter/s)": 0.899153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 275.875, "completions/min_length": 220.0, "epoch": 2.0595533498759306, "grad_norm": 0.18344484269618988, "kl": 0.1004638671875, "learning_rate": 2.7069232876952368e-06, "loss": 0.0010052211582660675, "memory(GiB)": 38.0, "reward": 0.6985365748405457, "reward_std": 0.06812500953674316, "rewards/VisualizationJSONCombinedORM/mean": 0.6985365748405457, "rewards/VisualizationJSONCombinedORM/std": 0.10399003326892853, "step": 2490, "train_speed(iter/s)": 0.893217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 279.375, "completions/min_length": 200.0, "epoch": 2.0603804797353185, "grad_norm": 0.23259185254573822, "kl": 0.096923828125, "learning_rate": 2.7026478051032625e-06, "loss": 0.0009711124002933502, "memory(GiB)": 38.0, "reward": 0.3735518455505371, "reward_std": 0.06556740403175354, "rewards/VisualizationJSONCombinedORM/mean": 0.3735518455505371, "rewards/VisualizationJSONCombinedORM/std": 0.06578419357538223, "step": 2491, "train_speed(iter/s)": 0.884478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 265.3125, "completions/min_length": 225.0, "epoch": 2.0612076095947063, "grad_norm": 0.27100619673728943, "kl": 0.0660400390625, "learning_rate": 2.6983744507826035e-06, "loss": 0.0006601158529520035, "memory(GiB)": 38.0, "reward": 0.4913378059864044, "reward_std": 0.09829214215278625, "rewards/VisualizationJSONCombinedORM/mean": 0.4913378059864044, "rewards/VisualizationJSONCombinedORM/std": 0.1985805332660675, "step": 2492, "train_speed(iter/s)": 0.878567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 292.5, "completions/min_length": 226.0, "epoch": 2.062034739454094, "grad_norm": 0.15809445083141327, "kl": 0.06732177734375, "learning_rate": 2.694103228692099e-06, "loss": 0.0006735473871231079, "memory(GiB)": 38.0, "reward": 0.5105755925178528, "reward_std": 0.060557834804058075, "rewards/VisualizationJSONCombinedORM/mean": 0.5105755925178528, "rewards/VisualizationJSONCombinedORM/std": 0.15480028092861176, "step": 2493, "train_speed(iter/s)": 0.870278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 282.5, "completions/min_length": 226.0, "epoch": 2.0628618693134824, "grad_norm": 0.16813315451145172, "kl": 0.07305908203125, "learning_rate": 2.689834142788622e-06, "loss": 0.0007298626005649567, "memory(GiB)": 38.0, "reward": 0.6141161322593689, "reward_std": 0.06295575201511383, "rewards/VisualizationJSONCombinedORM/mean": 0.6141161322593689, "rewards/VisualizationJSONCombinedORM/std": 0.06606648862361908, "step": 2494, "train_speed(iter/s)": 0.861895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 299.6875, "completions/min_length": 234.0, "epoch": 2.06368899917287, "grad_norm": 0.2087298333644867, "kl": 0.08984375, "learning_rate": 2.685567197027058e-06, "loss": 0.0008958578109741211, "memory(GiB)": 38.0, "reward": 0.31334590911865234, "reward_std": 0.027104057371616364, "rewards/VisualizationJSONCombinedORM/mean": 0.31334590911865234, "rewards/VisualizationJSONCombinedORM/std": 0.08693305402994156, "step": 2495, "train_speed(iter/s)": 0.85434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 277.625, "completions/min_length": 223.0, "epoch": 2.064516129032258, "grad_norm": 0.189228817820549, "kl": 0.100341796875, "learning_rate": 2.6813023953603168e-06, "loss": 0.0010043084621429443, "memory(GiB)": 38.0, "reward": 0.37180858850479126, "reward_std": 0.05727183446288109, "rewards/VisualizationJSONCombinedORM/mean": 0.37180858850479126, "rewards/VisualizationJSONCombinedORM/std": 0.1648748815059662, "step": 2496, "train_speed(iter/s)": 0.847922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 291.75, "completions/min_length": 213.0, "epoch": 2.065343258891646, "grad_norm": 0.18352024257183075, "kl": 0.090576171875, "learning_rate": 2.6770397417393145e-06, "loss": 0.0009070262312889099, "memory(GiB)": 38.0, "reward": 0.5829606652259827, "reward_std": 0.0649556964635849, "rewards/VisualizationJSONCombinedORM/mean": 0.5829606652259827, "rewards/VisualizationJSONCombinedORM/std": 0.16830499470233917, "step": 2497, "train_speed(iter/s)": 0.841149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 285.4375, "completions/min_length": 207.0, "epoch": 2.066170388751034, "grad_norm": 0.17995455861091614, "kl": 0.0501708984375, "learning_rate": 2.6727792401129837e-06, "loss": 0.0005021467804908752, "memory(GiB)": 38.0, "reward": 0.6770660877227783, "reward_std": 0.06266570091247559, "rewards/VisualizationJSONCombinedORM/mean": 0.6770660877227783, "rewards/VisualizationJSONCombinedORM/std": 0.12619690597057343, "step": 2498, "train_speed(iter/s)": 0.834728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 278.75, "completions/min_length": 232.0, "epoch": 2.066997518610422, "grad_norm": 0.15845036506652832, "kl": 0.04345703125, "learning_rate": 2.668520894428259e-06, "loss": 0.00043470412492752075, "memory(GiB)": 38.0, "reward": 0.6339925527572632, "reward_std": 0.06311935186386108, "rewards/VisualizationJSONCombinedORM/mean": 0.6339925527572632, "rewards/VisualizationJSONCombinedORM/std": 0.19684815406799316, "step": 2499, "train_speed(iter/s)": 0.826917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 274.0, "completions/min_length": 237.0, "epoch": 2.0678246484698097, "grad_norm": 0.20507082343101501, "kl": 0.08514404296875, "learning_rate": 2.6642647086300845e-06, "loss": 0.000852733850479126, "memory(GiB)": 38.0, "reward": 0.5665014982223511, "reward_std": 0.07788842916488647, "rewards/VisualizationJSONCombinedORM/mean": 0.5665014982223511, "rewards/VisualizationJSONCombinedORM/std": 0.14508405327796936, "step": 2500, "train_speed(iter/s)": 0.820502 }, { "epoch": 2.0678246484698097, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 344.3333333333333, "eval_completions/mean_length": 281.7864583333333, "eval_completions/min_length": 238.875, "eval_kl": 0.08891805013020833, "eval_loss": 0.0008935772930271924, "eval_reward": 0.4954807497560978, "eval_reward_std": 0.06877236991810302, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4954807497560978, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06877237061659495, "eval_runtime": 297.8814, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 268.25, "completions/min_length": 220.0, "epoch": 2.0686517783291976, "grad_norm": 0.17073622345924377, "kl": 0.0997314453125, "learning_rate": 2.660010686661393e-06, "loss": 0.0009987223893404007, "memory(GiB)": 38.0, "reward": 0.5299773216247559, "reward_std": 0.06310690939426422, "rewards/VisualizationJSONCombinedORM/mean": 0.5299773216247559, "rewards/VisualizationJSONCombinedORM/std": 0.12585130333900452, "step": 2501, "train_speed(iter/s)": 0.742289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 266.6875, "completions/min_length": 227.0, "epoch": 2.069478908188586, "grad_norm": 0.17314723134040833, "kl": 0.047607421875, "learning_rate": 2.6557588324631223e-06, "loss": 0.00047548674046993256, "memory(GiB)": 38.0, "reward": 0.46280959248542786, "reward_std": 0.05154823511838913, "rewards/VisualizationJSONCombinedORM/mean": 0.46280959248542786, "rewards/VisualizationJSONCombinedORM/std": 0.2992688715457916, "step": 2502, "train_speed(iter/s)": 0.737444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 275.625, "completions/min_length": 236.0, "epoch": 2.0703060380479736, "grad_norm": 0.1789543777704239, "kl": 0.09771728515625, "learning_rate": 2.6515091499741946e-06, "loss": 0.0009749317541718483, "memory(GiB)": 38.0, "reward": 0.6394494771957397, "reward_std": 0.11404438316822052, "rewards/VisualizationJSONCombinedORM/mean": 0.6394494771957397, "rewards/VisualizationJSONCombinedORM/std": 0.14506185054779053, "step": 2503, "train_speed(iter/s)": 0.732267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 265.1875, "completions/min_length": 222.0, "epoch": 2.0711331679073615, "grad_norm": 0.19654297828674316, "kl": 0.10546875, "learning_rate": 2.6472616431315247e-06, "loss": 0.001057121902704239, "memory(GiB)": 38.0, "reward": 0.5772063136100769, "reward_std": 0.05896209552884102, "rewards/VisualizationJSONCombinedORM/mean": 0.5772063136100769, "rewards/VisualizationJSONCombinedORM/std": 0.2598699629306793, "step": 2504, "train_speed(iter/s)": 0.726209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 275.25, "completions/min_length": 219.0, "epoch": 2.0719602977667493, "grad_norm": 0.18042004108428955, "kl": 0.0509033203125, "learning_rate": 2.6430163158700116e-06, "loss": 0.0005095154047012329, "memory(GiB)": 38.0, "reward": 0.36997807025909424, "reward_std": 0.13394948840141296, "rewards/VisualizationJSONCombinedORM/mean": 0.36997807025909424, "rewards/VisualizationJSONCombinedORM/std": 0.18722979724407196, "step": 2505, "train_speed(iter/s)": 0.721107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 283.5, "completions/min_length": 237.0, "epoch": 2.072787427626137, "grad_norm": 0.18618154525756836, "kl": 0.086669921875, "learning_rate": 2.638773172122534e-06, "loss": 0.0008650906383991241, "memory(GiB)": 38.0, "reward": 0.47133052349090576, "reward_std": 0.08008076250553131, "rewards/VisualizationJSONCombinedORM/mean": 0.47133052349090576, "rewards/VisualizationJSONCombinedORM/std": 0.10513466596603394, "step": 2506, "train_speed(iter/s)": 0.715837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 271.3125, "completions/min_length": 222.0, "epoch": 2.0736145574855254, "grad_norm": 0.2042265087366104, "kl": 0.1104736328125, "learning_rate": 2.6345322158199503e-06, "loss": 0.001103116199374199, "memory(GiB)": 38.0, "reward": 0.43143171072006226, "reward_std": 0.094667449593544, "rewards/VisualizationJSONCombinedORM/mean": 0.43143171072006226, "rewards/VisualizationJSONCombinedORM/std": 0.11634347587823868, "step": 2507, "train_speed(iter/s)": 0.711751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 275.6875, "completions/min_length": 201.0, "epoch": 2.074441687344913, "grad_norm": 0.18742194771766663, "kl": 0.0994873046875, "learning_rate": 2.630293450891086e-06, "loss": 0.0009922198951244354, "memory(GiB)": 38.0, "reward": 0.4250454306602478, "reward_std": 0.04547090455889702, "rewards/VisualizationJSONCombinedORM/mean": 0.4250454306602478, "rewards/VisualizationJSONCombinedORM/std": 0.21510916948318481, "step": 2508, "train_speed(iter/s)": 0.706089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 270.4375, "completions/min_length": 202.0, "epoch": 2.075268817204301, "grad_norm": 0.1741107553243637, "kl": 0.04412841796875, "learning_rate": 2.6260568812627453e-06, "loss": 0.00044079869985580444, "memory(GiB)": 38.0, "reward": 0.4941539466381073, "reward_std": 0.07404068857431412, "rewards/VisualizationJSONCombinedORM/mean": 0.4941539466381073, "rewards/VisualizationJSONCombinedORM/std": 0.08916951715946198, "step": 2509, "train_speed(iter/s)": 0.701784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 289.25, "completions/min_length": 213.0, "epoch": 2.076095947063689, "grad_norm": 0.18183094263076782, "kl": 0.07244873046875, "learning_rate": 2.621822510859687e-06, "loss": 0.0007244721055030823, "memory(GiB)": 38.0, "reward": 0.3504801094532013, "reward_std": 0.05742944777011871, "rewards/VisualizationJSONCombinedORM/mean": 0.3504801094532013, "rewards/VisualizationJSONCombinedORM/std": 0.058246150612831116, "step": 2510, "train_speed(iter/s)": 0.697367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 286.3125, "completions/min_length": 248.0, "epoch": 2.076923076923077, "grad_norm": 0.16091230511665344, "kl": 0.0372314453125, "learning_rate": 2.617590343604648e-06, "loss": 0.0003730505704879761, "memory(GiB)": 38.0, "reward": 0.61435866355896, "reward_std": 0.05112791061401367, "rewards/VisualizationJSONCombinedORM/mean": 0.61435866355896, "rewards/VisualizationJSONCombinedORM/std": 0.18280763924121857, "step": 2511, "train_speed(iter/s)": 0.6928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 271.375, "completions/min_length": 222.0, "epoch": 2.077750206782465, "grad_norm": 0.17308178544044495, "kl": 0.0802001953125, "learning_rate": 2.613360383418311e-06, "loss": 0.0008008256554603577, "memory(GiB)": 38.0, "reward": 0.6776206493377686, "reward_std": 0.046388424932956696, "rewards/VisualizationJSONCombinedORM/mean": 0.6776206493377686, "rewards/VisualizationJSONCombinedORM/std": 0.08108587563037872, "step": 2512, "train_speed(iter/s)": 0.688272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 257.6875, "completions/min_length": 211.0, "epoch": 2.0785773366418527, "grad_norm": 0.18400758504867554, "kl": 0.067626953125, "learning_rate": 2.609132634219321e-06, "loss": 0.000675467774271965, "memory(GiB)": 38.0, "reward": 0.7605139017105103, "reward_std": 0.022824181243777275, "rewards/VisualizationJSONCombinedORM/mean": 0.7605139017105103, "rewards/VisualizationJSONCombinedORM/std": 0.049199555069208145, "step": 2513, "train_speed(iter/s)": 0.684455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 312.3125, "completions/min_length": 260.0, "epoch": 2.0794044665012406, "grad_norm": 0.19078490138053894, "kl": 0.08331298828125, "learning_rate": 2.6049070999242708e-06, "loss": 0.0008324254304170609, "memory(GiB)": 38.0, "reward": 0.4784882664680481, "reward_std": 0.06866864860057831, "rewards/VisualizationJSONCombinedORM/mean": 0.4784882664680481, "rewards/VisualizationJSONCombinedORM/std": 0.08849484473466873, "step": 2514, "train_speed(iter/s)": 0.680041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 265.25, "completions/min_length": 225.0, "epoch": 2.080231596360629, "grad_norm": 0.17072370648384094, "kl": 0.0743408203125, "learning_rate": 2.600683784447704e-06, "loss": 0.000742785632610321, "memory(GiB)": 38.0, "reward": 0.618352472782135, "reward_std": 0.10239634662866592, "rewards/VisualizationJSONCombinedORM/mean": 0.618352472782135, "rewards/VisualizationJSONCombinedORM/std": 0.13490651547908783, "step": 2515, "train_speed(iter/s)": 0.674743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 274.8125, "completions/min_length": 221.0, "epoch": 2.0810587262200166, "grad_norm": 0.16936492919921875, "kl": 0.048583984375, "learning_rate": 2.596462691702108e-06, "loss": 0.0004861988127231598, "memory(GiB)": 38.0, "reward": 0.7413655519485474, "reward_std": 0.07567650824785233, "rewards/VisualizationJSONCombinedORM/mean": 0.7413655519485474, "rewards/VisualizationJSONCombinedORM/std": 0.09116972982883453, "step": 2516, "train_speed(iter/s)": 0.671049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 280.375, "completions/min_length": 189.0, "epoch": 2.0818858560794045, "grad_norm": 0.1478574573993683, "kl": 0.04327392578125, "learning_rate": 2.5922438255979125e-06, "loss": 0.00043315812945365906, "memory(GiB)": 38.0, "reward": 0.47587770223617554, "reward_std": 0.061806898564100266, "rewards/VisualizationJSONCombinedORM/mean": 0.47587770223617554, "rewards/VisualizationJSONCombinedORM/std": 0.25072407722473145, "step": 2517, "train_speed(iter/s)": 0.666352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 290.4375, "completions/min_length": 217.0, "epoch": 2.0827129859387923, "grad_norm": 0.15320011973381042, "kl": 0.10546875, "learning_rate": 2.5880271900434797e-06, "loss": 0.0010553225874900818, "memory(GiB)": 38.0, "reward": 0.5170342326164246, "reward_std": 0.037711113691329956, "rewards/VisualizationJSONCombinedORM/mean": 0.5170342326164246, "rewards/VisualizationJSONCombinedORM/std": 0.29243072867393494, "step": 2518, "train_speed(iter/s)": 0.66217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 295.125, "completions/min_length": 250.0, "epoch": 2.08354011579818, "grad_norm": 0.18462543189525604, "kl": 0.0682373046875, "learning_rate": 2.5838127889451113e-06, "loss": 0.0006826119497418404, "memory(GiB)": 38.0, "reward": 0.6299059391021729, "reward_std": 0.04179855063557625, "rewards/VisualizationJSONCombinedORM/mean": 0.6299059391021729, "rewards/VisualizationJSONCombinedORM/std": 0.1947731077671051, "step": 2519, "train_speed(iter/s)": 0.657809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 276.25, "completions/min_length": 223.0, "epoch": 2.0843672456575684, "grad_norm": 0.19551363587379456, "kl": 0.19189453125, "learning_rate": 2.5796006262070337e-06, "loss": 0.0019193664193153381, "memory(GiB)": 38.0, "reward": 0.605709433555603, "reward_std": 0.12656007707118988, "rewards/VisualizationJSONCombinedORM/mean": 0.605709433555603, "rewards/VisualizationJSONCombinedORM/std": 0.1534036099910736, "step": 2520, "train_speed(iter/s)": 0.653241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 285.5625, "completions/min_length": 247.0, "epoch": 2.085194375516956, "grad_norm": 0.16959743201732635, "kl": 0.0791015625, "learning_rate": 2.575390705731404e-06, "loss": 0.0007910206913948059, "memory(GiB)": 38.0, "reward": 0.6717469692230225, "reward_std": 0.08195098489522934, "rewards/VisualizationJSONCombinedORM/mean": 0.6717469692230225, "rewards/VisualizationJSONCombinedORM/std": 0.12444393336772919, "step": 2521, "train_speed(iter/s)": 0.649101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 258.75, "completions/min_length": 228.0, "epoch": 2.086021505376344, "grad_norm": 0.14406104385852814, "kl": 0.0731201171875, "learning_rate": 2.5711830314182996e-06, "loss": 0.0007316842675209045, "memory(GiB)": 38.0, "reward": 0.7089905142784119, "reward_std": 0.10141284018754959, "rewards/VisualizationJSONCombinedORM/mean": 0.7089905142784119, "rewards/VisualizationJSONCombinedORM/std": 0.09894060343503952, "step": 2522, "train_speed(iter/s)": 0.645376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 281.5, "completions/min_length": 210.0, "epoch": 2.086848635235732, "grad_norm": 0.19910846650600433, "kl": 0.04803466796875, "learning_rate": 2.5669776071657194e-06, "loss": 0.0004800930619239807, "memory(GiB)": 38.0, "reward": 0.26263171434402466, "reward_std": 0.023859679698944092, "rewards/VisualizationJSONCombinedORM/mean": 0.26263171434402466, "rewards/VisualizationJSONCombinedORM/std": 0.024871427565813065, "step": 2523, "train_speed(iter/s)": 0.641305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 281.375, "completions/min_length": 225.0, "epoch": 2.08767576509512, "grad_norm": 0.16267472505569458, "kl": 0.107666015625, "learning_rate": 2.562774436869573e-06, "loss": 0.0010769069194793701, "memory(GiB)": 38.0, "reward": 0.5423940420150757, "reward_std": 0.07361000776290894, "rewards/VisualizationJSONCombinedORM/mean": 0.5423940420150757, "rewards/VisualizationJSONCombinedORM/std": 0.097374826669693, "step": 2524, "train_speed(iter/s)": 0.637137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 302.75, "completions/min_length": 250.0, "epoch": 2.088502894954508, "grad_norm": 0.17601537704467773, "kl": 0.0765380859375, "learning_rate": 2.5585735244236897e-06, "loss": 0.0007665110751986504, "memory(GiB)": 38.0, "reward": 0.351628839969635, "reward_std": 0.03966802358627319, "rewards/VisualizationJSONCombinedORM/mean": 0.351628839969635, "rewards/VisualizationJSONCombinedORM/std": 0.04719458892941475, "step": 2525, "train_speed(iter/s)": 0.633962 }, { "epoch": 2.088502894954508, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 336.625, "eval_completions/mean_length": 281.4114583333333, "eval_completions/min_length": 240.83333333333334, "eval_kl": 0.08842976888020833, "eval_loss": 0.0008825535769574344, "eval_reward": 0.493216410279274, "eval_reward_std": 0.06812923409355183, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.493216410279274, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06812923455921312, "eval_runtime": 293.2816, "eval_samples_per_second": 0.082, "eval_steps_per_second": 0.01, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 303.5, "completions/min_length": 248.0, "epoch": 2.0893300248138957, "grad_norm": 0.17904040217399597, "kl": 0.0994873046875, "learning_rate": 2.5543748737197953e-06, "loss": 0.0009970925748348236, "memory(GiB)": 38.0, "reward": 0.566288411617279, "reward_std": 0.06583058089017868, "rewards/VisualizationJSONCombinedORM/mean": 0.566288411617279, "rewards/VisualizationJSONCombinedORM/std": 0.13888077437877655, "step": 2526, "train_speed(iter/s)": 0.587279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/mean_length": 274.625, "completions/min_length": 241.0, "epoch": 2.0901571546732836, "grad_norm": 0.22522929310798645, "kl": 0.0838623046875, "learning_rate": 2.550178488647536e-06, "loss": 0.0008380934596061707, "memory(GiB)": 38.0, "reward": 0.49167734384536743, "reward_std": 0.036978818476200104, "rewards/VisualizationJSONCombinedORM/mean": 0.49167734384536743, "rewards/VisualizationJSONCombinedORM/std": 0.04747578501701355, "step": 2527, "train_speed(iter/s)": 0.584714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 311.0625, "completions/min_length": 236.0, "epoch": 2.090984284532672, "grad_norm": 0.17697259783744812, "kl": 0.0887451171875, "learning_rate": 2.545984373094445e-06, "loss": 0.0008862372487783432, "memory(GiB)": 38.0, "reward": 0.7238935232162476, "reward_std": 0.06144919991493225, "rewards/VisualizationJSONCombinedORM/mean": 0.7238935232162476, "rewards/VisualizationJSONCombinedORM/std": 0.11737486720085144, "step": 2528, "train_speed(iter/s)": 0.580551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 278.6875, "completions/min_length": 228.0, "epoch": 2.0918114143920596, "grad_norm": 0.18637479841709137, "kl": 0.1187744140625, "learning_rate": 2.5417925309459623e-06, "loss": 0.001188855618238449, "memory(GiB)": 38.0, "reward": 0.43724188208580017, "reward_std": 0.05004468560218811, "rewards/VisualizationJSONCombinedORM/mean": 0.43724188208580017, "rewards/VisualizationJSONCombinedORM/std": 0.0963042601943016, "step": 2529, "train_speed(iter/s)": 0.577631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 266.75, "completions/min_length": 215.0, "epoch": 2.0926385442514475, "grad_norm": 0.1532078981399536, "kl": 0.08111572265625, "learning_rate": 2.537602966085414e-06, "loss": 0.0008113235235214233, "memory(GiB)": 38.0, "reward": 0.4801670014858246, "reward_std": 0.0580873042345047, "rewards/VisualizationJSONCombinedORM/mean": 0.4801670014858246, "rewards/VisualizationJSONCombinedORM/std": 0.1889858990907669, "step": 2530, "train_speed(iter/s)": 0.574477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 260.3125, "completions/min_length": 214.0, "epoch": 2.0934656741108353, "grad_norm": 0.2475406974554062, "kl": 0.07940673828125, "learning_rate": 2.5334156823940237e-06, "loss": 0.0007930230349302292, "memory(GiB)": 38.0, "reward": 0.5722005367279053, "reward_std": 0.08006957173347473, "rewards/VisualizationJSONCombinedORM/mean": 0.5722005367279053, "rewards/VisualizationJSONCombinedORM/std": 0.17074428498744965, "step": 2531, "train_speed(iter/s)": 0.571812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 289.6875, "completions/min_length": 236.0, "epoch": 2.094292803970223, "grad_norm": 0.18661601841449738, "kl": 0.117919921875, "learning_rate": 2.529230683750897e-06, "loss": 0.001179676502943039, "memory(GiB)": 38.0, "reward": 0.6176410913467407, "reward_std": 0.0681060403585434, "rewards/VisualizationJSONCombinedORM/mean": 0.6176410913467407, "rewards/VisualizationJSONCombinedORM/std": 0.10392811894416809, "step": 2532, "train_speed(iter/s)": 0.568095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 305.6875, "completions/min_length": 217.0, "epoch": 2.0951199338296114, "grad_norm": 0.1434396356344223, "kl": 0.07763671875, "learning_rate": 2.5250479740330285e-06, "loss": 0.0007750242948532104, "memory(GiB)": 38.0, "reward": 0.6896323561668396, "reward_std": 0.06241718679666519, "rewards/VisualizationJSONCombinedORM/mean": 0.6896323561668396, "rewards/VisualizationJSONCombinedORM/std": 0.08743416517972946, "step": 2533, "train_speed(iter/s)": 0.565384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 306.875, "completions/min_length": 235.0, "epoch": 2.095947063688999, "grad_norm": 0.16133974492549896, "kl": 0.048828125, "learning_rate": 2.520867557115283e-06, "loss": 0.0004887059330940247, "memory(GiB)": 38.0, "reward": 0.5685659646987915, "reward_std": 0.06764184683561325, "rewards/VisualizationJSONCombinedORM/mean": 0.5685659646987915, "rewards/VisualizationJSONCombinedORM/std": 0.20776543021202087, "step": 2534, "train_speed(iter/s)": 0.562146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 299.6875, "completions/min_length": 224.0, "epoch": 2.096774193548387, "grad_norm": 0.18545982241630554, "kl": 0.08306884765625, "learning_rate": 2.51668943687041e-06, "loss": 0.0008312761783599854, "memory(GiB)": 38.0, "reward": 0.6410394906997681, "reward_std": 0.05177656188607216, "rewards/VisualizationJSONCombinedORM/mean": 0.6410394906997681, "rewards/VisualizationJSONCombinedORM/std": 0.07377871125936508, "step": 2535, "train_speed(iter/s)": 0.559241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 294.875, "completions/min_length": 245.0, "epoch": 2.097601323407775, "grad_norm": 0.1745690554380417, "kl": 0.1201171875, "learning_rate": 2.5125136171690252e-06, "loss": 0.0011996999382972717, "memory(GiB)": 38.0, "reward": 0.6053042411804199, "reward_std": 0.05215968191623688, "rewards/VisualizationJSONCombinedORM/mean": 0.6053042411804199, "rewards/VisualizationJSONCombinedORM/std": 0.07916217297315598, "step": 2536, "train_speed(iter/s)": 0.556268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 322.375, "completions/min_length": 217.0, "epoch": 2.098428453267163, "grad_norm": 0.1795872002840042, "kl": 0.03680419921875, "learning_rate": 2.508340101879616e-06, "loss": 0.0003674626350402832, "memory(GiB)": 38.0, "reward": 0.43574410676956177, "reward_std": 0.03621358796954155, "rewards/VisualizationJSONCombinedORM/mean": 0.43574410676956177, "rewards/VisualizationJSONCombinedORM/std": 0.07279890775680542, "step": 2537, "train_speed(iter/s)": 0.552945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 302.0625, "completions/min_length": 238.0, "epoch": 2.099255583126551, "grad_norm": 0.19835853576660156, "kl": 0.080810546875, "learning_rate": 2.5041688948685367e-06, "loss": 0.0008069798350334167, "memory(GiB)": 38.0, "reward": 0.5272699594497681, "reward_std": 0.09320996701717377, "rewards/VisualizationJSONCombinedORM/mean": 0.5272699594497681, "rewards/VisualizationJSONCombinedORM/std": 0.11502581089735031, "step": 2538, "train_speed(iter/s)": 0.549724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 267.9375, "completions/min_length": 202.0, "epoch": 2.1000827129859387, "grad_norm": 0.19228336215019226, "kl": 0.105224609375, "learning_rate": 2.5000000000000015e-06, "loss": 0.001051831990480423, "memory(GiB)": 38.0, "reward": 0.5625226497650146, "reward_std": 0.07444331794977188, "rewards/VisualizationJSONCombinedORM/mean": 0.5625226497650146, "rewards/VisualizationJSONCombinedORM/std": 0.21940863132476807, "step": 2539, "train_speed(iter/s)": 0.546821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 291.375, "completions/min_length": 204.0, "epoch": 2.1009098428453266, "grad_norm": 0.1712493896484375, "kl": 0.060791015625, "learning_rate": 2.4958334211360787e-06, "loss": 0.0006076470017433167, "memory(GiB)": 38.0, "reward": 0.4514334201812744, "reward_std": 0.0754259005188942, "rewards/VisualizationJSONCombinedORM/mean": 0.4514334201812744, "rewards/VisualizationJSONCombinedORM/std": 0.07875258475542068, "step": 2540, "train_speed(iter/s)": 0.543981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 290.125, "completions/min_length": 233.0, "epoch": 2.101736972704715, "grad_norm": 0.2047155350446701, "kl": 0.121826171875, "learning_rate": 2.4916691621366984e-06, "loss": 0.0012175887823104858, "memory(GiB)": 38.0, "reward": 0.4044429063796997, "reward_std": 0.07223644852638245, "rewards/VisualizationJSONCombinedORM/mean": 0.4044429063796997, "rewards/VisualizationJSONCombinedORM/std": 0.0962083637714386, "step": 2541, "train_speed(iter/s)": 0.541208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 276.1875, "completions/min_length": 209.0, "epoch": 2.1025641025641026, "grad_norm": 0.1932901293039322, "kl": 0.058837890625, "learning_rate": 2.487507226859633e-06, "loss": 0.0005885325372219086, "memory(GiB)": 38.0, "reward": 0.6161108016967773, "reward_std": 0.10123217105865479, "rewards/VisualizationJSONCombinedORM/mean": 0.6161108016967773, "rewards/VisualizationJSONCombinedORM/std": 0.10465583950281143, "step": 2542, "train_speed(iter/s)": 0.538243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 287.875, "completions/min_length": 217.0, "epoch": 2.1033912324234905, "grad_norm": 0.20365077257156372, "kl": 0.04498291015625, "learning_rate": 2.4833476191605136e-06, "loss": 0.0004491470754146576, "memory(GiB)": 38.0, "reward": 0.6049227714538574, "reward_std": 0.07720542699098587, "rewards/VisualizationJSONCombinedORM/mean": 0.6049227714538574, "rewards/VisualizationJSONCombinedORM/std": 0.15931107103824615, "step": 2543, "train_speed(iter/s)": 0.535447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 255.4375, "completions/min_length": 201.0, "epoch": 2.1042183622828783, "grad_norm": 0.1732095181941986, "kl": 0.08843994140625, "learning_rate": 2.479190342892804e-06, "loss": 0.0008842684328556061, "memory(GiB)": 38.0, "reward": 0.22245420515537262, "reward_std": 0.0168234184384346, "rewards/VisualizationJSONCombinedORM/mean": 0.22245420515537262, "rewards/VisualizationJSONCombinedORM/std": 0.020266588777303696, "step": 2544, "train_speed(iter/s)": 0.533018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 300.75, "completions/min_length": 235.0, "epoch": 2.1050454921422665, "grad_norm": 0.2641850709915161, "kl": 0.1007080078125, "learning_rate": 2.4750354019078148e-06, "loss": 0.0010069161653518677, "memory(GiB)": 38.0, "reward": 0.6280332803726196, "reward_std": 0.10620563477277756, "rewards/VisualizationJSONCombinedORM/mean": 0.6280332803726196, "rewards/VisualizationJSONCombinedORM/std": 0.12566706538200378, "step": 2545, "train_speed(iter/s)": 0.530413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 329.125, "completions/min_length": 255.0, "epoch": 2.1058726220016544, "grad_norm": 0.16365660727024078, "kl": 0.0687255859375, "learning_rate": 2.470882800054688e-06, "loss": 0.0006875339895486832, "memory(GiB)": 38.0, "reward": 0.5305605530738831, "reward_std": 0.06523977965116501, "rewards/VisualizationJSONCombinedORM/mean": 0.5305605530738831, "rewards/VisualizationJSONCombinedORM/std": 0.2589872181415558, "step": 2546, "train_speed(iter/s)": 0.527631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 289.6875, "completions/min_length": 227.0, "epoch": 2.106699751861042, "grad_norm": 0.1928606778383255, "kl": 0.05328369140625, "learning_rate": 2.466732541180404e-06, "loss": 0.0005326420068740845, "memory(GiB)": 38.0, "reward": 0.7373731136322021, "reward_std": 0.07428312301635742, "rewards/VisualizationJSONCombinedORM/mean": 0.7373731136322021, "rewards/VisualizationJSONCombinedORM/std": 0.07663775980472565, "step": 2547, "train_speed(iter/s)": 0.525247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 282.0, "completions/min_length": 238.0, "epoch": 2.10752688172043, "grad_norm": 0.1756397783756256, "kl": 0.11767578125, "learning_rate": 2.4625846291297697e-06, "loss": 0.0011760517954826355, "memory(GiB)": 38.0, "reward": 0.4503840506076813, "reward_std": 0.06992705166339874, "rewards/VisualizationJSONCombinedORM/mean": 0.4503840506076813, "rewards/VisualizationJSONCombinedORM/std": 0.23322317004203796, "step": 2548, "train_speed(iter/s)": 0.522807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 247.5, "completions/min_length": 218.0, "epoch": 2.108354011579818, "grad_norm": 0.15114398300647736, "kl": 0.04742431640625, "learning_rate": 2.45843906774542e-06, "loss": 0.00047400034964084625, "memory(GiB)": 38.0, "reward": 0.6197110414505005, "reward_std": 0.0769898071885109, "rewards/VisualizationJSONCombinedORM/mean": 0.6197110414505005, "rewards/VisualizationJSONCombinedORM/std": 0.1443634331226349, "step": 2549, "train_speed(iter/s)": 0.520771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 282.5625, "completions/min_length": 244.0, "epoch": 2.109181141439206, "grad_norm": 0.18836994469165802, "kl": 0.0576171875, "learning_rate": 2.4542958608678075e-06, "loss": 0.0005747862160205841, "memory(GiB)": 38.0, "reward": 0.6926851272583008, "reward_std": 0.07207176089286804, "rewards/VisualizationJSONCombinedORM/mean": 0.6926851272583008, "rewards/VisualizationJSONCombinedORM/std": 0.09069214016199112, "step": 2550, "train_speed(iter/s)": 0.518116 }, { "epoch": 2.109181141439206, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 342.4583333333333, "eval_completions/mean_length": 291.0989583333333, "eval_completions/min_length": 247.875, "eval_kl": 0.09013875325520833, "eval_loss": 0.0009037765557877719, "eval_reward": 0.4931559811035792, "eval_reward_std": 0.05964255193248391, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4931559811035792, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05964255317424735, "eval_runtime": 296.9915, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 272.125, "completions/min_length": 219.0, "epoch": 2.110008271298594, "grad_norm": 0.23630864918231964, "kl": 0.0963134765625, "learning_rate": 2.4501550123352105e-06, "loss": 0.0009647123515605927, "memory(GiB)": 38.0, "reward": 0.38821953535079956, "reward_std": 0.07066862285137177, "rewards/VisualizationJSONCombinedORM/mean": 0.38821953535079956, "rewards/VisualizationJSONCombinedORM/std": 0.08271060138940811, "step": 2551, "train_speed(iter/s)": 0.486343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 281.8125, "completions/min_length": 245.0, "epoch": 2.1108354011579817, "grad_norm": 0.23081059753894806, "kl": 0.049072265625, "learning_rate": 2.4460165259837145e-06, "loss": 0.0004897415637969971, "memory(GiB)": 38.0, "reward": 0.4516327679157257, "reward_std": 0.07107985019683838, "rewards/VisualizationJSONCombinedORM/mean": 0.4516327679157257, "rewards/VisualizationJSONCombinedORM/std": 0.07032548636198044, "step": 2552, "train_speed(iter/s)": 0.48381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 281.875, "completions/min_length": 213.0, "epoch": 2.1116625310173696, "grad_norm": 0.19306278228759766, "kl": 0.06256103515625, "learning_rate": 2.4418804056472228e-06, "loss": 0.0006261710077524185, "memory(GiB)": 38.0, "reward": 0.2942126989364624, "reward_std": 0.03548435866832733, "rewards/VisualizationJSONCombinedORM/mean": 0.2942126989364624, "rewards/VisualizationJSONCombinedORM/std": 0.0406920462846756, "step": 2553, "train_speed(iter/s)": 0.481977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 283.4375, "completions/min_length": 214.0, "epoch": 2.112489660876758, "grad_norm": 0.18689413368701935, "kl": 0.072265625, "learning_rate": 2.437746655157446e-06, "loss": 0.0007245056331157684, "memory(GiB)": 38.0, "reward": 0.561861515045166, "reward_std": 0.0886363536119461, "rewards/VisualizationJSONCombinedORM/mean": 0.561861515045166, "rewards/VisualizationJSONCombinedORM/std": 0.08886658400297165, "step": 2554, "train_speed(iter/s)": 0.479726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 276.625, "completions/min_length": 235.0, "epoch": 2.1133167907361456, "grad_norm": 0.17290551960468292, "kl": 0.0947265625, "learning_rate": 2.4336152783438984e-06, "loss": 0.0009480342268943787, "memory(GiB)": 38.0, "reward": 0.7408460974693298, "reward_std": 0.08941414207220078, "rewards/VisualizationJSONCombinedORM/mean": 0.7408460974693298, "rewards/VisualizationJSONCombinedORM/std": 0.09378162771463394, "step": 2555, "train_speed(iter/s)": 0.477822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 316.1875, "completions/min_length": 244.0, "epoch": 2.1141439205955335, "grad_norm": 0.17220187187194824, "kl": 0.06597900390625, "learning_rate": 2.429486279033892e-06, "loss": 0.000659458339214325, "memory(GiB)": 38.0, "reward": 0.4221838712692261, "reward_std": 0.04073728993535042, "rewards/VisualizationJSONCombinedORM/mean": 0.4221838712692261, "rewards/VisualizationJSONCombinedORM/std": 0.06276810169219971, "step": 2556, "train_speed(iter/s)": 0.475452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 304.1875, "completions/min_length": 240.0, "epoch": 2.1149710504549213, "grad_norm": 0.18166828155517578, "kl": 0.06573486328125, "learning_rate": 2.425359661052542e-06, "loss": 0.0006574317812919617, "memory(GiB)": 38.0, "reward": 0.39270713925361633, "reward_std": 0.05067872256040573, "rewards/VisualizationJSONCombinedORM/mean": 0.39270713925361633, "rewards/VisualizationJSONCombinedORM/std": 0.07016570121049881, "step": 2557, "train_speed(iter/s)": 0.473525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 264.875, "completions/min_length": 219.0, "epoch": 2.1157981803143096, "grad_norm": 0.1913871020078659, "kl": 0.085205078125, "learning_rate": 2.4212354282227557e-06, "loss": 0.0008520260453224182, "memory(GiB)": 38.0, "reward": 0.413845956325531, "reward_std": 0.04194428026676178, "rewards/VisualizationJSONCombinedORM/mean": 0.413845956325531, "rewards/VisualizationJSONCombinedORM/std": 0.12490788102149963, "step": 2558, "train_speed(iter/s)": 0.471413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 285.5625, "completions/min_length": 218.0, "epoch": 2.1166253101736974, "grad_norm": 0.18438026309013367, "kl": 0.05780029296875, "learning_rate": 2.4171135843652256e-06, "loss": 0.0005779527127742767, "memory(GiB)": 38.0, "reward": 0.4468180537223816, "reward_std": 0.03995800018310547, "rewards/VisualizationJSONCombinedORM/mean": 0.4468180537223816, "rewards/VisualizationJSONCombinedORM/std": 0.0638439953327179, "step": 2559, "train_speed(iter/s)": 0.469376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 279.3125, "completions/min_length": 225.0, "epoch": 2.117452440033085, "grad_norm": 0.20007169246673584, "kl": 0.06451416015625, "learning_rate": 2.4129941332984413e-06, "loss": 0.0006482265889644623, "memory(GiB)": 38.0, "reward": 0.6581432819366455, "reward_std": 0.05623140186071396, "rewards/VisualizationJSONCombinedORM/mean": 0.6581432819366455, "rewards/VisualizationJSONCombinedORM/std": 0.16742321848869324, "step": 2560, "train_speed(iter/s)": 0.467148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 270.9375, "completions/min_length": 213.0, "epoch": 2.118279569892473, "grad_norm": 0.16913717985153198, "kl": 0.08807373046875, "learning_rate": 2.4088770788386655e-06, "loss": 0.0008808821439743042, "memory(GiB)": 38.0, "reward": 0.5276920199394226, "reward_std": 0.0874376893043518, "rewards/VisualizationJSONCombinedORM/mean": 0.5276920199394226, "rewards/VisualizationJSONCombinedORM/std": 0.22209963202476501, "step": 2561, "train_speed(iter/s)": 0.464824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 312.4375, "completions/min_length": 264.0, "epoch": 2.119106699751861, "grad_norm": 0.17075392603874207, "kl": 0.04901123046875, "learning_rate": 2.4047624247999484e-06, "loss": 0.0004902184009552002, "memory(GiB)": 38.0, "reward": 0.6160315871238708, "reward_std": 0.08314007520675659, "rewards/VisualizationJSONCombinedORM/mean": 0.6160315871238708, "rewards/VisualizationJSONCombinedORM/std": 0.1286940723657608, "step": 2562, "train_speed(iter/s)": 0.46255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 281.5625, "completions/min_length": 230.0, "epoch": 2.119933829611249, "grad_norm": 0.18548370897769928, "kl": 0.07672119140625, "learning_rate": 2.4006501749941097e-06, "loss": 0.0007686018943786621, "memory(GiB)": 38.0, "reward": 0.6468897461891174, "reward_std": 0.06586173176765442, "rewards/VisualizationJSONCombinedORM/mean": 0.6468897461891174, "rewards/VisualizationJSONCombinedORM/std": 0.13456782698631287, "step": 2563, "train_speed(iter/s)": 0.460594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 293.4375, "completions/min_length": 230.0, "epoch": 2.120760959470637, "grad_norm": 0.15567275881767273, "kl": 0.03399658203125, "learning_rate": 2.396540333230747e-06, "loss": 0.0003397241234779358, "memory(GiB)": 38.0, "reward": 0.47394710779190063, "reward_std": 0.044888935983181, "rewards/VisualizationJSONCombinedORM/mean": 0.47394710779190063, "rewards/VisualizationJSONCombinedORM/std": 0.19610638916492462, "step": 2564, "train_speed(iter/s)": 0.458754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 287.6875, "completions/min_length": 208.0, "epoch": 2.1215880893300247, "grad_norm": 0.2137443721294403, "kl": 0.072998046875, "learning_rate": 2.3924329033172246e-06, "loss": 0.0007302239537239075, "memory(GiB)": 38.0, "reward": 0.35366225242614746, "reward_std": 0.043524257838726044, "rewards/VisualizationJSONCombinedORM/mean": 0.35366225242614746, "rewards/VisualizationJSONCombinedORM/std": 0.042456962168216705, "step": 2565, "train_speed(iter/s)": 0.456597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 259.125, "completions/min_length": 228.0, "epoch": 2.1224152191894126, "grad_norm": 0.199782595038414, "kl": 0.057861328125, "learning_rate": 2.388327889058676e-06, "loss": 0.0005789250135421753, "memory(GiB)": 38.0, "reward": 0.6860760450363159, "reward_std": 0.06557673960924149, "rewards/VisualizationJSONCombinedORM/mean": 0.6860760450363159, "rewards/VisualizationJSONCombinedORM/std": 0.11961326003074646, "step": 2566, "train_speed(iter/s)": 0.454571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 294.6875, "completions/min_length": 231.0, "epoch": 2.123242349048801, "grad_norm": 0.15782934427261353, "kl": 0.0767822265625, "learning_rate": 2.384225294257989e-06, "loss": 0.0007679425179958344, "memory(GiB)": 38.0, "reward": 0.35912495851516724, "reward_std": 0.044047094881534576, "rewards/VisualizationJSONCombinedORM/mean": 0.35912495851516724, "rewards/VisualizationJSONCombinedORM/std": 0.043219804763793945, "step": 2567, "train_speed(iter/s)": 0.452576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 290.6875, "completions/min_length": 237.0, "epoch": 2.1240694789081886, "grad_norm": 0.1727588176727295, "kl": 0.0523681640625, "learning_rate": 2.38012512271582e-06, "loss": 0.0005238652229309082, "memory(GiB)": 38.0, "reward": 0.6192706227302551, "reward_std": 0.05134684592485428, "rewards/VisualizationJSONCombinedORM/mean": 0.6192706227302551, "rewards/VisualizationJSONCombinedORM/std": 0.12819264829158783, "step": 2568, "train_speed(iter/s)": 0.450782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 279.5625, "completions/min_length": 231.0, "epoch": 2.1248966087675765, "grad_norm": 0.19692401587963104, "kl": 0.072021484375, "learning_rate": 2.3760273782305715e-06, "loss": 0.0007219240069389343, "memory(GiB)": 38.0, "reward": 0.5533905625343323, "reward_std": 0.048125214874744415, "rewards/VisualizationJSONCombinedORM/mean": 0.5533905625343323, "rewards/VisualizationJSONCombinedORM/std": 0.11170632392168045, "step": 2569, "train_speed(iter/s)": 0.449215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 273.1875, "completions/min_length": 196.0, "epoch": 2.1257237386269643, "grad_norm": 0.16608214378356934, "kl": 0.05340576171875, "learning_rate": 2.371932064598403e-06, "loss": 0.0005327798426151276, "memory(GiB)": 38.0, "reward": 0.5319744944572449, "reward_std": 0.0666263997554779, "rewards/VisualizationJSONCombinedORM/mean": 0.5319744944572449, "rewards/VisualizationJSONCombinedORM/std": 0.1100136786699295, "step": 2570, "train_speed(iter/s)": 0.447773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 305.1875, "completions/min_length": 250.0, "epoch": 2.1265508684863526, "grad_norm": 0.20444531738758087, "kl": 0.05712890625, "learning_rate": 2.3678391856132203e-06, "loss": 0.0005707181990146637, "memory(GiB)": 38.0, "reward": 0.5925672054290771, "reward_std": 0.07131384313106537, "rewards/VisualizationJSONCombinedORM/mean": 0.5925672054290771, "rewards/VisualizationJSONCombinedORM/std": 0.14838907122612, "step": 2571, "train_speed(iter/s)": 0.445791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 252.25, "completions/min_length": 234.0, "epoch": 2.1273779983457404, "grad_norm": 0.1893225908279419, "kl": 0.08197021484375, "learning_rate": 2.363748745066677e-06, "loss": 0.0008198432624340057, "memory(GiB)": 38.0, "reward": 0.7976911664009094, "reward_std": 0.08940894901752472, "rewards/VisualizationJSONCombinedORM/mean": 0.7976911664009094, "rewards/VisualizationJSONCombinedORM/std": 0.14352233707904816, "step": 2572, "train_speed(iter/s)": 0.443846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 279.9375, "completions/min_length": 230.0, "epoch": 2.128205128205128, "grad_norm": 0.2103755921125412, "kl": 0.05487060546875, "learning_rate": 2.3596607467481602e-06, "loss": 0.0005481205880641937, "memory(GiB)": 38.0, "reward": 0.501952052116394, "reward_std": 0.0713954046368599, "rewards/VisualizationJSONCombinedORM/mean": 0.501952052116394, "rewards/VisualizationJSONCombinedORM/std": 0.0695662796497345, "step": 2573, "train_speed(iter/s)": 0.441993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 300.375, "completions/min_length": 235.0, "epoch": 2.129032258064516, "grad_norm": 0.16583503782749176, "kl": 0.0589599609375, "learning_rate": 2.3555751944448036e-06, "loss": 0.0005902647972106934, "memory(GiB)": 38.0, "reward": 0.6251684427261353, "reward_std": 0.054628193378448486, "rewards/VisualizationJSONCombinedORM/mean": 0.6251684427261353, "rewards/VisualizationJSONCombinedORM/std": 0.18058554828166962, "step": 2574, "train_speed(iter/s)": 0.43997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 313.3125, "completions/min_length": 254.0, "epoch": 2.1298593879239043, "grad_norm": 0.1824028342962265, "kl": 0.0670166015625, "learning_rate": 2.3514920919414636e-06, "loss": 0.000669572502374649, "memory(GiB)": 38.0, "reward": 0.39275234937667847, "reward_std": 0.0381874144077301, "rewards/VisualizationJSONCombinedORM/mean": 0.39275234937667847, "rewards/VisualizationJSONCombinedORM/std": 0.039983537048101425, "step": 2575, "train_speed(iter/s)": 0.438018 }, { "epoch": 2.1298593879239043, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 333.9583333333333, "eval_completions/mean_length": 283.3020833333333, "eval_completions/min_length": 243.79166666666666, "eval_kl": 0.067718505859375, "eval_loss": 0.0006782834534533322, "eval_reward": 0.460304011280338, "eval_reward_std": 0.05894885813662162, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.460304011280338, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05894886205593745, "eval_runtime": 291.6909, "eval_samples_per_second": 0.082, "eval_steps_per_second": 0.01, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 269.875, "completions/min_length": 237.0, "epoch": 2.130686517783292, "grad_norm": 0.1548076719045639, "kl": 0.0753173828125, "learning_rate": 2.3474114430207416e-06, "loss": 0.0007528513669967651, "memory(GiB)": 38.0, "reward": 0.775632381439209, "reward_std": 0.03378059342503548, "rewards/VisualizationJSONCombinedORM/mean": 0.775632381439209, "rewards/VisualizationJSONCombinedORM/std": 0.04417070001363754, "step": 2576, "train_speed(iter/s)": 0.415792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 276.75, "completions/min_length": 224.0, "epoch": 2.13151364764268, "grad_norm": 0.20150209963321686, "kl": 0.055419921875, "learning_rate": 2.343333251462954e-06, "loss": 0.000553864985704422, "memory(GiB)": 38.05, "reward": 0.47263121604919434, "reward_std": 0.0630616769194603, "rewards/VisualizationJSONCombinedORM/mean": 0.47263121604919434, "rewards/VisualizationJSONCombinedORM/std": 0.07246614247560501, "step": 2577, "train_speed(iter/s)": 0.413832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 288.1875, "completions/min_length": 229.0, "epoch": 2.1323407775020677, "grad_norm": 0.19107185304164886, "kl": 0.1103515625, "learning_rate": 2.339257521046148e-06, "loss": 0.00110568106174469, "memory(GiB)": 38.05, "reward": 0.446567177772522, "reward_std": 0.04985225573182106, "rewards/VisualizationJSONCombinedORM/mean": 0.446567177772522, "rewards/VisualizationJSONCombinedORM/std": 0.12881720066070557, "step": 2578, "train_speed(iter/s)": 0.412371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 292.25, "completions/min_length": 236.0, "epoch": 2.1331679073614556, "grad_norm": 0.1656467616558075, "kl": 0.1380615234375, "learning_rate": 2.335184255546083e-06, "loss": 0.0013798139989376068, "memory(GiB)": 38.05, "reward": 0.5386258363723755, "reward_std": 0.07712920010089874, "rewards/VisualizationJSONCombinedORM/mean": 0.5386258363723755, "rewards/VisualizationJSONCombinedORM/std": 0.11906305700540543, "step": 2579, "train_speed(iter/s)": 0.411222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 268.25, "completions/min_length": 221.0, "epoch": 2.133995037220844, "grad_norm": 0.17260169982910156, "kl": 0.0611572265625, "learning_rate": 2.3311134587362426e-06, "loss": 0.0006114691495895386, "memory(GiB)": 38.05, "reward": 0.7166321873664856, "reward_std": 0.11150901019573212, "rewards/VisualizationJSONCombinedORM/mean": 0.7166321873664856, "rewards/VisualizationJSONCombinedORM/std": 0.12931546568870544, "step": 2580, "train_speed(iter/s)": 0.409723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 290.75, "completions/min_length": 225.0, "epoch": 2.1348221670802316, "grad_norm": 0.18343842029571533, "kl": 0.06793212890625, "learning_rate": 2.3270451343878208e-06, "loss": 0.0006780102849006653, "memory(GiB)": 38.05, "reward": 0.5032796263694763, "reward_std": 0.1646805703639984, "rewards/VisualizationJSONCombinedORM/mean": 0.5032796263694763, "rewards/VisualizationJSONCombinedORM/std": 0.24173088371753693, "step": 2581, "train_speed(iter/s)": 0.407843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 303.625, "completions/min_length": 220.0, "epoch": 2.1356492969396195, "grad_norm": 0.17667196691036224, "kl": 0.06488037109375, "learning_rate": 2.3229792862697216e-06, "loss": 0.0006481148302555084, "memory(GiB)": 38.05, "reward": 0.19525665044784546, "reward_std": 0.06702342629432678, "rewards/VisualizationJSONCombinedORM/mean": 0.19525665044784546, "rewards/VisualizationJSONCombinedORM/std": 0.07963192462921143, "step": 2582, "train_speed(iter/s)": 0.405922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 290.9375, "completions/min_length": 241.0, "epoch": 2.1364764267990073, "grad_norm": 0.22935034334659576, "kl": 0.0550537109375, "learning_rate": 2.3189159181485517e-06, "loss": 0.0005500093102455139, "memory(GiB)": 38.05, "reward": 0.32913458347320557, "reward_std": 0.05351431667804718, "rewards/VisualizationJSONCombinedORM/mean": 0.32913458347320557, "rewards/VisualizationJSONCombinedORM/std": 0.05631623789668083, "step": 2583, "train_speed(iter/s)": 0.404529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 302.5, "completions/min_length": 262.0, "epoch": 2.1373035566583956, "grad_norm": 0.17910930514335632, "kl": 0.04901123046875, "learning_rate": 2.314855033788625e-06, "loss": 0.0004894733428955078, "memory(GiB)": 38.05, "reward": 0.5470166802406311, "reward_std": 0.05277526006102562, "rewards/VisualizationJSONCombinedORM/mean": 0.5470166802406311, "rewards/VisualizationJSONCombinedORM/std": 0.16663846373558044, "step": 2584, "train_speed(iter/s)": 0.402955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 286.6875, "completions/min_length": 226.0, "epoch": 2.1381306865177834, "grad_norm": 0.17137634754180908, "kl": 0.0546875, "learning_rate": 2.3107966369519503e-06, "loss": 0.000547558069229126, "memory(GiB)": 38.05, "reward": 0.5267512202262878, "reward_std": 0.0784614086151123, "rewards/VisualizationJSONCombinedORM/mean": 0.5267512202262878, "rewards/VisualizationJSONCombinedORM/std": 0.0896039679646492, "step": 2585, "train_speed(iter/s)": 0.401785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 271.75, "completions/min_length": 227.0, "epoch": 2.138957816377171, "grad_norm": 0.2058236002922058, "kl": 0.091552734375, "learning_rate": 2.306740731398234e-06, "loss": 0.0009158439934253693, "memory(GiB)": 38.05, "reward": 0.6191215515136719, "reward_std": 0.06813380867242813, "rewards/VisualizationJSONCombinedORM/mean": 0.6191215515136719, "rewards/VisualizationJSONCombinedORM/std": 0.1276264637708664, "step": 2586, "train_speed(iter/s)": 0.400686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 265.5, "completions/min_length": 219.0, "epoch": 2.139784946236559, "grad_norm": 0.21904875338077545, "kl": 0.08966064453125, "learning_rate": 2.302687320884876e-06, "loss": 0.0008960440754890442, "memory(GiB)": 38.05, "reward": 0.36756646633148193, "reward_std": 0.049473486840724945, "rewards/VisualizationJSONCombinedORM/mean": 0.36756646633148193, "rewards/VisualizationJSONCombinedORM/std": 0.17235662043094635, "step": 2587, "train_speed(iter/s)": 0.399267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 296.375, "completions/min_length": 233.0, "epoch": 2.1406120760959473, "grad_norm": 0.23575745522975922, "kl": 0.058837890625, "learning_rate": 2.2986364091669643e-06, "loss": 0.0005891937762498856, "memory(GiB)": 38.05, "reward": 0.7482229471206665, "reward_std": 0.10039794445037842, "rewards/VisualizationJSONCombinedORM/mean": 0.7482229471206665, "rewards/VisualizationJSONCombinedORM/std": 0.11853454262018204, "step": 2588, "train_speed(iter/s)": 0.397867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 327.1875, "completions/min_length": 264.0, "epoch": 2.141439205955335, "grad_norm": 0.19893361628055573, "kl": 0.069580078125, "learning_rate": 2.2945879999972676e-06, "loss": 0.0006959438323974609, "memory(GiB)": 38.05, "reward": 0.6940032839775085, "reward_std": 0.08068342506885529, "rewards/VisualizationJSONCombinedORM/mean": 0.6940032839775085, "rewards/VisualizationJSONCombinedORM/std": 0.0986635610461235, "step": 2589, "train_speed(iter/s)": 0.396347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 268.5625, "completions/min_length": 224.0, "epoch": 2.142266335814723, "grad_norm": 0.21137428283691406, "kl": 0.085205078125, "learning_rate": 2.290542097126243e-06, "loss": 0.0008511319756507874, "memory(GiB)": 38.05, "reward": 0.7474029660224915, "reward_std": 0.09693840146064758, "rewards/VisualizationJSONCombinedORM/mean": 0.7474029660224915, "rewards/VisualizationJSONCombinedORM/std": 0.13956885039806366, "step": 2590, "train_speed(iter/s)": 0.394967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 307.6875, "completions/min_length": 236.0, "epoch": 2.1430934656741107, "grad_norm": 0.19783881306648254, "kl": 0.0599365234375, "learning_rate": 2.2864987043020176e-06, "loss": 0.0005988888442516327, "memory(GiB)": 38.05, "reward": 0.49545490741729736, "reward_std": 0.07509156316518784, "rewards/VisualizationJSONCombinedORM/mean": 0.49545490741729736, "rewards/VisualizationJSONCombinedORM/std": 0.15999659895896912, "step": 2591, "train_speed(iter/s)": 0.393624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 294.0625, "completions/min_length": 223.0, "epoch": 2.1439205955334986, "grad_norm": 0.18325506150722504, "kl": 0.1026611328125, "learning_rate": 2.2824578252704042e-06, "loss": 0.0010251030325889587, "memory(GiB)": 38.05, "reward": 0.39447587728500366, "reward_std": 0.051864661276340485, "rewards/VisualizationJSONCombinedORM/mean": 0.39447587728500366, "rewards/VisualizationJSONCombinedORM/std": 0.11424248665571213, "step": 2592, "train_speed(iter/s)": 0.392201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 286.3125, "completions/min_length": 235.0, "epoch": 2.144747725392887, "grad_norm": 0.21374063193798065, "kl": 0.1158447265625, "learning_rate": 2.2784194637748764e-06, "loss": 0.0011575594544410706, "memory(GiB)": 38.05, "reward": 0.3951992690563202, "reward_std": 0.06558883935213089, "rewards/VisualizationJSONCombinedORM/mean": 0.3951992690563202, "rewards/VisualizationJSONCombinedORM/std": 0.10883620381355286, "step": 2593, "train_speed(iter/s)": 0.390748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 294.625, "completions/min_length": 217.0, "epoch": 2.1455748552522746, "grad_norm": 0.15235435962677002, "kl": 0.0389404296875, "learning_rate": 2.2743836235565826e-06, "loss": 0.0003883875906467438, "memory(GiB)": 38.05, "reward": 0.6267224550247192, "reward_std": 0.05904744565486908, "rewards/VisualizationJSONCombinedORM/mean": 0.6267224550247192, "rewards/VisualizationJSONCombinedORM/std": 0.07097697257995605, "step": 2594, "train_speed(iter/s)": 0.389288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 273.625, "completions/min_length": 228.0, "epoch": 2.1464019851116625, "grad_norm": 0.15692463517189026, "kl": 0.07183837890625, "learning_rate": 2.2703503083543288e-06, "loss": 0.0007177442312240601, "memory(GiB)": 38.05, "reward": 0.46296441555023193, "reward_std": 0.0348757803440094, "rewards/VisualizationJSONCombinedORM/mean": 0.46296441555023193, "rewards/VisualizationJSONCombinedORM/std": 0.13267885148525238, "step": 2595, "train_speed(iter/s)": 0.387971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 257.6875, "completions/min_length": 214.0, "epoch": 2.1472291149710503, "grad_norm": 0.34022486209869385, "kl": 0.2769775390625, "learning_rate": 2.266319521904588e-06, "loss": 0.0027628540992736816, "memory(GiB)": 38.05, "reward": 0.4340253472328186, "reward_std": 0.04026012122631073, "rewards/VisualizationJSONCombinedORM/mean": 0.4340253472328186, "rewards/VisualizationJSONCombinedORM/std": 0.17267321050167084, "step": 2596, "train_speed(iter/s)": 0.386704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 307.4375, "completions/min_length": 237.0, "epoch": 2.1480562448304386, "grad_norm": 0.18803490698337555, "kl": 0.1063232421875, "learning_rate": 2.262291267941488e-06, "loss": 0.001059599220752716, "memory(GiB)": 38.05, "reward": 0.5442148447036743, "reward_std": 0.10970726609230042, "rewards/VisualizationJSONCombinedORM/mean": 0.5442148447036743, "rewards/VisualizationJSONCombinedORM/std": 0.18732261657714844, "step": 2597, "train_speed(iter/s)": 0.385676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 297.5, "completions/min_length": 190.0, "epoch": 2.1488833746898264, "grad_norm": 0.1915494054555893, "kl": 0.1304931640625, "learning_rate": 2.258265550196812e-06, "loss": 0.0013063387013971806, "memory(GiB)": 38.05, "reward": 0.5555181503295898, "reward_std": 0.03156731277704239, "rewards/VisualizationJSONCombinedORM/mean": 0.5555181503295898, "rewards/VisualizationJSONCombinedORM/std": 0.2756795287132263, "step": 2598, "train_speed(iter/s)": 0.384236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 272.375, "completions/min_length": 213.0, "epoch": 2.149710504549214, "grad_norm": 0.19140107929706573, "kl": 0.095458984375, "learning_rate": 2.2542423723999896e-06, "loss": 0.0009528845548629761, "memory(GiB)": 38.05, "reward": 0.5958833694458008, "reward_std": 0.0807914137840271, "rewards/VisualizationJSONCombinedORM/mean": 0.5958833694458008, "rewards/VisualizationJSONCombinedORM/std": 0.15486234426498413, "step": 2599, "train_speed(iter/s)": 0.382682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 291.125, "completions/min_length": 225.0, "epoch": 2.150537634408602, "grad_norm": 0.16918480396270752, "kl": 0.04339599609375, "learning_rate": 2.2502217382781033e-06, "loss": 0.00043382495641708374, "memory(GiB)": 38.05, "reward": 0.389883428812027, "reward_std": 0.03544450178742409, "rewards/VisualizationJSONCombinedORM/mean": 0.389883428812027, "rewards/VisualizationJSONCombinedORM/std": 0.12952405214309692, "step": 2600, "train_speed(iter/s)": 0.381424 }, { "epoch": 2.150537634408602, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 333.8333333333333, "eval_completions/mean_length": 284.9479166666667, "eval_completions/min_length": 240.70833333333334, "eval_kl": 0.07613118489583333, "eval_loss": 0.0007634054054506123, "eval_reward": 0.4723253821333249, "eval_reward_std": 0.06310823423943172, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4723253821333249, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06310823369616021, "eval_runtime": 292.4707, "eval_samples_per_second": 0.082, "eval_steps_per_second": 0.01, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 282.5, "completions/min_length": 217.0, "epoch": 2.1513647642679903, "grad_norm": 0.1715988963842392, "kl": 0.039581298828125, "learning_rate": 2.2462036515558726e-06, "loss": 0.00039569567888975143, "memory(GiB)": 38.05, "reward": 0.4382956027984619, "reward_std": 0.06693099439144135, "rewards/VisualizationJSONCombinedORM/mean": 0.4382956027984619, "rewards/VisualizationJSONCombinedORM/std": 0.13542930781841278, "step": 2601, "train_speed(iter/s)": 0.364345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 292.1875, "completions/min_length": 231.0, "epoch": 2.152191894127378, "grad_norm": 0.17841610312461853, "kl": 0.0892333984375, "learning_rate": 2.242188115955662e-06, "loss": 0.0008912906050682068, "memory(GiB)": 38.05, "reward": 0.6737037301063538, "reward_std": 0.09445615112781525, "rewards/VisualizationJSONCombinedORM/mean": 0.6737037301063538, "rewards/VisualizationJSONCombinedORM/std": 0.09681076556444168, "step": 2602, "train_speed(iter/s)": 0.363233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 267.5625, "completions/min_length": 222.0, "epoch": 2.153019023986766, "grad_norm": 0.18491052091121674, "kl": 0.09271240234375, "learning_rate": 2.238175135197471e-06, "loss": 0.0009271986782550812, "memory(GiB)": 38.05, "reward": 0.5615016222000122, "reward_std": 0.04445439577102661, "rewards/VisualizationJSONCombinedORM/mean": 0.5615016222000122, "rewards/VisualizationJSONCombinedORM/std": 0.23793809115886688, "step": 2603, "train_speed(iter/s)": 0.362077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 254.0625, "completions/min_length": 226.0, "epoch": 2.1538461538461537, "grad_norm": 0.1877926141023636, "kl": 0.080810546875, "learning_rate": 2.234164712998935e-06, "loss": 0.0008065700531005859, "memory(GiB)": 38.05, "reward": 0.3865940272808075, "reward_std": 0.04767843335866928, "rewards/VisualizationJSONCombinedORM/mean": 0.3865940272808075, "rewards/VisualizationJSONCombinedORM/std": 0.04647574573755264, "step": 2604, "train_speed(iter/s)": 0.361209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 280.1875, "completions/min_length": 238.0, "epoch": 2.1546732837055416, "grad_norm": 0.2004467099905014, "kl": 0.0926513671875, "learning_rate": 2.2301568530753113e-06, "loss": 0.0009256228804588318, "memory(GiB)": 38.05, "reward": 0.3641695976257324, "reward_std": 0.04324956238269806, "rewards/VisualizationJSONCombinedORM/mean": 0.3641695976257324, "rewards/VisualizationJSONCombinedORM/std": 0.06283702701330185, "step": 2605, "train_speed(iter/s)": 0.360117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 275.1875, "completions/min_length": 190.0, "epoch": 2.15550041356493, "grad_norm": 0.22464273869991302, "kl": 0.05010986328125, "learning_rate": 2.2261515591394937e-06, "loss": 0.0005010366439819336, "memory(GiB)": 38.05, "reward": 0.4580901563167572, "reward_std": 0.08033815026283264, "rewards/VisualizationJSONCombinedORM/mean": 0.4580901563167572, "rewards/VisualizationJSONCombinedORM/std": 0.12255623936653137, "step": 2606, "train_speed(iter/s)": 0.359033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 277.875, "completions/min_length": 242.0, "epoch": 2.1563275434243176, "grad_norm": 0.1791582554578781, "kl": 0.0809326171875, "learning_rate": 2.2221488349019903e-06, "loss": 0.0008096992969512939, "memory(GiB)": 38.05, "reward": 0.6990295648574829, "reward_std": 0.06240059807896614, "rewards/VisualizationJSONCombinedORM/mean": 0.6990295648574829, "rewards/VisualizationJSONCombinedORM/std": 0.06703176349401474, "step": 2607, "train_speed(iter/s)": 0.357726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 290.375, "completions/min_length": 241.0, "epoch": 2.1571546732837055, "grad_norm": 0.15906201303005219, "kl": 0.06805419921875, "learning_rate": 2.2181486840709314e-06, "loss": 0.0006797164678573608, "memory(GiB)": 38.05, "reward": 0.6027246713638306, "reward_std": 0.07010681927204132, "rewards/VisualizationJSONCombinedORM/mean": 0.6027246713638306, "rewards/VisualizationJSONCombinedORM/std": 0.07662324607372284, "step": 2608, "train_speed(iter/s)": 0.35667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 298.6875, "completions/min_length": 231.0, "epoch": 2.1579818031430933, "grad_norm": 0.19223922491073608, "kl": 0.059326171875, "learning_rate": 2.2141511103520703e-06, "loss": 0.0005941316485404968, "memory(GiB)": 38.05, "reward": 0.518401026725769, "reward_std": 0.0595049224793911, "rewards/VisualizationJSONCombinedORM/mean": 0.518401026725769, "rewards/VisualizationJSONCombinedORM/std": 0.2532413601875305, "step": 2609, "train_speed(iter/s)": 0.355564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 280.6875, "completions/min_length": 227.0, "epoch": 2.1588089330024816, "grad_norm": 0.19255100190639496, "kl": 0.0582275390625, "learning_rate": 2.2101561174487606e-06, "loss": 0.0005826987326145172, "memory(GiB)": 38.05, "reward": 0.5141993761062622, "reward_std": 0.11862294375896454, "rewards/VisualizationJSONCombinedORM/mean": 0.5141993761062622, "rewards/VisualizationJSONCombinedORM/std": 0.17916199564933777, "step": 2610, "train_speed(iter/s)": 0.354603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 314.25, "completions/min_length": 204.0, "epoch": 2.1596360628618694, "grad_norm": 0.16121499240398407, "kl": 0.055908203125, "learning_rate": 2.206163709061976e-06, "loss": 0.0005577541887760162, "memory(GiB)": 38.05, "reward": 0.6757369041442871, "reward_std": 0.06040515378117561, "rewards/VisualizationJSONCombinedORM/mean": 0.6757369041442871, "rewards/VisualizationJSONCombinedORM/std": 0.11392809450626373, "step": 2611, "train_speed(iter/s)": 0.353546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 271.75, "completions/min_length": 224.0, "epoch": 2.160463192721257, "grad_norm": 0.15434692800045013, "kl": 0.04193115234375, "learning_rate": 2.2021738888902854e-06, "loss": 0.00042044371366500854, "memory(GiB)": 38.05, "reward": 0.6587372422218323, "reward_std": 0.059239789843559265, "rewards/VisualizationJSONCombinedORM/mean": 0.6587372422218323, "rewards/VisualizationJSONCombinedORM/std": 0.09354601055383682, "step": 2612, "train_speed(iter/s)": 0.352512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 286.125, "completions/min_length": 239.0, "epoch": 2.161290322580645, "grad_norm": 0.21683797240257263, "kl": 0.0985107421875, "learning_rate": 2.1981866606298684e-06, "loss": 0.000984281301498413, "memory(GiB)": 38.05, "reward": 0.5395700931549072, "reward_std": 0.05512530729174614, "rewards/VisualizationJSONCombinedORM/mean": 0.5395700931549072, "rewards/VisualizationJSONCombinedORM/std": 0.18895912170410156, "step": 2613, "train_speed(iter/s)": 0.351475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 281.0, "completions/min_length": 222.0, "epoch": 2.1621174524400333, "grad_norm": 0.1680280566215515, "kl": 0.033355712890625, "learning_rate": 2.194202027974501e-06, "loss": 0.000333515927195549, "memory(GiB)": 38.05, "reward": 0.46636348962783813, "reward_std": 0.11910455673933029, "rewards/VisualizationJSONCombinedORM/mean": 0.46636348962783813, "rewards/VisualizationJSONCombinedORM/std": 0.14820823073387146, "step": 2614, "train_speed(iter/s)": 0.350351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 275.875, "completions/min_length": 229.0, "epoch": 2.162944582299421, "grad_norm": 0.2524341642856598, "kl": 0.06988525390625, "learning_rate": 2.1902199946155555e-06, "loss": 0.0006990954279899597, "memory(GiB)": 38.05, "reward": 0.3739909529685974, "reward_std": 0.053151294589042664, "rewards/VisualizationJSONCombinedORM/mean": 0.3739909529685974, "rewards/VisualizationJSONCombinedORM/std": 0.052611712366342545, "step": 2615, "train_speed(iter/s)": 0.349389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 280.3125, "completions/min_length": 234.0, "epoch": 2.163771712158809, "grad_norm": 0.20314563810825348, "kl": 0.06951904296875, "learning_rate": 2.186240564241992e-06, "loss": 0.0006954651325941086, "memory(GiB)": 38.05, "reward": 0.5641696453094482, "reward_std": 0.10470319539308548, "rewards/VisualizationJSONCombinedORM/mean": 0.5641696453094482, "rewards/VisualizationJSONCombinedORM/std": 0.10589912533760071, "step": 2616, "train_speed(iter/s)": 0.348347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 284.9375, "completions/min_length": 236.0, "epoch": 2.1645988420181967, "grad_norm": 0.1791619062423706, "kl": 0.07135009765625, "learning_rate": 2.1822637405403647e-06, "loss": 0.000712268054485321, "memory(GiB)": 38.05, "reward": 0.7229807376861572, "reward_std": 0.04736733436584473, "rewards/VisualizationJSONCombinedORM/mean": 0.7229807376861572, "rewards/VisualizationJSONCombinedORM/std": 0.1669028103351593, "step": 2617, "train_speed(iter/s)": 0.347332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 309.3125, "completions/min_length": 241.0, "epoch": 2.1654259718775846, "grad_norm": 0.1781080812215805, "kl": 0.07110595703125, "learning_rate": 2.178289527194807e-06, "loss": 0.0007105618715286255, "memory(GiB)": 38.05, "reward": 0.4221862554550171, "reward_std": 0.04988773912191391, "rewards/VisualizationJSONCombinedORM/mean": 0.4221862554550171, "rewards/VisualizationJSONCombinedORM/std": 0.07108227908611298, "step": 2618, "train_speed(iter/s)": 0.346269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 252.5625, "completions/min_length": 204.0, "epoch": 2.166253101736973, "grad_norm": 0.14541283249855042, "kl": 0.04168701171875, "learning_rate": 2.174317927887041e-06, "loss": 0.0004162117838859558, "memory(GiB)": 38.05, "reward": 0.5682806968688965, "reward_std": 0.030112989246845245, "rewards/VisualizationJSONCombinedORM/mean": 0.5682806968688965, "rewards/VisualizationJSONCombinedORM/std": 0.1969490796327591, "step": 2619, "train_speed(iter/s)": 0.345381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 298.4375, "completions/min_length": 236.0, "epoch": 2.1670802315963607, "grad_norm": 0.23650836944580078, "kl": 0.092041015625, "learning_rate": 2.1703489462963613e-06, "loss": 0.0009201988577842712, "memory(GiB)": 38.05, "reward": 0.5865980386734009, "reward_std": 0.1015540212392807, "rewards/VisualizationJSONCombinedORM/mean": 0.5865980386734009, "rewards/VisualizationJSONCombinedORM/std": 0.12261933833360672, "step": 2620, "train_speed(iter/s)": 0.344319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 314.875, "completions/min_length": 243.0, "epoch": 2.1679073614557485, "grad_norm": 0.1658109724521637, "kl": 0.0628662109375, "learning_rate": 2.166382586099643e-06, "loss": 0.0006292015314102173, "memory(GiB)": 38.05, "reward": 0.6099138259887695, "reward_std": 0.0498017743229866, "rewards/VisualizationJSONCombinedORM/mean": 0.6099138259887695, "rewards/VisualizationJSONCombinedORM/std": 0.10345641523599625, "step": 2621, "train_speed(iter/s)": 0.34328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 281.75, "completions/min_length": 213.0, "epoch": 2.1687344913151363, "grad_norm": 0.1757022738456726, "kl": 0.0765380859375, "learning_rate": 2.162418850971325e-06, "loss": 0.0007671918720006943, "memory(GiB)": 38.05, "reward": 0.5785216093063354, "reward_std": 0.0626499280333519, "rewards/VisualizationJSONCombinedORM/mean": 0.5785216093063354, "rewards/VisualizationJSONCombinedORM/std": 0.12256549298763275, "step": 2622, "train_speed(iter/s)": 0.3423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 305.0625, "completions/min_length": 221.0, "epoch": 2.1695616211745246, "grad_norm": 0.22070235013961792, "kl": 0.05169677734375, "learning_rate": 2.1584577445834234e-06, "loss": 0.0005185157060623169, "memory(GiB)": 38.05, "reward": 0.4423195421695709, "reward_std": 0.06207273155450821, "rewards/VisualizationJSONCombinedORM/mean": 0.4423195421695709, "rewards/VisualizationJSONCombinedORM/std": 0.07521596550941467, "step": 2623, "train_speed(iter/s)": 0.340993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 309.1875, "completions/min_length": 241.0, "epoch": 2.1703887510339124, "grad_norm": 0.19763503968715668, "kl": 0.073974609375, "learning_rate": 2.154499270605508e-06, "loss": 0.000738922506570816, "memory(GiB)": 38.05, "reward": 0.42142167687416077, "reward_std": 0.06955060362815857, "rewards/VisualizationJSONCombinedORM/mean": 0.42142167687416077, "rewards/VisualizationJSONCombinedORM/std": 0.08078974485397339, "step": 2624, "train_speed(iter/s)": 0.340162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 298.8125, "completions/min_length": 232.0, "epoch": 2.1712158808933, "grad_norm": 0.18225280940532684, "kl": 0.077392578125, "learning_rate": 2.1505434327047246e-06, "loss": 0.000773254781961441, "memory(GiB)": 38.05, "reward": 0.3616713881492615, "reward_std": 0.06039136275649071, "rewards/VisualizationJSONCombinedORM/mean": 0.3616713881492615, "rewards/VisualizationJSONCombinedORM/std": 0.07878164947032928, "step": 2625, "train_speed(iter/s)": 0.339415 }, { "epoch": 2.1712158808933, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 342.375, "eval_completions/mean_length": 284.734375, "eval_completions/min_length": 240.20833333333334, "eval_kl": 0.09187825520833333, "eval_loss": 0.0009289185400120914, "eval_reward": 0.4802319724112749, "eval_reward_std": 0.06128680791395406, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4802319724112749, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06128681031987071, "eval_runtime": 297.0997, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 276.0, "completions/min_length": 214.0, "epoch": 2.172043010752688, "grad_norm": 0.15923884510993958, "kl": 0.04315185546875, "learning_rate": 2.146590234545763e-06, "loss": 0.00043218210339546204, "memory(GiB)": 38.05, "reward": 0.441348671913147, "reward_std": 0.05082507058978081, "rewards/VisualizationJSONCombinedORM/mean": 0.441348671913147, "rewards/VisualizationJSONCombinedORM/std": 0.24668370187282562, "step": 2626, "train_speed(iter/s)": 0.326019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 292.0, "completions/min_length": 223.0, "epoch": 2.1728701406120763, "grad_norm": 0.2036205381155014, "kl": 0.05810546875, "learning_rate": 2.1426396797908764e-06, "loss": 0.0005813799798488617, "memory(GiB)": 38.05, "reward": 0.3605537712574005, "reward_std": 0.04175886511802673, "rewards/VisualizationJSONCombinedORM/mean": 0.3605537712574005, "rewards/VisualizationJSONCombinedORM/std": 0.04911840707063675, "step": 2627, "train_speed(iter/s)": 0.325115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 282.875, "completions/min_length": 234.0, "epoch": 2.173697270471464, "grad_norm": 0.16678205132484436, "kl": 0.15966796875, "learning_rate": 2.138691772099863e-06, "loss": 0.0016001388430595398, "memory(GiB)": 38.05, "reward": 0.5507466793060303, "reward_std": 0.07314454019069672, "rewards/VisualizationJSONCombinedORM/mean": 0.5507466793060303, "rewards/VisualizationJSONCombinedORM/std": 0.08713790029287338, "step": 2628, "train_speed(iter/s)": 0.324184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 296.75, "completions/min_length": 242.0, "epoch": 2.174524400330852, "grad_norm": 0.16393548250198364, "kl": 0.0833740234375, "learning_rate": 2.1347465151300733e-06, "loss": 0.0008330345153808594, "memory(GiB)": 38.05, "reward": 0.4511316418647766, "reward_std": 0.06535684317350388, "rewards/VisualizationJSONCombinedORM/mean": 0.4511316418647766, "rewards/VisualizationJSONCombinedORM/std": 0.09206738322973251, "step": 2629, "train_speed(iter/s)": 0.323232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 270.0, "completions/min_length": 220.0, "epoch": 2.1753515301902397, "grad_norm": 0.18919312953948975, "kl": 0.0921630859375, "learning_rate": 2.130803912536401e-06, "loss": 0.0009193457663059235, "memory(GiB)": 38.05, "reward": 0.7437320351600647, "reward_std": 0.08943185955286026, "rewards/VisualizationJSONCombinedORM/mean": 0.7437320351600647, "rewards/VisualizationJSONCombinedORM/std": 0.08771861344575882, "step": 2630, "train_speed(iter/s)": 0.322463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 256.5625, "completions/min_length": 209.0, "epoch": 2.1761786600496276, "grad_norm": 0.1963590383529663, "kl": 0.080810546875, "learning_rate": 2.1268639679712814e-06, "loss": 0.0008089020848274231, "memory(GiB)": 38.05, "reward": 0.5252779722213745, "reward_std": 0.06655188649892807, "rewards/VisualizationJSONCombinedORM/mean": 0.5252779722213745, "rewards/VisualizationJSONCombinedORM/std": 0.06785120069980621, "step": 2631, "train_speed(iter/s)": 0.321437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 314.5625, "completions/min_length": 259.0, "epoch": 2.177005789909016, "grad_norm": 0.24934035539627075, "kl": 0.13916015625, "learning_rate": 2.122926685084684e-06, "loss": 0.0013943202793598175, "memory(GiB)": 38.05, "reward": 0.6625872254371643, "reward_std": 0.10614201426506042, "rewards/VisualizationJSONCombinedORM/mean": 0.6625872254371643, "rewards/VisualizationJSONCombinedORM/std": 0.15225493907928467, "step": 2632, "train_speed(iter/s)": 0.320755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 312.0, "completions/min_length": 226.0, "epoch": 2.1778329197684037, "grad_norm": 0.1929461658000946, "kl": 0.075927734375, "learning_rate": 2.118992067524118e-06, "loss": 0.0007616877555847168, "memory(GiB)": 38.05, "reward": 0.3226800858974457, "reward_std": 0.05498063936829567, "rewards/VisualizationJSONCombinedORM/mean": 0.3226800858974457, "rewards/VisualizationJSONCombinedORM/std": 0.08892926573753357, "step": 2633, "train_speed(iter/s)": 0.319765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 292.0625, "completions/min_length": 214.0, "epoch": 2.1786600496277915, "grad_norm": 0.16707706451416016, "kl": 0.0552978515625, "learning_rate": 2.115060118934616e-06, "loss": 0.0005520791746675968, "memory(GiB)": 38.05, "reward": 0.5598459243774414, "reward_std": 0.11262505501508713, "rewards/VisualizationJSONCombinedORM/mean": 0.5598459243774414, "rewards/VisualizationJSONCombinedORM/std": 0.13382376730442047, "step": 2634, "train_speed(iter/s)": 0.318603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 330.3125, "completions/min_length": 281.0, "epoch": 2.1794871794871793, "grad_norm": 0.15244653820991516, "kl": 0.0458984375, "learning_rate": 2.1111308429587446e-06, "loss": 0.0004581082612276077, "memory(GiB)": 38.05, "reward": 0.6234117746353149, "reward_std": 0.09738790988922119, "rewards/VisualizationJSONCombinedORM/mean": 0.6234117746353149, "rewards/VisualizationJSONCombinedORM/std": 0.1291564404964447, "step": 2635, "train_speed(iter/s)": 0.317759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 265.125, "completions/min_length": 210.0, "epoch": 2.1803143093465676, "grad_norm": 0.18549540638923645, "kl": 0.042236328125, "learning_rate": 2.1072042432365934e-06, "loss": 0.00042182207107543945, "memory(GiB)": 38.05, "reward": 0.6096391677856445, "reward_std": 0.05780477449297905, "rewards/VisualizationJSONCombinedORM/mean": 0.6096391677856445, "rewards/VisualizationJSONCombinedORM/std": 0.1316678524017334, "step": 2636, "train_speed(iter/s)": 0.31692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 315.6875, "completions/min_length": 234.0, "epoch": 2.1811414392059554, "grad_norm": 0.3203279674053192, "kl": 0.156005859375, "learning_rate": 2.1032803234057725e-06, "loss": 0.0015606433153152466, "memory(GiB)": 38.05, "reward": 0.47194021940231323, "reward_std": 0.09388184547424316, "rewards/VisualizationJSONCombinedORM/mean": 0.47194021940231323, "rewards/VisualizationJSONCombinedORM/std": 0.15623171627521515, "step": 2637, "train_speed(iter/s)": 0.316083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 304.4375, "completions/min_length": 248.0, "epoch": 2.181968569065343, "grad_norm": 0.15721841156482697, "kl": 0.03118896484375, "learning_rate": 2.0993590871014048e-06, "loss": 0.00031160563230514526, "memory(GiB)": 38.05, "reward": 0.782509982585907, "reward_std": 0.03695978969335556, "rewards/VisualizationJSONCombinedORM/mean": 0.782509982585907, "rewards/VisualizationJSONCombinedORM/std": 0.035814084112644196, "step": 2638, "train_speed(iter/s)": 0.31515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 294.75, "completions/min_length": 245.0, "epoch": 2.182795698924731, "grad_norm": 0.16516940295696259, "kl": 0.04522705078125, "learning_rate": 2.0954405379561345e-06, "loss": 0.00045263394713401794, "memory(GiB)": 38.05, "reward": 0.47142326831817627, "reward_std": 0.05480936914682388, "rewards/VisualizationJSONCombinedORM/mean": 0.47142326831817627, "rewards/VisualizationJSONCombinedORM/std": 0.07333356887102127, "step": 2639, "train_speed(iter/s)": 0.314073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 282.375, "completions/min_length": 246.0, "epoch": 2.1836228287841193, "grad_norm": 0.20448307693004608, "kl": 0.05303955078125, "learning_rate": 2.0915246796001077e-06, "loss": 0.0005303751677274704, "memory(GiB)": 38.05, "reward": 0.6044875383377075, "reward_std": 0.07570762187242508, "rewards/VisualizationJSONCombinedORM/mean": 0.6044875383377075, "rewards/VisualizationJSONCombinedORM/std": 0.15568765997886658, "step": 2640, "train_speed(iter/s)": 0.313293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 305.4375, "completions/min_length": 246.0, "epoch": 2.184449958643507, "grad_norm": 0.16617964208126068, "kl": 0.08349609375, "learning_rate": 2.08761151566099e-06, "loss": 0.0008368659764528275, "memory(GiB)": 38.05, "reward": 0.49505239725112915, "reward_std": 0.06615000218153, "rewards/VisualizationJSONCombinedORM/mean": 0.49505239725112915, "rewards/VisualizationJSONCombinedORM/std": 0.2645142078399658, "step": 2641, "train_speed(iter/s)": 0.312463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 284.5, "completions/min_length": 251.0, "epoch": 2.185277088502895, "grad_norm": 0.1873863935470581, "kl": 0.06964111328125, "learning_rate": 2.083701049763938e-06, "loss": 0.0006948672235012054, "memory(GiB)": 38.05, "reward": 0.48471927642822266, "reward_std": 0.06780194491147995, "rewards/VisualizationJSONCombinedORM/mean": 0.48471927642822266, "rewards/VisualizationJSONCombinedORM/std": 0.13069583475589752, "step": 2642, "train_speed(iter/s)": 0.311626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 261.4375, "completions/min_length": 217.0, "epoch": 2.1861042183622827, "grad_norm": 0.1810118556022644, "kl": 0.033172607421875, "learning_rate": 2.0797932855316183e-06, "loss": 0.0003314167261123657, "memory(GiB)": 38.05, "reward": 0.6241257786750793, "reward_std": 0.07819686830043793, "rewards/VisualizationJSONCombinedORM/mean": 0.6241257786750793, "rewards/VisualizationJSONCombinedORM/std": 0.16563312709331512, "step": 2643, "train_speed(iter/s)": 0.310984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 307.1875, "completions/min_length": 230.0, "epoch": 2.1869313482216706, "grad_norm": 0.17196021974086761, "kl": 0.0325927734375, "learning_rate": 2.075888226584187e-06, "loss": 0.0003265887498855591, "memory(GiB)": 38.05, "reward": 0.4079504609107971, "reward_std": 0.023716218769550323, "rewards/VisualizationJSONCombinedORM/mean": 0.4079504609107971, "rewards/VisualizationJSONCombinedORM/std": 0.0307607538998127, "step": 2644, "train_speed(iter/s)": 0.310089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 314.4375, "completions/min_length": 256.0, "epoch": 2.187758478081059, "grad_norm": 0.1815698742866516, "kl": 0.0704345703125, "learning_rate": 2.0719858765393e-06, "loss": 0.000704057514667511, "memory(GiB)": 38.05, "reward": 0.5402054786682129, "reward_std": 0.07281018793582916, "rewards/VisualizationJSONCombinedORM/mean": 0.5402054786682129, "rewards/VisualizationJSONCombinedORM/std": 0.2529982924461365, "step": 2645, "train_speed(iter/s)": 0.309185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 296.875, "completions/min_length": 258.0, "epoch": 2.1885856079404467, "grad_norm": 0.16689391434192657, "kl": 0.03472900390625, "learning_rate": 2.0680862390121015e-06, "loss": 0.0003468245267868042, "memory(GiB)": 38.05, "reward": 0.7240405082702637, "reward_std": 0.0723310112953186, "rewards/VisualizationJSONCombinedORM/mean": 0.7240405082702637, "rewards/VisualizationJSONCombinedORM/std": 0.07162904739379883, "step": 2646, "train_speed(iter/s)": 0.308238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 276.75, "completions/min_length": 235.0, "epoch": 2.1894127377998345, "grad_norm": 0.23322661221027374, "kl": 0.04638671875, "learning_rate": 2.064189317615225e-06, "loss": 0.0004632696509361267, "memory(GiB)": 38.05, "reward": 0.6786725521087646, "reward_std": 0.09689194709062576, "rewards/VisualizationJSONCombinedORM/mean": 0.6786725521087646, "rewards/VisualizationJSONCombinedORM/std": 0.17419475317001343, "step": 2647, "train_speed(iter/s)": 0.307461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 261.1875, "completions/min_length": 224.0, "epoch": 2.1902398676592223, "grad_norm": 0.1660475730895996, "kl": 0.10198974609375, "learning_rate": 2.0602951159587817e-06, "loss": 0.001017838716506958, "memory(GiB)": 38.05, "reward": 0.3326411843299866, "reward_std": 0.0226387120783329, "rewards/VisualizationJSONCombinedORM/mean": 0.3326411843299866, "rewards/VisualizationJSONCombinedORM/std": 0.023628780618309975, "step": 2648, "train_speed(iter/s)": 0.306749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 294.1875, "completions/min_length": 255.0, "epoch": 2.1910669975186106, "grad_norm": 0.17295457422733307, "kl": 0.060791015625, "learning_rate": 2.056403637650371e-06, "loss": 0.0006073806434869766, "memory(GiB)": 38.05, "reward": 0.5129438638687134, "reward_std": 0.07294595241546631, "rewards/VisualizationJSONCombinedORM/mean": 0.5129438638687134, "rewards/VisualizationJSONCombinedORM/std": 0.2027905285358429, "step": 2649, "train_speed(iter/s)": 0.305911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 288.8125, "completions/min_length": 207.0, "epoch": 2.1918941273779984, "grad_norm": 0.23059654235839844, "kl": 0.069580078125, "learning_rate": 2.052514886295062e-06, "loss": 0.0006954837590456009, "memory(GiB)": 38.13, "reward": 0.5174398422241211, "reward_std": 0.07917965948581696, "rewards/VisualizationJSONCombinedORM/mean": 0.5174398422241211, "rewards/VisualizationJSONCombinedORM/std": 0.08448459953069687, "step": 2650, "train_speed(iter/s)": 0.304898 }, { "epoch": 2.1918941273779984, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 359.5416666666667, "eval_completions/mean_length": 295.0416666666667, "eval_completions/min_length": 245.54166666666666, "eval_kl": 0.067108154296875, "eval_loss": 0.0006817678804509342, "eval_reward": 0.46385687837998074, "eval_reward_std": 0.060411773001154266, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46385687837998074, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06041177540707091, "eval_runtime": 307.9502, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 298.25, "completions/min_length": 246.0, "epoch": 2.192721257237386, "grad_norm": 0.21466509997844696, "kl": 0.050537109375, "learning_rate": 2.048628865495403e-06, "loss": 0.0005051866173744202, "memory(GiB)": 38.13, "reward": 0.5068829655647278, "reward_std": 0.07462906092405319, "rewards/VisualizationJSONCombinedORM/mean": 0.5068829655647278, "rewards/VisualizationJSONCombinedORM/std": 0.23480021953582764, "step": 2651, "train_speed(iter/s)": 0.293754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 276.5, "completions/min_length": 220.0, "epoch": 2.193548387096774, "grad_norm": 0.16362828016281128, "kl": 0.0970458984375, "learning_rate": 2.0447455788514105e-06, "loss": 0.0009713247418403625, "memory(GiB)": 38.13, "reward": 0.5483800172805786, "reward_std": 0.07822167128324509, "rewards/VisualizationJSONCombinedORM/mean": 0.5483800172805786, "rewards/VisualizationJSONCombinedORM/std": 0.17103558778762817, "step": 2652, "train_speed(iter/s)": 0.292982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 290.1875, "completions/min_length": 239.0, "epoch": 2.1943755169561623, "grad_norm": 0.19060048460960388, "kl": 0.04888916015625, "learning_rate": 2.0408650299605704e-06, "loss": 0.0004891008138656616, "memory(GiB)": 38.13, "reward": 0.3842686414718628, "reward_std": 0.043985262513160706, "rewards/VisualizationJSONCombinedORM/mean": 0.3842686414718628, "rewards/VisualizationJSONCombinedORM/std": 0.04995032772421837, "step": 2653, "train_speed(iter/s)": 0.29241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 277.0625, "completions/min_length": 227.0, "epoch": 2.19520264681555, "grad_norm": 0.1577073186635971, "kl": 0.038909912109375, "learning_rate": 2.0369872224178267e-06, "loss": 0.0003890693187713623, "memory(GiB)": 38.13, "reward": 0.510407567024231, "reward_std": 0.062356945127248764, "rewards/VisualizationJSONCombinedORM/mean": 0.510407567024231, "rewards/VisualizationJSONCombinedORM/std": 0.27168768644332886, "step": 2654, "train_speed(iter/s)": 0.29169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 285.625, "completions/min_length": 242.0, "epoch": 2.196029776674938, "grad_norm": 0.161901593208313, "kl": 0.0986328125, "learning_rate": 2.0331121598155905e-06, "loss": 0.000984642654657364, "memory(GiB)": 38.13, "reward": 0.37793946266174316, "reward_std": 0.024056999012827873, "rewards/VisualizationJSONCombinedORM/mean": 0.37793946266174316, "rewards/VisualizationJSONCombinedORM/std": 0.08132355660200119, "step": 2655, "train_speed(iter/s)": 0.291002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 265.5625, "completions/min_length": 217.0, "epoch": 2.1968569065343257, "grad_norm": 0.23452861607074738, "kl": 0.07220458984375, "learning_rate": 2.029239845743723e-06, "loss": 0.0007227733731269836, "memory(GiB)": 38.13, "reward": 0.6805648803710938, "reward_std": 0.12688998878002167, "rewards/VisualizationJSONCombinedORM/mean": 0.6805648803710938, "rewards/VisualizationJSONCombinedORM/std": 0.13004450500011444, "step": 2656, "train_speed(iter/s)": 0.290354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 317.0625, "completions/min_length": 241.0, "epoch": 2.197684036393714, "grad_norm": 0.18791237473487854, "kl": 0.0716552734375, "learning_rate": 2.0253702837895495e-06, "loss": 0.0007152929902076721, "memory(GiB)": 38.13, "reward": 0.512567400932312, "reward_std": 0.07099736481904984, "rewards/VisualizationJSONCombinedORM/mean": 0.512567400932312, "rewards/VisualizationJSONCombinedORM/std": 0.09954883903265, "step": 2657, "train_speed(iter/s)": 0.289595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 309.5, "completions/min_length": 259.0, "epoch": 2.198511166253102, "grad_norm": 0.1713515669107437, "kl": 0.14947509765625, "learning_rate": 2.0215034775378336e-06, "loss": 0.001489449292421341, "memory(GiB)": 38.13, "reward": 0.6123297810554504, "reward_std": 0.08279214054346085, "rewards/VisualizationJSONCombinedORM/mean": 0.6123297810554504, "rewards/VisualizationJSONCombinedORM/std": 0.17825020849704742, "step": 2658, "train_speed(iter/s)": 0.288894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 292.4375, "completions/min_length": 227.0, "epoch": 2.1993382961124897, "grad_norm": 0.14899368584156036, "kl": 0.039093017578125, "learning_rate": 2.017639430570794e-06, "loss": 0.00039027631282806396, "memory(GiB)": 38.13, "reward": 0.38957804441452026, "reward_std": 0.03680304437875748, "rewards/VisualizationJSONCombinedORM/mean": 0.38957804441452026, "rewards/VisualizationJSONCombinedORM/std": 0.14048059284687042, "step": 2659, "train_speed(iter/s)": 0.288156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 282.625, "completions/min_length": 219.0, "epoch": 2.2001654259718775, "grad_norm": 0.19669805467128754, "kl": 0.06488037109375, "learning_rate": 2.0137781464680922e-06, "loss": 0.0006496775895357132, "memory(GiB)": 38.13, "reward": 0.6789815425872803, "reward_std": 0.08908204734325409, "rewards/VisualizationJSONCombinedORM/mean": 0.6789815425872803, "rewards/VisualizationJSONCombinedORM/std": 0.10648959875106812, "step": 2660, "train_speed(iter/s)": 0.287549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 295.5, "completions/min_length": 224.0, "epoch": 2.2009925558312657, "grad_norm": 0.19239424169063568, "kl": 0.0443115234375, "learning_rate": 2.009919628806826e-06, "loss": 0.0004435107111930847, "memory(GiB)": 38.13, "reward": 0.6431015729904175, "reward_std": 0.11647486686706543, "rewards/VisualizationJSONCombinedORM/mean": 0.6431015729904175, "rewards/VisualizationJSONCombinedORM/std": 0.16377979516983032, "step": 2661, "train_speed(iter/s)": 0.286824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 310.0, "completions/min_length": 245.0, "epoch": 2.2018196856906536, "grad_norm": 0.1705438196659088, "kl": 0.085693359375, "learning_rate": 2.006063881161535e-06, "loss": 0.0008579939603805542, "memory(GiB)": 38.13, "reward": 0.4710782766342163, "reward_std": 0.04768560826778412, "rewards/VisualizationJSONCombinedORM/mean": 0.4710782766342163, "rewards/VisualizationJSONCombinedORM/std": 0.30684223771095276, "step": 2662, "train_speed(iter/s)": 0.286095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 303.1875, "completions/min_length": 233.0, "epoch": 2.2026468155500414, "grad_norm": 0.1630452424287796, "kl": 0.05120849609375, "learning_rate": 2.0022109071041905e-06, "loss": 0.00051121786236763, "memory(GiB)": 38.13, "reward": 0.6360993385314941, "reward_std": 0.10504792630672455, "rewards/VisualizationJSONCombinedORM/mean": 0.6360993385314941, "rewards/VisualizationJSONCombinedORM/std": 0.12486910820007324, "step": 2663, "train_speed(iter/s)": 0.285254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 258.3125, "completions/min_length": 223.0, "epoch": 2.203473945409429, "grad_norm": 0.16629990935325623, "kl": 0.029693603515625, "learning_rate": 1.9983607102041974e-06, "loss": 0.0002965223975479603, "memory(GiB)": 38.13, "reward": 0.5859755277633667, "reward_std": 0.07381580770015717, "rewards/VisualizationJSONCombinedORM/mean": 0.5859755277633667, "rewards/VisualizationJSONCombinedORM/std": 0.11683396250009537, "step": 2664, "train_speed(iter/s)": 0.284761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 302.5, "completions/min_length": 244.0, "epoch": 2.204301075268817, "grad_norm": 0.17564328014850616, "kl": 0.0623779296875, "learning_rate": 1.9945132940283805e-06, "loss": 0.0006239861249923706, "memory(GiB)": 38.13, "reward": 0.4091752767562866, "reward_std": 0.036975838243961334, "rewards/VisualizationJSONCombinedORM/mean": 0.4091752767562866, "rewards/VisualizationJSONCombinedORM/std": 0.09792977571487427, "step": 2665, "train_speed(iter/s)": 0.284147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 287.75, "completions/min_length": 244.0, "epoch": 2.2051282051282053, "grad_norm": 0.16378441452980042, "kl": 0.0960693359375, "learning_rate": 1.990668662140998e-06, "loss": 0.0009586680680513382, "memory(GiB)": 38.13, "reward": 0.7129568457603455, "reward_std": 0.0702534019947052, "rewards/VisualizationJSONCombinedORM/mean": 0.7129568457603455, "rewards/VisualizationJSONCombinedORM/std": 0.09077747166156769, "step": 2666, "train_speed(iter/s)": 0.283626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 290.375, "completions/min_length": 229.0, "epoch": 2.205955334987593, "grad_norm": 0.20365214347839355, "kl": 0.0792236328125, "learning_rate": 1.9868268181037186e-06, "loss": 0.0007903091609477997, "memory(GiB)": 38.13, "reward": 0.6158623099327087, "reward_std": 0.08530428260564804, "rewards/VisualizationJSONCombinedORM/mean": 0.6158623099327087, "rewards/VisualizationJSONCombinedORM/std": 0.18940989673137665, "step": 2667, "train_speed(iter/s)": 0.283085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 293.5, "completions/min_length": 242.0, "epoch": 2.206782464846981, "grad_norm": 0.15596027672290802, "kl": 0.03631591796875, "learning_rate": 1.9829877654756373e-06, "loss": 0.00036311522126197815, "memory(GiB)": 38.13, "reward": 0.6255525350570679, "reward_std": 0.0403934009373188, "rewards/VisualizationJSONCombinedORM/mean": 0.6255525350570679, "rewards/VisualizationJSONCombinedORM/std": 0.06958425045013428, "step": 2668, "train_speed(iter/s)": 0.282474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 276.5625, "completions/min_length": 230.0, "epoch": 2.2076095947063687, "grad_norm": 0.1783510446548462, "kl": 0.0755615234375, "learning_rate": 1.9791515078132588e-06, "loss": 0.000756971538066864, "memory(GiB)": 38.13, "reward": 0.5704174041748047, "reward_std": 0.06326271593570709, "rewards/VisualizationJSONCombinedORM/mean": 0.5704174041748047, "rewards/VisualizationJSONCombinedORM/std": 0.0780947208404541, "step": 2669, "train_speed(iter/s)": 0.281779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 296.3125, "completions/min_length": 246.0, "epoch": 2.208436724565757, "grad_norm": 0.2230035662651062, "kl": 0.03411865234375, "learning_rate": 1.9753180486705013e-06, "loss": 0.0003407429903745651, "memory(GiB)": 38.13, "reward": 0.3368256986141205, "reward_std": 0.04156984016299248, "rewards/VisualizationJSONCombinedORM/mean": 0.3368256986141205, "rewards/VisualizationJSONCombinedORM/std": 0.04497183486819267, "step": 2670, "train_speed(iter/s)": 0.280918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 300.25, "completions/min_length": 246.0, "epoch": 2.209263854425145, "grad_norm": 0.16532060503959656, "kl": 0.0511474609375, "learning_rate": 1.9714873915986848e-06, "loss": 0.0005126763135194778, "memory(GiB)": 38.13, "reward": 0.5719437599182129, "reward_std": 0.05375853180885315, "rewards/VisualizationJSONCombinedORM/mean": 0.5719437599182129, "rewards/VisualizationJSONCombinedORM/std": 0.1760561168193817, "step": 2671, "train_speed(iter/s)": 0.280277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 335.0, "completions/min_length": 232.0, "epoch": 2.2100909842845327, "grad_norm": 0.1866428703069687, "kl": 0.04669189453125, "learning_rate": 1.967659540146541e-06, "loss": 0.00046703964471817017, "memory(GiB)": 38.13, "reward": 0.49652761220932007, "reward_std": 0.100393146276474, "rewards/VisualizationJSONCombinedORM/mean": 0.49652761220932007, "rewards/VisualizationJSONCombinedORM/std": 0.25317132472991943, "step": 2672, "train_speed(iter/s)": 0.279594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 294.0, "completions/min_length": 239.0, "epoch": 2.2109181141439205, "grad_norm": 0.1903497874736786, "kl": 0.029937744140625, "learning_rate": 1.963834497860192e-06, "loss": 0.0002993941307067871, "memory(GiB)": 38.13, "reward": 0.4158935844898224, "reward_std": 0.050540536642074585, "rewards/VisualizationJSONCombinedORM/mean": 0.4158935844898224, "rewards/VisualizationJSONCombinedORM/std": 0.11696678400039673, "step": 2673, "train_speed(iter/s)": 0.278907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 279.125, "completions/min_length": 228.0, "epoch": 2.2117452440033087, "grad_norm": 0.19892697036266327, "kl": 0.083984375, "learning_rate": 1.9600122682831723e-06, "loss": 0.0008404478430747986, "memory(GiB)": 38.13, "reward": 0.4875483214855194, "reward_std": 0.06025318056344986, "rewards/VisualizationJSONCombinedORM/mean": 0.4875483214855194, "rewards/VisualizationJSONCombinedORM/std": 0.08889294415712357, "step": 2674, "train_speed(iter/s)": 0.278136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 273.375, "completions/min_length": 218.0, "epoch": 2.2125723738626966, "grad_norm": 0.19223414361476898, "kl": 0.05224609375, "learning_rate": 1.956192854956397e-06, "loss": 0.000522807240486145, "memory(GiB)": 38.13, "reward": 0.6272987723350525, "reward_std": 0.07307098060846329, "rewards/VisualizationJSONCombinedORM/mean": 0.6272987723350525, "rewards/VisualizationJSONCombinedORM/std": 0.0849262997508049, "step": 2675, "train_speed(iter/s)": 0.277581 }, { "epoch": 2.2125723738626966, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 365.75, "eval_completions/mean_length": 296.2083333333333, "eval_completions/min_length": 245.58333333333334, "eval_kl": 0.07428995768229167, "eval_loss": 0.000753036409150809, "eval_reward": 0.43774004094302654, "eval_reward_std": 0.060949474262694515, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.43774004094302654, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06094947647458563, "eval_runtime": 310.8197, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 300.6875, "completions/min_length": 220.0, "epoch": 2.2133995037220844, "grad_norm": 0.18495772778987885, "kl": 0.0660400390625, "learning_rate": 1.95237626141818e-06, "loss": 0.0006610527634620667, "memory(GiB)": 38.13, "reward": 0.681431233882904, "reward_std": 0.09699784964323044, "rewards/VisualizationJSONCombinedORM/mean": 0.681431233882904, "rewards/VisualizationJSONCombinedORM/std": 0.14033730328083038, "step": 2676, "train_speed(iter/s)": 0.268233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 302.5, "completions/min_length": 226.0, "epoch": 2.214226633581472, "grad_norm": 0.20753517746925354, "kl": 0.033721923828125, "learning_rate": 1.948562491204219e-06, "loss": 0.0003381781280040741, "memory(GiB)": 38.13, "reward": 0.6478875875473022, "reward_std": 0.041137468069791794, "rewards/VisualizationJSONCombinedORM/mean": 0.6478875875473022, "rewards/VisualizationJSONCombinedORM/std": 0.12351489812135696, "step": 2677, "train_speed(iter/s)": 0.267665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 276.75, "completions/min_length": 220.0, "epoch": 2.21505376344086, "grad_norm": 0.2504517138004303, "kl": 0.03314208984375, "learning_rate": 1.944751547847598e-06, "loss": 0.0003309473395347595, "memory(GiB)": 38.13, "reward": 0.46827432513237, "reward_std": 0.06152456998825073, "rewards/VisualizationJSONCombinedORM/mean": 0.46827432513237, "rewards/VisualizationJSONCombinedORM/std": 0.26104944944381714, "step": 2678, "train_speed(iter/s)": 0.267072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 274.9375, "completions/min_length": 212.0, "epoch": 2.2158808933002483, "grad_norm": 0.17706310749053955, "kl": 0.16064453125, "learning_rate": 1.9409434348787824e-06, "loss": 0.001606486737728119, "memory(GiB)": 38.13, "reward": 0.3034641444683075, "reward_std": 0.04493524879217148, "rewards/VisualizationJSONCombinedORM/mean": 0.3034641444683075, "rewards/VisualizationJSONCombinedORM/std": 0.08034615218639374, "step": 2679, "train_speed(iter/s)": 0.266438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 322.4375, "completions/min_length": 232.0, "epoch": 2.216708023159636, "grad_norm": 0.16057680547237396, "kl": 0.0570068359375, "learning_rate": 1.9371381558256175e-06, "loss": 0.0005716728046536446, "memory(GiB)": 38.13, "reward": 0.33902615308761597, "reward_std": 0.12053006142377853, "rewards/VisualizationJSONCombinedORM/mean": 0.33902615308761597, "rewards/VisualizationJSONCombinedORM/std": 0.21034762263298035, "step": 2680, "train_speed(iter/s)": 0.265782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 289.625, "completions/min_length": 215.0, "epoch": 2.217535153019024, "grad_norm": 0.16691239178180695, "kl": 0.0838623046875, "learning_rate": 1.9333357142133167e-06, "loss": 0.0008373521268367767, "memory(GiB)": 38.13, "reward": 0.44829225540161133, "reward_std": 0.055680230259895325, "rewards/VisualizationJSONCombinedORM/mean": 0.44829225540161133, "rewards/VisualizationJSONCombinedORM/std": 0.05521642789244652, "step": 2681, "train_speed(iter/s)": 0.265185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 289.25, "completions/min_length": 229.0, "epoch": 2.2183622828784118, "grad_norm": 0.18605215847492218, "kl": 0.0445556640625, "learning_rate": 1.9295361135644724e-06, "loss": 0.0004452168941497803, "memory(GiB)": 38.13, "reward": 0.5611726641654968, "reward_std": 0.05290095508098602, "rewards/VisualizationJSONCombinedORM/mean": 0.5611726641654968, "rewards/VisualizationJSONCombinedORM/std": 0.10087267309427261, "step": 2682, "train_speed(iter/s)": 0.264541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 314.3125, "completions/min_length": 233.0, "epoch": 2.2191894127378, "grad_norm": 0.18642866611480713, "kl": 0.08551025390625, "learning_rate": 1.925739357399038e-06, "loss": 0.0008550658822059631, "memory(GiB)": 38.13, "reward": 0.49613678455352783, "reward_std": 0.04947558045387268, "rewards/VisualizationJSONCombinedORM/mean": 0.49613678455352783, "rewards/VisualizationJSONCombinedORM/std": 0.10295598953962326, "step": 2683, "train_speed(iter/s)": 0.263803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 293.75, "completions/min_length": 235.0, "epoch": 2.220016542597188, "grad_norm": 0.17585153877735138, "kl": 0.1544189453125, "learning_rate": 1.9219454492343374e-06, "loss": 0.0015388857573270798, "memory(GiB)": 38.13, "reward": 0.6280661821365356, "reward_std": 0.04804789274930954, "rewards/VisualizationJSONCombinedORM/mean": 0.6280661821365356, "rewards/VisualizationJSONCombinedORM/std": 0.04944637417793274, "step": 2684, "train_speed(iter/s)": 0.263213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 281.125, "completions/min_length": 245.0, "epoch": 2.2208436724565757, "grad_norm": 0.1865808218717575, "kl": 0.075439453125, "learning_rate": 1.9181543925850544e-06, "loss": 0.0007559247314929962, "memory(GiB)": 38.13, "reward": 0.2690507173538208, "reward_std": 0.027176616713404655, "rewards/VisualizationJSONCombinedORM/mean": 0.2690507173538208, "rewards/VisualizationJSONCombinedORM/std": 0.05619790405035019, "step": 2685, "train_speed(iter/s)": 0.262659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 326.25, "completions/min_length": 271.0, "epoch": 2.2216708023159635, "grad_norm": 0.17267876863479614, "kl": 0.026611328125, "learning_rate": 1.914366190963232e-06, "loss": 0.0002657100558280945, "memory(GiB)": 38.13, "reward": 0.7874448299407959, "reward_std": 0.06772929430007935, "rewards/VisualizationJSONCombinedORM/mean": 0.7874448299407959, "rewards/VisualizationJSONCombinedORM/std": 0.06689158827066422, "step": 2686, "train_speed(iter/s)": 0.262116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 287.375, "completions/min_length": 240.0, "epoch": 2.2224979321753517, "grad_norm": 0.17345063388347626, "kl": 0.0462646484375, "learning_rate": 1.9105808478782644e-06, "loss": 0.0004625171422958374, "memory(GiB)": 38.13, "reward": 0.42219310998916626, "reward_std": 0.049650888890028, "rewards/VisualizationJSONCombinedORM/mean": 0.42219310998916626, "rewards/VisualizationJSONCombinedORM/std": 0.09299582242965698, "step": 2687, "train_speed(iter/s)": 0.261563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 299.0, "completions/min_length": 246.0, "epoch": 2.2233250620347396, "grad_norm": 0.21924947202205658, "kl": 0.0760498046875, "learning_rate": 1.9067983668369038e-06, "loss": 0.0007610619068145752, "memory(GiB)": 38.13, "reward": 0.4733079671859741, "reward_std": 0.0690440833568573, "rewards/VisualizationJSONCombinedORM/mean": 0.4733079671859741, "rewards/VisualizationJSONCombinedORM/std": 0.11351805180311203, "step": 2688, "train_speed(iter/s)": 0.261068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 279.3125, "completions/min_length": 248.0, "epoch": 2.2241521918941274, "grad_norm": 0.1819416582584381, "kl": 0.1534423828125, "learning_rate": 1.9030187513432419e-06, "loss": 0.001535654067993164, "memory(GiB)": 38.13, "reward": 0.5590132474899292, "reward_std": 0.08211690187454224, "rewards/VisualizationJSONCombinedORM/mean": 0.5590132474899292, "rewards/VisualizationJSONCombinedORM/std": 0.10894563049077988, "step": 2689, "train_speed(iter/s)": 0.260582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 285.4375, "completions/min_length": 229.0, "epoch": 2.224979321753515, "grad_norm": 0.20428742468357086, "kl": 0.1268310546875, "learning_rate": 1.8992420048987287e-06, "loss": 0.0012682527303695679, "memory(GiB)": 38.13, "reward": 0.46672123670578003, "reward_std": 0.04947870969772339, "rewards/VisualizationJSONCombinedORM/mean": 0.46672123670578003, "rewards/VisualizationJSONCombinedORM/std": 0.17084336280822754, "step": 2690, "train_speed(iter/s)": 0.26004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 285.6875, "completions/min_length": 218.0, "epoch": 2.225806451612903, "grad_norm": 0.18902525305747986, "kl": 0.06573486328125, "learning_rate": 1.8954681310021434e-06, "loss": 0.0006569698452949524, "memory(GiB)": 38.13, "reward": 0.6403714418411255, "reward_std": 0.06920431554317474, "rewards/VisualizationJSONCombinedORM/mean": 0.6403714418411255, "rewards/VisualizationJSONCombinedORM/std": 0.08459257334470749, "step": 2691, "train_speed(iter/s)": 0.259482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 339.375, "completions/min_length": 238.0, "epoch": 2.2266335814722913, "grad_norm": 0.1657366305589676, "kl": 0.07122802734375, "learning_rate": 1.8916971331496143e-06, "loss": 0.0007123071700334549, "memory(GiB)": 38.13, "reward": 0.7154644727706909, "reward_std": 0.1093282699584961, "rewards/VisualizationJSONCombinedORM/mean": 0.7154644727706909, "rewards/VisualizationJSONCombinedORM/std": 0.10781027376651764, "step": 2692, "train_speed(iter/s)": 0.258907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 313.25, "completions/min_length": 231.0, "epoch": 2.227460711331679, "grad_norm": 0.20976395905017853, "kl": 0.074951171875, "learning_rate": 1.8879290148345963e-06, "loss": 0.0007504411041736603, "memory(GiB)": 38.13, "reward": 0.5702663660049438, "reward_std": 0.0682058036327362, "rewards/VisualizationJSONCombinedORM/mean": 0.5702663660049438, "rewards/VisualizationJSONCombinedORM/std": 0.16437996923923492, "step": 2693, "train_speed(iter/s)": 0.258226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 286.3125, "completions/min_length": 231.0, "epoch": 2.228287841191067, "grad_norm": 0.21834667026996613, "kl": 0.08709716796875, "learning_rate": 1.8841637795478835e-06, "loss": 0.0008688494563102722, "memory(GiB)": 38.13, "reward": 0.5261955261230469, "reward_std": 0.07269148528575897, "rewards/VisualizationJSONCombinedORM/mean": 0.5261955261230469, "rewards/VisualizationJSONCombinedORM/std": 0.22237718105316162, "step": 2694, "train_speed(iter/s)": 0.257796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 293.0, "completions/min_length": 223.0, "epoch": 2.2291149710504548, "grad_norm": 0.1915895938873291, "kl": 0.0506591796875, "learning_rate": 1.8804014307775965e-06, "loss": 0.0005065500736236572, "memory(GiB)": 38.13, "reward": 0.5164419412612915, "reward_std": 0.06316737085580826, "rewards/VisualizationJSONCombinedORM/mean": 0.5164419412612915, "rewards/VisualizationJSONCombinedORM/std": 0.267553448677063, "step": 2695, "train_speed(iter/s)": 0.257304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 301.25, "completions/min_length": 244.0, "epoch": 2.229942100909843, "grad_norm": 0.17578761279582977, "kl": 0.0992431640625, "learning_rate": 1.876641972009184e-06, "loss": 0.0009915130212903023, "memory(GiB)": 38.13, "reward": 0.7032476663589478, "reward_std": 0.09589327871799469, "rewards/VisualizationJSONCombinedORM/mean": 0.7032476663589478, "rewards/VisualizationJSONCombinedORM/std": 0.10407815128564835, "step": 2696, "train_speed(iter/s)": 0.256627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 300.8125, "completions/min_length": 243.0, "epoch": 2.230769230769231, "grad_norm": 0.20968061685562134, "kl": 0.096923828125, "learning_rate": 1.872885406725412e-06, "loss": 0.0009700767695903778, "memory(GiB)": 38.13, "reward": 0.5847659111022949, "reward_std": 0.09604756534099579, "rewards/VisualizationJSONCombinedORM/mean": 0.5847659111022949, "rewards/VisualizationJSONCombinedORM/std": 0.10692104697227478, "step": 2697, "train_speed(iter/s)": 0.256067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 309.75, "completions/min_length": 219.0, "epoch": 2.2315963606286187, "grad_norm": 0.20888403058052063, "kl": 0.0770263671875, "learning_rate": 1.869131738406373e-06, "loss": 0.0007693544030189514, "memory(GiB)": 38.13, "reward": 0.4764023423194885, "reward_std": 0.06786343455314636, "rewards/VisualizationJSONCombinedORM/mean": 0.4764023423194885, "rewards/VisualizationJSONCombinedORM/std": 0.12065723538398743, "step": 2698, "train_speed(iter/s)": 0.255536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 283.75, "completions/min_length": 210.0, "epoch": 2.2324234904880065, "grad_norm": 0.17728790640830994, "kl": 0.07568359375, "learning_rate": 1.865380970529469e-06, "loss": 0.0007569938898086548, "memory(GiB)": 38.13, "reward": 0.47626572847366333, "reward_std": 0.07364855706691742, "rewards/VisualizationJSONCombinedORM/mean": 0.47626572847366333, "rewards/VisualizationJSONCombinedORM/std": 0.15118318796157837, "step": 2699, "train_speed(iter/s)": 0.254973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 302.5, "completions/min_length": 231.0, "epoch": 2.2332506203473947, "grad_norm": 0.2101757675409317, "kl": 0.107177734375, "learning_rate": 1.8616331065694193e-06, "loss": 0.0010709762573242188, "memory(GiB)": 38.13, "reward": 0.4642428159713745, "reward_std": 0.05137278884649277, "rewards/VisualizationJSONCombinedORM/mean": 0.4642428159713745, "rewards/VisualizationJSONCombinedORM/std": 0.28538742661476135, "step": 2700, "train_speed(iter/s)": 0.254493 }, { "epoch": 2.2332506203473947, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 355.7916666666667, "eval_completions/mean_length": 297.9010416666667, "eval_completions/min_length": 246.29166666666666, "eval_kl": 0.09193929036458333, "eval_loss": 0.0009196003084070981, "eval_reward": 0.4846872289975484, "eval_reward_std": 0.07009679754264653, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4846872289975484, "eval_rewards/VisualizationJSONCombinedORM/std": 0.07009679940529168, "eval_runtime": 305.563, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 334.625, "completions/min_length": 242.0, "epoch": 2.2340777502067826, "grad_norm": 0.16741858422756195, "kl": 0.0484619140625, "learning_rate": 1.8578881499982532e-06, "loss": 0.00048482976853847504, "memory(GiB)": 38.13, "reward": 0.5645124912261963, "reward_std": 0.03514750301837921, "rewards/VisualizationJSONCombinedORM/mean": 0.5645124912261963, "rewards/VisualizationJSONCombinedORM/std": 0.30514559149742126, "step": 2701, "train_speed(iter/s)": 0.246815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 286.0625, "completions/min_length": 234.0, "epoch": 2.2349048800661704, "grad_norm": 0.24597451090812683, "kl": 0.1142578125, "learning_rate": 1.854146104285306e-06, "loss": 0.0011462867259979248, "memory(GiB)": 38.13, "reward": 0.34024447202682495, "reward_std": 0.05907205492258072, "rewards/VisualizationJSONCombinedORM/mean": 0.34024447202682495, "rewards/VisualizationJSONCombinedORM/std": 0.05928725376725197, "step": 2702, "train_speed(iter/s)": 0.246431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 292.9375, "completions/min_length": 254.0, "epoch": 2.235732009925558, "grad_norm": 0.14489495754241943, "kl": 0.0684814453125, "learning_rate": 1.8504069728972124e-06, "loss": 0.0006847381591796875, "memory(GiB)": 38.13, "reward": 0.5299575924873352, "reward_std": 0.03962979093194008, "rewards/VisualizationJSONCombinedORM/mean": 0.5299575924873352, "rewards/VisualizationJSONCombinedORM/std": 0.23839271068572998, "step": 2703, "train_speed(iter/s)": 0.245943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 316.25, "completions/min_length": 249.0, "epoch": 2.236559139784946, "grad_norm": 0.22715894877910614, "kl": 0.07061767578125, "learning_rate": 1.846670759297915e-06, "loss": 0.0007045045495033264, "memory(GiB)": 38.13, "reward": 0.5000455379486084, "reward_std": 0.07011807709932327, "rewards/VisualizationJSONCombinedORM/mean": 0.5000455379486084, "rewards/VisualizationJSONCombinedORM/std": 0.11495190113782883, "step": 2704, "train_speed(iter/s)": 0.245409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 299.4375, "completions/min_length": 240.0, "epoch": 2.2373862696443343, "grad_norm": 0.18716198205947876, "kl": 0.0938720703125, "learning_rate": 1.8429374669486434e-06, "loss": 0.0009372532367706299, "memory(GiB)": 38.13, "reward": 0.48086482286453247, "reward_std": 0.06497594714164734, "rewards/VisualizationJSONCombinedORM/mean": 0.48086482286453247, "rewards/VisualizationJSONCombinedORM/std": 0.09520591050386429, "step": 2705, "train_speed(iter/s)": 0.24491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 299.625, "completions/min_length": 234.0, "epoch": 2.238213399503722, "grad_norm": 0.18654578924179077, "kl": 0.05712890625, "learning_rate": 1.8392070993079326e-06, "loss": 0.000571124255657196, "memory(GiB)": 38.13, "reward": 0.6648324728012085, "reward_std": 0.08661177009344101, "rewards/VisualizationJSONCombinedORM/mean": 0.6648324728012085, "rewards/VisualizationJSONCombinedORM/std": 0.11221015453338623, "step": 2706, "train_speed(iter/s)": 0.244488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 306.6875, "completions/min_length": 205.0, "epoch": 2.23904052936311, "grad_norm": 0.2301725298166275, "kl": 0.0948486328125, "learning_rate": 1.8354796598315977e-06, "loss": 0.0009467490017414093, "memory(GiB)": 38.13, "reward": 0.5792464017868042, "reward_std": 0.035767119377851486, "rewards/VisualizationJSONCombinedORM/mean": 0.5792464017868042, "rewards/VisualizationJSONCombinedORM/std": 0.2285049855709076, "step": 2707, "train_speed(iter/s)": 0.244005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 311.3125, "completions/min_length": 246.0, "epoch": 2.2398676592224978, "grad_norm": 0.23118656873703003, "kl": 0.0640869140625, "learning_rate": 1.831755151972749e-06, "loss": 0.0006406232714653015, "memory(GiB)": 38.13, "reward": 0.6291691064834595, "reward_std": 0.11075451970100403, "rewards/VisualizationJSONCombinedORM/mean": 0.6291691064834595, "rewards/VisualizationJSONCombinedORM/std": 0.12866714596748352, "step": 2708, "train_speed(iter/s)": 0.243366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 311.5625, "completions/min_length": 253.0, "epoch": 2.240694789081886, "grad_norm": 0.20247621834278107, "kl": 0.1025390625, "learning_rate": 1.8280335791817733e-06, "loss": 0.001025766134262085, "memory(GiB)": 38.13, "reward": 0.3245086669921875, "reward_std": 0.0424574576318264, "rewards/VisualizationJSONCombinedORM/mean": 0.3245086669921875, "rewards/VisualizationJSONCombinedORM/std": 0.07886732369661331, "step": 2709, "train_speed(iter/s)": 0.242951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 308.75, "completions/min_length": 244.0, "epoch": 2.241521918941274, "grad_norm": 0.19834193587303162, "kl": 0.082763671875, "learning_rate": 1.8243149449063452e-06, "loss": 0.000828959047794342, "memory(GiB)": 38.13, "reward": 0.42638805508613586, "reward_std": 0.046074412763118744, "rewards/VisualizationJSONCombinedORM/mean": 0.42638805508613586, "rewards/VisualizationJSONCombinedORM/std": 0.2516002058982849, "step": 2710, "train_speed(iter/s)": 0.242503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 312.625, "completions/min_length": 260.0, "epoch": 2.2423490488006617, "grad_norm": 0.16714882850646973, "kl": 0.075927734375, "learning_rate": 1.8205992525914135e-06, "loss": 0.0007592476904392242, "memory(GiB)": 38.13, "reward": 0.656495988368988, "reward_std": 0.04409988969564438, "rewards/VisualizationJSONCombinedORM/mean": 0.656495988368988, "rewards/VisualizationJSONCombinedORM/std": 0.14608238637447357, "step": 2711, "train_speed(iter/s)": 0.242045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/mean_length": 357.4375, "completions/min_length": 267.0, "epoch": 2.2431761786600495, "grad_norm": 0.16326342523097992, "kl": 0.11407470703125, "learning_rate": 1.8168865056792029e-06, "loss": 0.001144368201494217, "memory(GiB)": 38.13, "reward": 0.6740810871124268, "reward_std": 0.10327280312776566, "rewards/VisualizationJSONCombinedORM/mean": 0.6740810871124268, "rewards/VisualizationJSONCombinedORM/std": 0.13182801008224487, "step": 2712, "train_speed(iter/s)": 0.241543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 306.6875, "completions/min_length": 257.0, "epoch": 2.2440033085194377, "grad_norm": 0.20497865974903107, "kl": 0.13330078125, "learning_rate": 1.813176707609211e-06, "loss": 0.0013334201648831367, "memory(GiB)": 38.13, "reward": 0.6253560781478882, "reward_std": 0.06911017745733261, "rewards/VisualizationJSONCombinedORM/mean": 0.6253560781478882, "rewards/VisualizationJSONCombinedORM/std": 0.13437266647815704, "step": 2713, "train_speed(iter/s)": 0.241098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 303.75, "completions/min_length": 239.0, "epoch": 2.2448304383788256, "grad_norm": 0.22724050283432007, "kl": 0.0858154296875, "learning_rate": 1.8094698618181972e-06, "loss": 0.0008570030331611633, "memory(GiB)": 38.13, "reward": 0.6157461404800415, "reward_std": 0.0786571204662323, "rewards/VisualizationJSONCombinedORM/mean": 0.6157461404800415, "rewards/VisualizationJSONCombinedORM/std": 0.21165712177753448, "step": 2714, "train_speed(iter/s)": 0.240635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 314.375, "completions/min_length": 244.0, "epoch": 2.2456575682382134, "grad_norm": 0.1595141440629959, "kl": 0.05511474609375, "learning_rate": 1.8057659717401948e-06, "loss": 0.0005525778979063034, "memory(GiB)": 38.13, "reward": 0.5794550776481628, "reward_std": 0.047516822814941406, "rewards/VisualizationJSONCombinedORM/mean": 0.5794550776481628, "rewards/VisualizationJSONCombinedORM/std": 0.20539160072803497, "step": 2715, "train_speed(iter/s)": 0.240149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 285.5625, "completions/min_length": 221.0, "epoch": 2.246484698097601, "grad_norm": 0.20322482287883759, "kl": 0.0592041015625, "learning_rate": 1.8020650408064898e-06, "loss": 0.0005932934582233429, "memory(GiB)": 38.13, "reward": 0.46331632137298584, "reward_std": 0.05872470885515213, "rewards/VisualizationJSONCombinedORM/mean": 0.46331632137298584, "rewards/VisualizationJSONCombinedORM/std": 0.1360587179660797, "step": 2716, "train_speed(iter/s)": 0.23971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 323.5625, "completions/min_length": 253.0, "epoch": 2.247311827956989, "grad_norm": 0.1631532609462738, "kl": 0.0782470703125, "learning_rate": 1.7983670724456342e-06, "loss": 0.0007832478731870651, "memory(GiB)": 38.13, "reward": 0.4201076924800873, "reward_std": 0.05406242609024048, "rewards/VisualizationJSONCombinedORM/mean": 0.4201076924800873, "rewards/VisualizationJSONCombinedORM/std": 0.11147621273994446, "step": 2717, "train_speed(iter/s)": 0.239207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 313.5625, "completions/min_length": 236.0, "epoch": 2.2481389578163773, "grad_norm": 0.21960386633872986, "kl": 0.08203125, "learning_rate": 1.7946720700834324e-06, "loss": 0.0008198879659175873, "memory(GiB)": 38.13, "reward": 0.4504343569278717, "reward_std": 0.05773387849330902, "rewards/VisualizationJSONCombinedORM/mean": 0.4504343569278717, "rewards/VisualizationJSONCombinedORM/std": 0.07450972497463226, "step": 2718, "train_speed(iter/s)": 0.238772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 289.375, "completions/min_length": 240.0, "epoch": 2.248966087675765, "grad_norm": 0.1615343540906906, "kl": 0.029937744140625, "learning_rate": 1.7909800371429425e-06, "loss": 0.00029921531677246094, "memory(GiB)": 38.13, "reward": 0.5667482614517212, "reward_std": 0.05404715985059738, "rewards/VisualizationJSONCombinedORM/mean": 0.5667482614517212, "rewards/VisualizationJSONCombinedORM/std": 0.22310790419578552, "step": 2719, "train_speed(iter/s)": 0.238292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 317.6875, "completions/min_length": 229.0, "epoch": 2.249793217535153, "grad_norm": 0.1750611513853073, "kl": 0.04803466796875, "learning_rate": 1.787290977044468e-06, "loss": 0.0004803687334060669, "memory(GiB)": 38.13, "reward": 0.692215085029602, "reward_std": 0.10110118985176086, "rewards/VisualizationJSONCombinedORM/mean": 0.692215085029602, "rewards/VisualizationJSONCombinedORM/std": 0.14311441779136658, "step": 2720, "train_speed(iter/s)": 0.237869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 317.5625, "completions/min_length": 245.0, "epoch": 2.2506203473945408, "grad_norm": 0.260862797498703, "kl": 0.04315185546875, "learning_rate": 1.7836048932055643e-06, "loss": 0.00043182820081710815, "memory(GiB)": 38.13, "reward": 0.6367958784103394, "reward_std": 0.1005239337682724, "rewards/VisualizationJSONCombinedORM/mean": 0.6367958784103394, "rewards/VisualizationJSONCombinedORM/std": 0.18112608790397644, "step": 2721, "train_speed(iter/s)": 0.237437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 317.9375, "completions/min_length": 244.0, "epoch": 2.251447477253929, "grad_norm": 0.20746451616287231, "kl": 0.090087890625, "learning_rate": 1.7799217890410208e-06, "loss": 0.0009017437696456909, "memory(GiB)": 38.13, "reward": 0.49268975853919983, "reward_std": 0.05759410187602043, "rewards/VisualizationJSONCombinedORM/mean": 0.49268975853919983, "rewards/VisualizationJSONCombinedORM/std": 0.2231437861919403, "step": 2722, "train_speed(iter/s)": 0.237073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 330.0625, "completions/min_length": 286.0, "epoch": 2.252274607113317, "grad_norm": 0.18584680557250977, "kl": 0.0712890625, "learning_rate": 1.7762416679628792e-06, "loss": 0.0007126554846763611, "memory(GiB)": 38.13, "reward": 0.5624674558639526, "reward_std": 0.07661764323711395, "rewards/VisualizationJSONCombinedORM/mean": 0.5624674558639526, "rewards/VisualizationJSONCombinedORM/std": 0.18789920210838318, "step": 2723, "train_speed(iter/s)": 0.236592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 307.8125, "completions/min_length": 243.0, "epoch": 2.2531017369727047, "grad_norm": 0.17270377278327942, "kl": 0.04998779296875, "learning_rate": 1.7725645333804054e-06, "loss": 0.0005001835525035858, "memory(GiB)": 38.13, "reward": 0.5918203592300415, "reward_std": 0.07890203595161438, "rewards/VisualizationJSONCombinedORM/mean": 0.5918203592300415, "rewards/VisualizationJSONCombinedORM/std": 0.17499889433383942, "step": 2724, "train_speed(iter/s)": 0.236128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 320.25, "completions/min_length": 242.0, "epoch": 2.2539288668320925, "grad_norm": 0.21893365681171417, "kl": 0.04083251953125, "learning_rate": 1.7688903887001064e-06, "loss": 0.00040945783257484436, "memory(GiB)": 38.13, "reward": 0.7131199240684509, "reward_std": 0.08028340339660645, "rewards/VisualizationJSONCombinedORM/mean": 0.7131199240684509, "rewards/VisualizationJSONCombinedORM/std": 0.10533156245946884, "step": 2725, "train_speed(iter/s)": 0.235677 }, { "epoch": 2.2539288668320925, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 370.0833333333333, "eval_completions/mean_length": 310.7708333333333, "eval_completions/min_length": 253.58333333333334, "eval_kl": 0.08060709635416667, "eval_loss": 0.0008079732651822269, "eval_reward": 0.47097960362831753, "eval_reward_std": 0.06313557270914316, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47097960362831753, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06313557387329638, "eval_runtime": 313.8478, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 300.6875, "completions/min_length": 228.0, "epoch": 2.2547559966914807, "grad_norm": 0.2621224522590637, "kl": 0.04425048828125, "learning_rate": 1.7652192373257137e-06, "loss": 0.0004431307315826416, "memory(GiB)": 38.13, "reward": 0.5699970126152039, "reward_std": 0.09975247085094452, "rewards/VisualizationJSONCombinedORM/mean": 0.5699970126152039, "rewards/VisualizationJSONCombinedORM/std": 0.10430458933115005, "step": 2726, "train_speed(iter/s)": 0.228994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 328.8125, "completions/min_length": 260.0, "epoch": 2.2555831265508686, "grad_norm": 0.1927294135093689, "kl": 0.06500244140625, "learning_rate": 1.7615510826581906e-06, "loss": 0.0006500966846942902, "memory(GiB)": 38.13, "reward": 0.4395056962966919, "reward_std": 0.03855019435286522, "rewards/VisualizationJSONCombinedORM/mean": 0.4395056962966919, "rewards/VisualizationJSONCombinedORM/std": 0.04828384518623352, "step": 2727, "train_speed(iter/s)": 0.228534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 296.125, "completions/min_length": 239.0, "epoch": 2.2564102564102564, "grad_norm": 0.15444537997245789, "kl": 0.0723876953125, "learning_rate": 1.7578859280957216e-06, "loss": 0.0007246322929859161, "memory(GiB)": 38.13, "reward": 0.401441365480423, "reward_std": 0.08734947443008423, "rewards/VisualizationJSONCombinedORM/mean": 0.401441365480423, "rewards/VisualizationJSONCombinedORM/std": 0.08735301345586777, "step": 2728, "train_speed(iter/s)": 0.228029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 318.0625, "completions/min_length": 242.0, "epoch": 2.257237386269644, "grad_norm": 0.17213210463523865, "kl": 0.082763671875, "learning_rate": 1.7542237770337151e-06, "loss": 0.0008278489112854004, "memory(GiB)": 38.13, "reward": 0.7288406491279602, "reward_std": 0.0669335350394249, "rewards/VisualizationJSONCombinedORM/mean": 0.7288406491279602, "rewards/VisualizationJSONCombinedORM/std": 0.07036753743886948, "step": 2729, "train_speed(iter/s)": 0.227701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 293.0625, "completions/min_length": 222.0, "epoch": 2.258064516129032, "grad_norm": 0.13782340288162231, "kl": 0.03631591796875, "learning_rate": 1.7505646328647913e-06, "loss": 0.00036296620965003967, "memory(GiB)": 38.13, "reward": 0.7681139707565308, "reward_std": 0.08220396190881729, "rewards/VisualizationJSONCombinedORM/mean": 0.7681139707565308, "rewards/VisualizationJSONCombinedORM/std": 0.09433358162641525, "step": 2730, "train_speed(iter/s)": 0.227385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 323.5, "completions/min_length": 263.0, "epoch": 2.2588916459884203, "grad_norm": 0.17037147283554077, "kl": 0.05145263671875, "learning_rate": 1.746908498978791e-06, "loss": 0.0005138441920280457, "memory(GiB)": 38.13, "reward": 0.6627068519592285, "reward_std": 0.06389543414115906, "rewards/VisualizationJSONCombinedORM/mean": 0.6627068519592285, "rewards/VisualizationJSONCombinedORM/std": 0.16316655278205872, "step": 2731, "train_speed(iter/s)": 0.226938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 339.75, "completions/min_length": 267.0, "epoch": 2.259718775847808, "grad_norm": 0.19442898035049438, "kl": 0.05743408203125, "learning_rate": 1.7432553787627598e-06, "loss": 0.0005747601389884949, "memory(GiB)": 38.13, "reward": 0.5094691514968872, "reward_std": 0.06231047213077545, "rewards/VisualizationJSONCombinedORM/mean": 0.5094691514968872, "rewards/VisualizationJSONCombinedORM/std": 0.2107740193605423, "step": 2732, "train_speed(iter/s)": 0.226592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 306.75, "completions/min_length": 244.0, "epoch": 2.260545905707196, "grad_norm": 0.20969074964523315, "kl": 0.06134033203125, "learning_rate": 1.7396052756009574e-06, "loss": 0.0006143674254417419, "memory(GiB)": 38.13, "reward": 0.30080610513687134, "reward_std": 0.04795917123556137, "rewards/VisualizationJSONCombinedORM/mean": 0.30080610513687134, "rewards/VisualizationJSONCombinedORM/std": 0.1656048595905304, "step": 2733, "train_speed(iter/s)": 0.226189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 293.75, "completions/min_length": 231.0, "epoch": 2.261373035566584, "grad_norm": 0.23299041390419006, "kl": 0.09033203125, "learning_rate": 1.7359581928748454e-06, "loss": 0.0009021162986755371, "memory(GiB)": 38.13, "reward": 0.41627633571624756, "reward_std": 0.06545114517211914, "rewards/VisualizationJSONCombinedORM/mean": 0.41627633571624756, "rewards/VisualizationJSONCombinedORM/std": 0.07100298255681992, "step": 2734, "train_speed(iter/s)": 0.225759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 296.4375, "completions/min_length": 231.0, "epoch": 2.262200165425972, "grad_norm": 0.20724999904632568, "kl": 0.0478515625, "learning_rate": 1.7323141339630905e-06, "loss": 0.00047770142555236816, "memory(GiB)": 38.13, "reward": 0.6538698673248291, "reward_std": 0.09016630053520203, "rewards/VisualizationJSONCombinedORM/mean": 0.6538698673248291, "rewards/VisualizationJSONCombinedORM/std": 0.09562558680772781, "step": 2735, "train_speed(iter/s)": 0.225323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 323.8125, "completions/min_length": 243.0, "epoch": 2.26302729528536, "grad_norm": 0.18234162032604218, "kl": 0.0736083984375, "learning_rate": 1.7286731022415515e-06, "loss": 0.0007359012961387634, "memory(GiB)": 38.13, "reward": 0.5513166189193726, "reward_std": 0.06966114044189453, "rewards/VisualizationJSONCombinedORM/mean": 0.5513166189193726, "rewards/VisualizationJSONCombinedORM/std": 0.07561109215021133, "step": 2736, "train_speed(iter/s)": 0.224938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 308.6875, "completions/min_length": 251.0, "epoch": 2.2638544251447477, "grad_norm": 0.1733151525259018, "kl": 0.064453125, "learning_rate": 1.7250351010832906e-06, "loss": 0.0006457958370447159, "memory(GiB)": 38.13, "reward": 0.3072221577167511, "reward_std": 0.04368060827255249, "rewards/VisualizationJSONCombinedORM/mean": 0.3072221577167511, "rewards/VisualizationJSONCombinedORM/std": 0.05800142139196396, "step": 2737, "train_speed(iter/s)": 0.224494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 296.125, "completions/min_length": 236.0, "epoch": 2.2646815550041355, "grad_norm": 0.235521599650383, "kl": 0.06787109375, "learning_rate": 1.7214001338585533e-06, "loss": 0.0006799865514039993, "memory(GiB)": 38.13, "reward": 0.3669772148132324, "reward_std": 0.0550268217921257, "rewards/VisualizationJSONCombinedORM/mean": 0.3669772148132324, "rewards/VisualizationJSONCombinedORM/std": 0.12045019119977951, "step": 2738, "train_speed(iter/s)": 0.224176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 317.375, "completions/min_length": 248.0, "epoch": 2.2655086848635237, "grad_norm": 0.16477110981941223, "kl": 0.0533447265625, "learning_rate": 1.7177682039347875e-06, "loss": 0.0005332604050636292, "memory(GiB)": 38.13, "reward": 0.4533141851425171, "reward_std": 0.04401706904172897, "rewards/VisualizationJSONCombinedORM/mean": 0.4533141851425171, "rewards/VisualizationJSONCombinedORM/std": 0.049269288778305054, "step": 2739, "train_speed(iter/s)": 0.223789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 299.25, "completions/min_length": 230.0, "epoch": 2.2663358147229116, "grad_norm": 0.2093980461359024, "kl": 0.0760498046875, "learning_rate": 1.7141393146766145e-06, "loss": 0.0007626973092556, "memory(GiB)": 38.13, "reward": 0.5268179178237915, "reward_std": 0.09562726318836212, "rewards/VisualizationJSONCombinedORM/mean": 0.5268179178237915, "rewards/VisualizationJSONCombinedORM/std": 0.09831050038337708, "step": 2740, "train_speed(iter/s)": 0.223465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 277.8125, "completions/min_length": 225.0, "epoch": 2.2671629445822994, "grad_norm": 0.17461691796779633, "kl": 0.042236328125, "learning_rate": 1.710513469445847e-06, "loss": 0.00042144395411014557, "memory(GiB)": 38.13, "reward": 0.6933311223983765, "reward_std": 0.08001280575990677, "rewards/VisualizationJSONCombinedORM/mean": 0.6933311223983765, "rewards/VisualizationJSONCombinedORM/std": 0.09158886969089508, "step": 2741, "train_speed(iter/s)": 0.223135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 276.625, "completions/min_length": 228.0, "epoch": 2.267990074441687, "grad_norm": 0.17800644040107727, "kl": 0.0557861328125, "learning_rate": 1.706890671601471e-06, "loss": 0.0005580112338066101, "memory(GiB)": 38.13, "reward": 0.6437706351280212, "reward_std": 0.09067703783512115, "rewards/VisualizationJSONCombinedORM/mean": 0.6437706351280212, "rewards/VisualizationJSONCombinedORM/std": 0.13058434426784515, "step": 2742, "train_speed(iter/s)": 0.222766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 308.1875, "completions/min_length": 246.0, "epoch": 2.268817204301075, "grad_norm": 0.17222149670124054, "kl": 0.071533203125, "learning_rate": 1.7032709244996559e-06, "loss": 0.0007144398987293243, "memory(GiB)": 38.13, "reward": 0.7062585949897766, "reward_std": 0.052333153784275055, "rewards/VisualizationJSONCombinedORM/mean": 0.7062585949897766, "rewards/VisualizationJSONCombinedORM/std": 0.05323881655931473, "step": 2743, "train_speed(iter/s)": 0.222325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 322.25, "completions/min_length": 282.0, "epoch": 2.2696443341604633, "grad_norm": 0.2082212120294571, "kl": 0.05853271484375, "learning_rate": 1.6996542314937413e-06, "loss": 0.0005841273814439774, "memory(GiB)": 38.13, "reward": 0.4897916316986084, "reward_std": 0.07427215576171875, "rewards/VisualizationJSONCombinedORM/mean": 0.4897916316986084, "rewards/VisualizationJSONCombinedORM/std": 0.15127518773078918, "step": 2744, "train_speed(iter/s)": 0.221961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/mean_length": 330.5, "completions/min_length": 240.0, "epoch": 2.270471464019851, "grad_norm": 0.16013114154338837, "kl": 0.029296875, "learning_rate": 1.6960405959342402e-06, "loss": 0.0002929121255874634, "memory(GiB)": 38.13, "reward": 0.5732696056365967, "reward_std": 0.06950898468494415, "rewards/VisualizationJSONCombinedORM/mean": 0.5732696056365967, "rewards/VisualizationJSONCombinedORM/std": 0.13174039125442505, "step": 2745, "train_speed(iter/s)": 0.221476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 295.5, "completions/min_length": 217.0, "epoch": 2.271298593879239, "grad_norm": 0.24641546607017517, "kl": 0.06878662109375, "learning_rate": 1.6924300211688277e-06, "loss": 0.0006887689232826233, "memory(GiB)": 38.13, "reward": 0.5862913727760315, "reward_std": 0.13755233585834503, "rewards/VisualizationJSONCombinedORM/mean": 0.5862913727760315, "rewards/VisualizationJSONCombinedORM/std": 0.15807855129241943, "step": 2746, "train_speed(iter/s)": 0.221119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 310.25, "completions/min_length": 246.0, "epoch": 2.272125723738627, "grad_norm": 0.1857219636440277, "kl": 0.0479736328125, "learning_rate": 1.6888225105423505e-06, "loss": 0.00047900713980197906, "memory(GiB)": 38.13, "reward": 0.6177698969841003, "reward_std": 0.06940218061208725, "rewards/VisualizationJSONCombinedORM/mean": 0.6177698969841003, "rewards/VisualizationJSONCombinedORM/std": 0.08482911437749863, "step": 2747, "train_speed(iter/s)": 0.220694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 344.25, "completions/min_length": 270.0, "epoch": 2.272952853598015, "grad_norm": 0.19804461300373077, "kl": 0.0625, "learning_rate": 1.6852180673968093e-06, "loss": 0.0006256625056266785, "memory(GiB)": 38.13, "reward": 0.4789806604385376, "reward_std": 0.07516927272081375, "rewards/VisualizationJSONCombinedORM/mean": 0.4789806604385376, "rewards/VisualizationJSONCombinedORM/std": 0.09343335032463074, "step": 2748, "train_speed(iter/s)": 0.220233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 298.6875, "completions/min_length": 245.0, "epoch": 2.273779983457403, "grad_norm": 0.19096779823303223, "kl": 0.05804443359375, "learning_rate": 1.6816166950713697e-06, "loss": 0.0005808304995298386, "memory(GiB)": 38.13, "reward": 0.548365592956543, "reward_std": 0.14798414707183838, "rewards/VisualizationJSONCombinedORM/mean": 0.548365592956543, "rewards/VisualizationJSONCombinedORM/std": 0.15352191030979156, "step": 2749, "train_speed(iter/s)": 0.219889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 312.25, "completions/min_length": 254.0, "epoch": 2.2746071133167907, "grad_norm": 0.17319458723068237, "kl": 0.04510498046875, "learning_rate": 1.6780183969023483e-06, "loss": 0.0004524514079093933, "memory(GiB)": 38.13, "reward": 0.670997142791748, "reward_std": 0.057969674468040466, "rewards/VisualizationJSONCombinedORM/mean": 0.670997142791748, "rewards/VisualizationJSONCombinedORM/std": 0.08841472119092941, "step": 2750, "train_speed(iter/s)": 0.219489 }, { "epoch": 2.2746071133167907, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 367.5, "eval_completions/mean_length": 308.8020833333333, "eval_completions/min_length": 257.0833333333333, "eval_kl": 0.067901611328125, "eval_loss": 0.0006785169243812561, "eval_reward": 0.4560042954981327, "eval_reward_std": 0.05736202707824608, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4560042954981327, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05736202715585629, "eval_runtime": 312.4352, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 294.6875, "completions/min_length": 234.0, "epoch": 2.2754342431761785, "grad_norm": 0.20540475845336914, "kl": 0.0513916015625, "learning_rate": 1.6744231762232178e-06, "loss": 0.0005142912268638611, "memory(GiB)": 38.13, "reward": 0.46423131227493286, "reward_std": 0.06225786358118057, "rewards/VisualizationJSONCombinedORM/mean": 0.46423131227493286, "rewards/VisualizationJSONCombinedORM/std": 0.081108957529068, "step": 2751, "train_speed(iter/s)": 0.213791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 311.625, "completions/min_length": 247.0, "epoch": 2.2762613730355667, "grad_norm": 0.17786705493927002, "kl": 0.06817626953125, "learning_rate": 1.6708310363645936e-06, "loss": 0.000680580735206604, "memory(GiB)": 38.13, "reward": 0.6310787200927734, "reward_std": 0.03875007480382919, "rewards/VisualizationJSONCombinedORM/mean": 0.6310787200927734, "rewards/VisualizationJSONCombinedORM/std": 0.09558381885290146, "step": 2752, "train_speed(iter/s)": 0.213419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 300.0625, "completions/min_length": 248.0, "epoch": 2.2770885028949546, "grad_norm": 0.16559860110282898, "kl": 0.04461669921875, "learning_rate": 1.6672419806542433e-06, "loss": 0.00044611096382141113, "memory(GiB)": 38.13, "reward": 0.6941552758216858, "reward_std": 0.03839698061347008, "rewards/VisualizationJSONCombinedORM/mean": 0.6941552758216858, "rewards/VisualizationJSONCombinedORM/std": 0.08036991208791733, "step": 2753, "train_speed(iter/s)": 0.213073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 331.4375, "completions/min_length": 224.0, "epoch": 2.2779156327543424, "grad_norm": 0.16134850680828094, "kl": 0.0777587890625, "learning_rate": 1.6636560124170713e-06, "loss": 0.0007781982421875, "memory(GiB)": 38.13, "reward": 0.4350960850715637, "reward_std": 0.056738369166851044, "rewards/VisualizationJSONCombinedORM/mean": 0.4350960850715637, "rewards/VisualizationJSONCombinedORM/std": 0.22269809246063232, "step": 2754, "train_speed(iter/s)": 0.212632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 281.9375, "completions/min_length": 222.0, "epoch": 2.27874276261373, "grad_norm": 0.17443321645259857, "kl": 0.05364990234375, "learning_rate": 1.6600731349751303e-06, "loss": 0.0005360394716262817, "memory(GiB)": 38.13, "reward": 0.4641135334968567, "reward_std": 0.037005092948675156, "rewards/VisualizationJSONCombinedORM/mean": 0.4641135334968567, "rewards/VisualizationJSONCombinedORM/std": 0.324684202671051, "step": 2755, "train_speed(iter/s)": 0.212339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 302.8125, "completions/min_length": 218.0, "epoch": 2.279569892473118, "grad_norm": 0.18325388431549072, "kl": 0.06207275390625, "learning_rate": 1.6564933516476012e-06, "loss": 0.0006200596690177917, "memory(GiB)": 38.13, "reward": 0.5146816968917847, "reward_std": 0.09839023649692535, "rewards/VisualizationJSONCombinedORM/mean": 0.5146816968917847, "rewards/VisualizationJSONCombinedORM/std": 0.13827702403068542, "step": 2756, "train_speed(iter/s)": 0.211978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 293.6875, "completions/min_length": 226.0, "epoch": 2.2803970223325063, "grad_norm": 0.21067291498184204, "kl": 0.12109375, "learning_rate": 1.6529166657508033e-06, "loss": 0.0012116096913814545, "memory(GiB)": 38.13, "reward": 0.4639981985092163, "reward_std": 0.045475251972675323, "rewards/VisualizationJSONCombinedORM/mean": 0.4639981985092163, "rewards/VisualizationJSONCombinedORM/std": 0.16159893572330475, "step": 2757, "train_speed(iter/s)": 0.21165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 302.0625, "completions/min_length": 226.0, "epoch": 2.281224152191894, "grad_norm": 0.16598981618881226, "kl": 0.0721435546875, "learning_rate": 1.6493430805981836e-06, "loss": 0.0007211491465568542, "memory(GiB)": 38.13, "reward": 0.6666395664215088, "reward_std": 0.06165088713169098, "rewards/VisualizationJSONCombinedORM/mean": 0.6666395664215088, "rewards/VisualizationJSONCombinedORM/std": 0.1040239930152893, "step": 2758, "train_speed(iter/s)": 0.211241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 322.125, "completions/min_length": 253.0, "epoch": 2.282051282051282, "grad_norm": 0.16911180317401886, "kl": 0.07330322265625, "learning_rate": 1.6457725995003182e-06, "loss": 0.0007337778806686401, "memory(GiB)": 38.13, "reward": 0.6570941805839539, "reward_std": 0.08125367760658264, "rewards/VisualizationJSONCombinedORM/mean": 0.6570941805839539, "rewards/VisualizationJSONCombinedORM/std": 0.08003150671720505, "step": 2759, "train_speed(iter/s)": 0.210845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 315.9375, "completions/min_length": 252.0, "epoch": 2.28287841191067, "grad_norm": 0.20094066858291626, "kl": 0.073974609375, "learning_rate": 1.642205225764908e-06, "loss": 0.0007400792092084885, "memory(GiB)": 38.13, "reward": 0.35161975026130676, "reward_std": 0.025059733539819717, "rewards/VisualizationJSONCombinedORM/mean": 0.35161975026130676, "rewards/VisualizationJSONCombinedORM/std": 0.09224677085876465, "step": 2760, "train_speed(iter/s)": 0.210507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 290.5625, "completions/min_length": 235.0, "epoch": 2.283705541770058, "grad_norm": 0.21992136538028717, "kl": 0.056884765625, "learning_rate": 1.6386409626967742e-06, "loss": 0.000569421797990799, "memory(GiB)": 38.13, "reward": 0.39953044056892395, "reward_std": 0.06942348182201385, "rewards/VisualizationJSONCombinedORM/mean": 0.39953044056892395, "rewards/VisualizationJSONCombinedORM/std": 0.06786172837018967, "step": 2761, "train_speed(iter/s)": 0.210128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 336.375, "completions/min_length": 243.0, "epoch": 2.284532671629446, "grad_norm": 0.1701258420944214, "kl": 0.05419921875, "learning_rate": 1.635079813597859e-06, "loss": 0.0005424618721008301, "memory(GiB)": 38.13, "reward": 0.5024218559265137, "reward_std": 0.07727357745170593, "rewards/VisualizationJSONCombinedORM/mean": 0.5024218559265137, "rewards/VisualizationJSONCombinedORM/std": 0.1580357551574707, "step": 2762, "train_speed(iter/s)": 0.209743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 299.5, "completions/min_length": 233.0, "epoch": 2.2853598014888337, "grad_norm": 0.23097966611385345, "kl": 0.08001708984375, "learning_rate": 1.6315217817672142e-06, "loss": 0.0008019208908081055, "memory(GiB)": 38.13, "reward": 0.3703986406326294, "reward_std": 0.08125177025794983, "rewards/VisualizationJSONCombinedORM/mean": 0.3703986406326294, "rewards/VisualizationJSONCombinedORM/std": 0.11460787802934647, "step": 2763, "train_speed(iter/s)": 0.209427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 338.5625, "completions/min_length": 233.0, "epoch": 2.2861869313482215, "grad_norm": 0.22139562666416168, "kl": 0.07135009765625, "learning_rate": 1.6279668705010094e-06, "loss": 0.000713638961315155, "memory(GiB)": 38.13, "reward": 0.40716561675071716, "reward_std": 0.037021346390247345, "rewards/VisualizationJSONCombinedORM/mean": 0.40716561675071716, "rewards/VisualizationJSONCombinedORM/std": 0.08200366050004959, "step": 2764, "train_speed(iter/s)": 0.208983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 313.5625, "completions/min_length": 253.0, "epoch": 2.2870140612076097, "grad_norm": 0.18306699395179749, "kl": 0.1097412109375, "learning_rate": 1.624415083092518e-06, "loss": 0.001095835119485855, "memory(GiB)": 38.13, "reward": 0.67529296875, "reward_std": 0.10495324432849884, "rewards/VisualizationJSONCombinedORM/mean": 0.67529296875, "rewards/VisualizationJSONCombinedORM/std": 0.10747110098600388, "step": 2765, "train_speed(iter/s)": 0.208652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 325.75, "completions/min_length": 239.0, "epoch": 2.2878411910669976, "grad_norm": 0.1680283397436142, "kl": 0.0609130859375, "learning_rate": 1.6208664228321254e-06, "loss": 0.0006077587604522705, "memory(GiB)": 38.13, "reward": 0.6195550560951233, "reward_std": 0.08602562546730042, "rewards/VisualizationJSONCombinedORM/mean": 0.6195550560951233, "rewards/VisualizationJSONCombinedORM/std": 0.17781279981136322, "step": 2766, "train_speed(iter/s)": 0.208218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 284.5, "completions/min_length": 217.0, "epoch": 2.2886683209263854, "grad_norm": 0.21087104082107544, "kl": 0.0826416015625, "learning_rate": 1.6173208930073152e-06, "loss": 0.0008275844156742096, "memory(GiB)": 38.13, "reward": 0.4362904727458954, "reward_std": 0.06610013544559479, "rewards/VisualizationJSONCombinedORM/mean": 0.4362904727458954, "rewards/VisualizationJSONCombinedORM/std": 0.10031136870384216, "step": 2767, "train_speed(iter/s)": 0.207896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 322.4375, "completions/min_length": 262.0, "epoch": 2.289495450785773, "grad_norm": 0.1802934855222702, "kl": 0.051513671875, "learning_rate": 1.6137784969026754e-06, "loss": 0.0005155429244041443, "memory(GiB)": 38.13, "reward": 0.534311056137085, "reward_std": 0.06905388832092285, "rewards/VisualizationJSONCombinedORM/mean": 0.534311056137085, "rewards/VisualizationJSONCombinedORM/std": 0.06940030306577682, "step": 2768, "train_speed(iter/s)": 0.207568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 327.6875, "completions/min_length": 224.0, "epoch": 2.2903225806451615, "grad_norm": 0.1696787178516388, "kl": 0.08172607421875, "learning_rate": 1.610239237799885e-06, "loss": 0.0008171908557415009, "memory(GiB)": 38.13, "reward": 0.5148176550865173, "reward_std": 0.05380368232727051, "rewards/VisualizationJSONCombinedORM/mean": 0.5148176550865173, "rewards/VisualizationJSONCombinedORM/std": 0.11681482940912247, "step": 2769, "train_speed(iter/s)": 0.207151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 325.625, "completions/min_length": 262.0, "epoch": 2.2911497105045493, "grad_norm": 0.18078963458538055, "kl": 0.04193115234375, "learning_rate": 1.6067031189777226e-06, "loss": 0.00041956454515457153, "memory(GiB)": 38.13, "reward": 0.5747393369674683, "reward_std": 0.0733739584684372, "rewards/VisualizationJSONCombinedORM/mean": 0.5747393369674683, "rewards/VisualizationJSONCombinedORM/std": 0.11657610535621643, "step": 2770, "train_speed(iter/s)": 0.206847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 350.0625, "completions/min_length": 256.0, "epoch": 2.291976840363937, "grad_norm": 0.16492953896522522, "kl": 0.0777587890625, "learning_rate": 1.6031701437120512e-06, "loss": 0.0007772017270326614, "memory(GiB)": 38.13, "reward": 0.5698398351669312, "reward_std": 0.08259448409080505, "rewards/VisualizationJSONCombinedORM/mean": 0.5698398351669312, "rewards/VisualizationJSONCombinedORM/std": 0.21188433468341827, "step": 2771, "train_speed(iter/s)": 0.206526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 322.5, "completions/min_length": 241.0, "epoch": 2.292803970223325, "grad_norm": 0.1835968792438507, "kl": 0.05126953125, "learning_rate": 1.5996403152758315e-06, "loss": 0.0005113333463668823, "memory(GiB)": 38.13, "reward": 0.6862926483154297, "reward_std": 0.07493665814399719, "rewards/VisualizationJSONCombinedORM/mean": 0.6862926483154297, "rewards/VisualizationJSONCombinedORM/std": 0.10221388190984726, "step": 2772, "train_speed(iter/s)": 0.206211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 324.5625, "completions/min_length": 260.0, "epoch": 2.293631100082713, "grad_norm": 0.19395212829113007, "kl": 0.0433349609375, "learning_rate": 1.5961136369390995e-06, "loss": 0.0004345513880252838, "memory(GiB)": 38.13, "reward": 0.42669937014579773, "reward_std": 0.05690651759505272, "rewards/VisualizationJSONCombinedORM/mean": 0.42669937014579773, "rewards/VisualizationJSONCombinedORM/std": 0.05961104854941368, "step": 2773, "train_speed(iter/s)": 0.205931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 305.5625, "completions/min_length": 242.0, "epoch": 2.294458229942101, "grad_norm": 0.1982807070016861, "kl": 0.110107421875, "learning_rate": 1.592590111968978e-06, "loss": 0.0011055916547775269, "memory(GiB)": 38.13, "reward": 0.32635754346847534, "reward_std": 0.05971888452768326, "rewards/VisualizationJSONCombinedORM/mean": 0.32635754346847534, "rewards/VisualizationJSONCombinedORM/std": 0.10671035200357437, "step": 2774, "train_speed(iter/s)": 0.205616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 312.6875, "completions/min_length": 272.0, "epoch": 2.295285359801489, "grad_norm": 0.17722013592720032, "kl": 0.05584716796875, "learning_rate": 1.5890697436296648e-06, "loss": 0.0005589015781879425, "memory(GiB)": 38.13, "reward": 0.694556474685669, "reward_std": 0.06286921352148056, "rewards/VisualizationJSONCombinedORM/mean": 0.694556474685669, "rewards/VisualizationJSONCombinedORM/std": 0.16237786412239075, "step": 2775, "train_speed(iter/s)": 0.205211 }, { "epoch": 2.295285359801489, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 371.9583333333333, "eval_completions/mean_length": 310.9583333333333, "eval_completions/min_length": 255.33333333333334, "eval_kl": 0.07621256510416667, "eval_loss": 0.0007700836285948753, "eval_reward": 0.4756703202923139, "eval_reward_std": 0.061709469611135624, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4756703202923139, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06170947050365309, "eval_runtime": 315.2684, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 306.3125, "completions/min_length": 243.0, "epoch": 2.2961124896608767, "grad_norm": 0.23933741450309753, "kl": 0.09014892578125, "learning_rate": 1.585552535182437e-06, "loss": 0.0009025074541568756, "memory(GiB)": 38.13, "reward": 0.33006471395492554, "reward_std": 0.05075442045927048, "rewards/VisualizationJSONCombinedORM/mean": 0.33006471395492554, "rewards/VisualizationJSONCombinedORM/std": 0.08320648223161697, "step": 2776, "train_speed(iter/s)": 0.200203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 297.5, "completions/min_length": 235.0, "epoch": 2.2969396195202645, "grad_norm": 0.2089514434337616, "kl": 0.05609130859375, "learning_rate": 1.5820384898856433e-06, "loss": 0.0005609914660453796, "memory(GiB)": 38.13, "reward": 0.4039691686630249, "reward_std": 0.07520073652267456, "rewards/VisualizationJSONCombinedORM/mean": 0.4039691686630249, "rewards/VisualizationJSONCombinedORM/std": 0.10206390917301178, "step": 2777, "train_speed(iter/s)": 0.199899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 312.6875, "completions/min_length": 266.0, "epoch": 2.2977667493796528, "grad_norm": 0.180910125374794, "kl": 0.114990234375, "learning_rate": 1.5785276109947028e-06, "loss": 0.0011492539197206497, "memory(GiB)": 38.13, "reward": 0.5150856971740723, "reward_std": 0.04766594618558884, "rewards/VisualizationJSONCombinedORM/mean": 0.5150856971740723, "rewards/VisualizationJSONCombinedORM/std": 0.2636241316795349, "step": 2778, "train_speed(iter/s)": 0.199583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 293.4375, "completions/min_length": 240.0, "epoch": 2.2985938792390406, "grad_norm": 0.157606303691864, "kl": 0.06695556640625, "learning_rate": 1.575019901762097e-06, "loss": 0.0006709322333335876, "memory(GiB)": 38.13, "reward": 0.7145165205001831, "reward_std": 0.09268045425415039, "rewards/VisualizationJSONCombinedORM/mean": 0.7145165205001831, "rewards/VisualizationJSONCombinedORM/std": 0.09496255964040756, "step": 2779, "train_speed(iter/s)": 0.199238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 312.9375, "completions/min_length": 253.0, "epoch": 2.2994210090984284, "grad_norm": 0.17909923195838928, "kl": 0.0802001953125, "learning_rate": 1.5715153654373782e-06, "loss": 0.00080157071352005, "memory(GiB)": 38.13, "reward": 0.35899627208709717, "reward_std": 0.0777527317404747, "rewards/VisualizationJSONCombinedORM/mean": 0.35899627208709717, "rewards/VisualizationJSONCombinedORM/std": 0.08959643542766571, "step": 2780, "train_speed(iter/s)": 0.198942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 304.625, "completions/min_length": 236.0, "epoch": 2.300248138957816, "grad_norm": 0.14568525552749634, "kl": 0.0439453125, "learning_rate": 1.5680140052671516e-06, "loss": 0.000439448282122612, "memory(GiB)": 38.13, "reward": 0.49878498911857605, "reward_std": 0.05310124531388283, "rewards/VisualizationJSONCombinedORM/mean": 0.49878498911857605, "rewards/VisualizationJSONCombinedORM/std": 0.2402016669511795, "step": 2781, "train_speed(iter/s)": 0.198619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 326.625, "completions/min_length": 258.0, "epoch": 2.3010752688172045, "grad_norm": 0.17158563435077667, "kl": 0.0478515625, "learning_rate": 1.5645158244950853e-06, "loss": 0.0004785582423210144, "memory(GiB)": 38.13, "reward": 0.6866648197174072, "reward_std": 0.06439989805221558, "rewards/VisualizationJSONCombinedORM/mean": 0.6866648197174072, "rewards/VisualizationJSONCombinedORM/std": 0.11500313878059387, "step": 2782, "train_speed(iter/s)": 0.198326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 317.8125, "completions/min_length": 257.0, "epoch": 2.3019023986765923, "grad_norm": 0.16560998558998108, "kl": 0.06732177734375, "learning_rate": 1.5610208263619002e-06, "loss": 0.0006732270121574402, "memory(GiB)": 38.13, "reward": 0.5246757864952087, "reward_std": 0.04451281949877739, "rewards/VisualizationJSONCombinedORM/mean": 0.5246757864952087, "rewards/VisualizationJSONCombinedORM/std": 0.0441417321562767, "step": 2783, "train_speed(iter/s)": 0.198012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 285.5, "completions/min_length": 219.0, "epoch": 2.30272952853598, "grad_norm": 0.18905426561832428, "kl": 0.09039306640625, "learning_rate": 1.5575290141053712e-06, "loss": 0.0009065084159374237, "memory(GiB)": 38.13, "reward": 0.4492145776748657, "reward_std": 0.04063825309276581, "rewards/VisualizationJSONCombinedORM/mean": 0.4492145776748657, "rewards/VisualizationJSONCombinedORM/std": 0.15283484756946564, "step": 2784, "train_speed(iter/s)": 0.197657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 329.75, "completions/min_length": 224.0, "epoch": 2.303556658395368, "grad_norm": 0.1699196994304657, "kl": 0.0521240234375, "learning_rate": 1.5540403909603158e-06, "loss": 0.0005221068859100342, "memory(GiB)": 38.13, "reward": 0.39638322591781616, "reward_std": 0.03493838012218475, "rewards/VisualizationJSONCombinedORM/mean": 0.39638322591781616, "rewards/VisualizationJSONCombinedORM/std": 0.07945287227630615, "step": 2785, "train_speed(iter/s)": 0.197322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 279.5, "completions/min_length": 238.0, "epoch": 2.304383788254756, "grad_norm": 0.159159317612648, "kl": 0.07281494140625, "learning_rate": 1.5505549601586045e-06, "loss": 0.0007293596863746643, "memory(GiB)": 38.13, "reward": 0.5884092450141907, "reward_std": 0.04677724465727806, "rewards/VisualizationJSONCombinedORM/mean": 0.5884092450141907, "rewards/VisualizationJSONCombinedORM/std": 0.14085093140602112, "step": 2786, "train_speed(iter/s)": 0.197109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 299.875, "completions/min_length": 253.0, "epoch": 2.305210918114144, "grad_norm": 0.1687237024307251, "kl": 0.05352783203125, "learning_rate": 1.5470727249291423e-06, "loss": 0.0005354136228561401, "memory(GiB)": 38.13, "reward": 0.7409639954566956, "reward_std": 0.058205097913742065, "rewards/VisualizationJSONCombinedORM/mean": 0.7409639954566956, "rewards/VisualizationJSONCombinedORM/std": 0.06345468014478683, "step": 2787, "train_speed(iter/s)": 0.196832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 312.0, "completions/min_length": 277.0, "epoch": 2.306038047973532, "grad_norm": 0.20433644950389862, "kl": 0.0667724609375, "learning_rate": 1.5435936884978842e-06, "loss": 0.0006667338311672211, "memory(GiB)": 38.13, "reward": 0.5574462413787842, "reward_std": 0.05408701300621033, "rewards/VisualizationJSONCombinedORM/mean": 0.5574462413787842, "rewards/VisualizationJSONCombinedORM/std": 0.21696211397647858, "step": 2788, "train_speed(iter/s)": 0.19658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 304.5625, "completions/min_length": 239.0, "epoch": 2.3068651778329197, "grad_norm": 0.1656820923089981, "kl": 0.0863037109375, "learning_rate": 1.540117854087811e-06, "loss": 0.0008626948110759258, "memory(GiB)": 38.13, "reward": 0.43847930431365967, "reward_std": 0.05215878412127495, "rewards/VisualizationJSONCombinedORM/mean": 0.43847930431365967, "rewards/VisualizationJSONCombinedORM/std": 0.06898587197065353, "step": 2789, "train_speed(iter/s)": 0.196296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 302.0625, "completions/min_length": 242.0, "epoch": 2.3076923076923075, "grad_norm": 0.21152864396572113, "kl": 0.06719970703125, "learning_rate": 1.5366452249189462e-06, "loss": 0.0006723031401634216, "memory(GiB)": 38.13, "reward": 0.5478404760360718, "reward_std": 0.0989612266421318, "rewards/VisualizationJSONCombinedORM/mean": 0.5478404760360718, "rewards/VisualizationJSONCombinedORM/std": 0.11914584040641785, "step": 2790, "train_speed(iter/s)": 0.196008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 272.5625, "completions/min_length": 209.0, "epoch": 2.3085194375516958, "grad_norm": 0.2232143133878708, "kl": 0.0494384765625, "learning_rate": 1.5331758042083355e-06, "loss": 0.0004949420690536499, "memory(GiB)": 38.13, "reward": 0.4726261496543884, "reward_std": 0.07104703783988953, "rewards/VisualizationJSONCombinedORM/mean": 0.4726261496543884, "rewards/VisualizationJSONCombinedORM/std": 0.10255862772464752, "step": 2791, "train_speed(iter/s)": 0.195775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 263.4375, "completions/min_length": 238.0, "epoch": 2.3093465674110836, "grad_norm": 0.14330248534679413, "kl": 0.1978759765625, "learning_rate": 1.5297095951700598e-06, "loss": 0.0019783377647399902, "memory(GiB)": 38.13, "reward": 0.4008501172065735, "reward_std": 0.0446210578083992, "rewards/VisualizationJSONCombinedORM/mean": 0.4008501172065735, "rewards/VisualizationJSONCombinedORM/std": 0.13303008675575256, "step": 2792, "train_speed(iter/s)": 0.195505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 310.625, "completions/min_length": 270.0, "epoch": 2.3101736972704714, "grad_norm": 0.16280466318130493, "kl": 0.0574951171875, "learning_rate": 1.52624660101522e-06, "loss": 0.0005735605955123901, "memory(GiB)": 38.13, "reward": 0.6467204093933105, "reward_std": 0.04026452824473381, "rewards/VisualizationJSONCombinedORM/mean": 0.6467204093933105, "rewards/VisualizationJSONCombinedORM/std": 0.2036646455526352, "step": 2793, "train_speed(iter/s)": 0.195235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 307.0625, "completions/min_length": 260.0, "epoch": 2.311000827129859, "grad_norm": 0.19462205469608307, "kl": 0.04132080078125, "learning_rate": 1.5227868249519423e-06, "loss": 0.0004127994179725647, "memory(GiB)": 38.13, "reward": 0.5441316366195679, "reward_std": 0.09528881311416626, "rewards/VisualizationJSONCombinedORM/mean": 0.5441316366195679, "rewards/VisualizationJSONCombinedORM/std": 0.10360068082809448, "step": 2794, "train_speed(iter/s)": 0.194925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 305.625, "completions/min_length": 249.0, "epoch": 2.3118279569892475, "grad_norm": 0.17568139731884003, "kl": 0.04571533203125, "learning_rate": 1.5193302701853674e-06, "loss": 0.00045657530426979065, "memory(GiB)": 38.13, "reward": 0.5338222980499268, "reward_std": 0.0793101042509079, "rewards/VisualizationJSONCombinedORM/mean": 0.5338222980499268, "rewards/VisualizationJSONCombinedORM/std": 0.17068327963352203, "step": 2795, "train_speed(iter/s)": 0.194656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 308.4375, "completions/min_length": 249.0, "epoch": 2.3126550868486353, "grad_norm": 0.16644617915153503, "kl": 0.0810546875, "learning_rate": 1.5158769399176559e-06, "loss": 0.0008104303851723671, "memory(GiB)": 38.13, "reward": 0.2931205630302429, "reward_std": 0.02981431782245636, "rewards/VisualizationJSONCombinedORM/mean": 0.2931205630302429, "rewards/VisualizationJSONCombinedORM/std": 0.07296611368656158, "step": 2796, "train_speed(iter/s)": 0.194381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 316.8125, "completions/min_length": 242.0, "epoch": 2.313482216708023, "grad_norm": 0.1722581386566162, "kl": 0.085693359375, "learning_rate": 1.5124268373479768e-06, "loss": 0.000857226550579071, "memory(GiB)": 38.13, "reward": 0.7052264213562012, "reward_std": 0.04249628260731697, "rewards/VisualizationJSONCombinedORM/mean": 0.7052264213562012, "rewards/VisualizationJSONCombinedORM/std": 0.07161478698253632, "step": 2797, "train_speed(iter/s)": 0.194109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 286.6875, "completions/min_length": 205.0, "epoch": 2.314309346567411, "grad_norm": 0.15078797936439514, "kl": 0.0880126953125, "learning_rate": 1.5089799656725134e-06, "loss": 0.0008808374404907227, "memory(GiB)": 38.13, "reward": 0.6806865930557251, "reward_std": 0.049191415309906006, "rewards/VisualizationJSONCombinedORM/mean": 0.6806865930557251, "rewards/VisualizationJSONCombinedORM/std": 0.1407455950975418, "step": 2798, "train_speed(iter/s)": 0.193749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 302.875, "completions/min_length": 246.0, "epoch": 2.315136476426799, "grad_norm": 0.17069391906261444, "kl": 0.0938720703125, "learning_rate": 1.505536328084453e-06, "loss": 0.0009408518671989441, "memory(GiB)": 38.13, "reward": 0.5981451272964478, "reward_std": 0.05911122262477875, "rewards/VisualizationJSONCombinedORM/mean": 0.5981451272964478, "rewards/VisualizationJSONCombinedORM/std": 0.09672436118125916, "step": 2799, "train_speed(iter/s)": 0.193442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 312.125, "completions/min_length": 261.0, "epoch": 2.315963606286187, "grad_norm": 0.22976697981357574, "kl": 0.1036376953125, "learning_rate": 1.5020959277739905e-06, "loss": 0.0010363385081291199, "memory(GiB)": 38.13, "reward": 0.5779519081115723, "reward_std": 0.10664209723472595, "rewards/VisualizationJSONCombinedORM/mean": 0.5779519081115723, "rewards/VisualizationJSONCombinedORM/std": 0.12795418500900269, "step": 2800, "train_speed(iter/s)": 0.19318 }, { "epoch": 2.315963606286187, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 372.1666666666667, "eval_completions/mean_length": 310.7291666666667, "eval_completions/min_length": 256.0833333333333, "eval_kl": 0.0814208984375, "eval_loss": 0.0008149656350724399, "eval_reward": 0.47949066137274104, "eval_reward_std": 0.0637559008706982, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47949066137274104, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06375590308258931, "eval_runtime": 315.4213, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 284.9375, "completions/min_length": 213.0, "epoch": 2.316790736145575, "grad_norm": 0.16510364413261414, "kl": 0.036834716796875, "learning_rate": 1.4986587679283148e-06, "loss": 0.0003672465682029724, "memory(GiB)": 38.13, "reward": 0.4714053273200989, "reward_std": 0.06956163048744202, "rewards/VisualizationJSONCombinedORM/mean": 0.4714053273200989, "rewards/VisualizationJSONCombinedORM/std": 0.06930657476186752, "step": 2801, "train_speed(iter/s)": 0.188783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 312.75, "completions/min_length": 253.0, "epoch": 2.3176178660049627, "grad_norm": 0.1743442267179489, "kl": 0.082763671875, "learning_rate": 1.4952248517316215e-06, "loss": 0.0008297041058540344, "memory(GiB)": 38.13, "reward": 0.5773881077766418, "reward_std": 0.04633841663599014, "rewards/VisualizationJSONCombinedORM/mean": 0.5773881077766418, "rewards/VisualizationJSONCombinedORM/std": 0.21721512079238892, "step": 2802, "train_speed(iter/s)": 0.188489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 291.0, "completions/min_length": 244.0, "epoch": 2.3184449958643505, "grad_norm": 0.1870303601026535, "kl": 0.08026123046875, "learning_rate": 1.4917941823650917e-06, "loss": 0.0008017346262931824, "memory(GiB)": 38.13, "reward": 0.6171428561210632, "reward_std": 0.09398864209651947, "rewards/VisualizationJSONCombinedORM/mean": 0.6171428561210632, "rewards/VisualizationJSONCombinedORM/std": 0.12391627579927444, "step": 2803, "train_speed(iter/s)": 0.188251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 288.25, "completions/min_length": 232.0, "epoch": 2.3192721257237388, "grad_norm": 0.2228519469499588, "kl": 0.087646484375, "learning_rate": 1.4883667630069093e-06, "loss": 0.0008781217038631439, "memory(GiB)": 38.13, "reward": 0.5492041110992432, "reward_std": 0.09661461412906647, "rewards/VisualizationJSONCombinedORM/mean": 0.5492041110992432, "rewards/VisualizationJSONCombinedORM/std": 0.15150108933448792, "step": 2804, "train_speed(iter/s)": 0.188001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 291.4375, "completions/min_length": 242.0, "epoch": 2.3200992555831266, "grad_norm": 0.16968485713005066, "kl": 0.0751953125, "learning_rate": 1.4849425968322384e-06, "loss": 0.0007508993148803711, "memory(GiB)": 38.13, "reward": 0.4259868860244751, "reward_std": 0.04938773065805435, "rewards/VisualizationJSONCombinedORM/mean": 0.4259868860244751, "rewards/VisualizationJSONCombinedORM/std": 0.05844886228442192, "step": 2805, "train_speed(iter/s)": 0.187722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 318.5, "completions/min_length": 258.0, "epoch": 2.3209263854425144, "grad_norm": 0.167565256357193, "kl": 0.04052734375, "learning_rate": 1.4815216870132354e-06, "loss": 0.0004043504595756531, "memory(GiB)": 38.13, "reward": 0.6047625541687012, "reward_std": 0.07769443094730377, "rewards/VisualizationJSONCombinedORM/mean": 0.6047625541687012, "rewards/VisualizationJSONCombinedORM/std": 0.0764467865228653, "step": 2806, "train_speed(iter/s)": 0.187414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 284.625, "completions/min_length": 212.0, "epoch": 2.321753515301902, "grad_norm": 0.20201407372951508, "kl": 0.10296630859375, "learning_rate": 1.478104036719034e-06, "loss": 0.0010307561606168747, "memory(GiB)": 38.13, "reward": 0.5225064754486084, "reward_std": 0.09022796154022217, "rewards/VisualizationJSONCombinedORM/mean": 0.5225064754486084, "rewards/VisualizationJSONCombinedORM/std": 0.0894130989909172, "step": 2807, "train_speed(iter/s)": 0.187187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 293.625, "completions/min_length": 230.0, "epoch": 2.3225806451612905, "grad_norm": 0.15447187423706055, "kl": 0.028350830078125, "learning_rate": 1.4746896491157541e-06, "loss": 0.0002835206687450409, "memory(GiB)": 38.13, "reward": 0.5553321242332458, "reward_std": 0.06569980084896088, "rewards/VisualizationJSONCombinedORM/mean": 0.5553321242332458, "rewards/VisualizationJSONCombinedORM/std": 0.24427413940429688, "step": 2808, "train_speed(iter/s)": 0.186938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 297.125, "completions/min_length": 242.0, "epoch": 2.3234077750206783, "grad_norm": 0.16985774040222168, "kl": 0.06927490234375, "learning_rate": 1.471278527366491e-06, "loss": 0.0006920322775840759, "memory(GiB)": 38.13, "reward": 0.5051397681236267, "reward_std": 0.05553349852561951, "rewards/VisualizationJSONCombinedORM/mean": 0.5051397681236267, "rewards/VisualizationJSONCombinedORM/std": 0.0606292225420475, "step": 2809, "train_speed(iter/s)": 0.186658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 289.6875, "completions/min_length": 236.0, "epoch": 2.324234904880066, "grad_norm": 0.16093076765537262, "kl": 0.0728759765625, "learning_rate": 1.4678706746313143e-06, "loss": 0.0007285922765731812, "memory(GiB)": 38.13, "reward": 0.6176549792289734, "reward_std": 0.07558993995189667, "rewards/VisualizationJSONCombinedORM/mean": 0.6176549792289734, "rewards/VisualizationJSONCombinedORM/std": 0.16412349045276642, "step": 2810, "train_speed(iter/s)": 0.186423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 291.5, "completions/min_length": 230.0, "epoch": 2.325062034739454, "grad_norm": 0.173152893781662, "kl": 0.0894775390625, "learning_rate": 1.4644660940672628e-06, "loss": 0.0008955635130405426, "memory(GiB)": 38.13, "reward": 0.618238627910614, "reward_std": 0.07706013321876526, "rewards/VisualizationJSONCombinedORM/mean": 0.618238627910614, "rewards/VisualizationJSONCombinedORM/std": 0.10928692668676376, "step": 2811, "train_speed(iter/s)": 0.186225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 282.0, "completions/min_length": 237.0, "epoch": 2.325889164598842, "grad_norm": 0.19152729213237762, "kl": 0.0877685546875, "learning_rate": 1.461064788828348e-06, "loss": 0.0008773729205131531, "memory(GiB)": 38.13, "reward": 0.4337875545024872, "reward_std": 0.05100959911942482, "rewards/VisualizationJSONCombinedORM/mean": 0.4337875545024872, "rewards/VisualizationJSONCombinedORM/std": 0.20851129293441772, "step": 2812, "train_speed(iter/s)": 0.185975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 293.1875, "completions/min_length": 218.0, "epoch": 2.32671629445823, "grad_norm": 0.2013424038887024, "kl": 0.0673828125, "learning_rate": 1.4576667620655461e-06, "loss": 0.0006737187504768372, "memory(GiB)": 38.13, "reward": 0.6550099849700928, "reward_std": 0.07855190336704254, "rewards/VisualizationJSONCombinedORM/mean": 0.6550099849700928, "rewards/VisualizationJSONCombinedORM/std": 0.13496308028697968, "step": 2813, "train_speed(iter/s)": 0.185778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 302.5, "completions/min_length": 239.0, "epoch": 2.327543424317618, "grad_norm": 0.17881783843040466, "kl": 0.0601806640625, "learning_rate": 1.4542720169267933e-06, "loss": 0.0006010159850120544, "memory(GiB)": 38.13, "reward": 0.5901307463645935, "reward_std": 0.08424852788448334, "rewards/VisualizationJSONCombinedORM/mean": 0.5901307463645935, "rewards/VisualizationJSONCombinedORM/std": 0.2073340266942978, "step": 2814, "train_speed(iter/s)": 0.185594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 311.8125, "completions/min_length": 259.0, "epoch": 2.3283705541770057, "grad_norm": 0.18088650703430176, "kl": 0.03485107421875, "learning_rate": 1.4508805565569883e-06, "loss": 0.0003483183681964874, "memory(GiB)": 38.13, "reward": 0.534074068069458, "reward_std": 0.02809026651084423, "rewards/VisualizationJSONCombinedORM/mean": 0.534074068069458, "rewards/VisualizationJSONCombinedORM/std": 0.036760032176971436, "step": 2815, "train_speed(iter/s)": 0.185324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 311.3125, "completions/min_length": 234.0, "epoch": 2.3291976840363935, "grad_norm": 0.21506093442440033, "kl": 0.1075439453125, "learning_rate": 1.4474923840979871e-06, "loss": 0.0010772794485092163, "memory(GiB)": 38.13, "reward": 0.7497315406799316, "reward_std": 0.10452329367399216, "rewards/VisualizationJSONCombinedORM/mean": 0.7497315406799316, "rewards/VisualizationJSONCombinedORM/std": 0.10559464246034622, "step": 2816, "train_speed(iter/s)": 0.185061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 306.25, "completions/min_length": 243.0, "epoch": 2.3300248138957818, "grad_norm": 0.16875337064266205, "kl": 0.0687255859375, "learning_rate": 1.4441075026885999e-06, "loss": 0.0006871968507766724, "memory(GiB)": 38.13, "reward": 0.2225971668958664, "reward_std": 0.028798311948776245, "rewards/VisualizationJSONCombinedORM/mean": 0.2225971668958664, "rewards/VisualizationJSONCombinedORM/std": 0.03983192518353462, "step": 2817, "train_speed(iter/s)": 0.184748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 278.5625, "completions/min_length": 221.0, "epoch": 2.3308519437551696, "grad_norm": 0.1849845051765442, "kl": 0.0723876953125, "learning_rate": 1.4407259154645841e-06, "loss": 0.0007245689630508423, "memory(GiB)": 38.13, "reward": 0.4929288327693939, "reward_std": 0.07918375730514526, "rewards/VisualizationJSONCombinedORM/mean": 0.4929288327693939, "rewards/VisualizationJSONCombinedORM/std": 0.12621267139911652, "step": 2818, "train_speed(iter/s)": 0.184487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 300.375, "completions/min_length": 270.0, "epoch": 2.3316790736145574, "grad_norm": 0.16527192294597626, "kl": 0.09228515625, "learning_rate": 1.4373476255586515e-06, "loss": 0.0009212549775838852, "memory(GiB)": 38.13, "reward": 0.7222102880477905, "reward_std": 0.07511701434850693, "rewards/VisualizationJSONCombinedORM/mean": 0.7222102880477905, "rewards/VisualizationJSONCombinedORM/std": 0.07852762192487717, "step": 2819, "train_speed(iter/s)": 0.184218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 314.3125, "completions/min_length": 245.0, "epoch": 2.332506203473945, "grad_norm": 0.19475752115249634, "kl": 0.07366943359375, "learning_rate": 1.433972636100452e-06, "loss": 0.0007373318076133728, "memory(GiB)": 38.13, "reward": 0.5274311304092407, "reward_std": 0.0751892626285553, "rewards/VisualizationJSONCombinedORM/mean": 0.5274311304092407, "rewards/VisualizationJSONCombinedORM/std": 0.09024064242839813, "step": 2820, "train_speed(iter/s)": 0.18398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 317.6875, "completions/min_length": 242.0, "epoch": 2.3333333333333335, "grad_norm": 0.21822071075439453, "kl": 0.0679931640625, "learning_rate": 1.4306009502165873e-06, "loss": 0.0006793364882469177, "memory(GiB)": 38.13, "reward": 0.5559775233268738, "reward_std": 0.05679550766944885, "rewards/VisualizationJSONCombinedORM/mean": 0.5559775233268738, "rewards/VisualizationJSONCombinedORM/std": 0.1316872388124466, "step": 2821, "train_speed(iter/s)": 0.18368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 309.5, "completions/min_length": 258.0, "epoch": 2.3341604631927213, "grad_norm": 0.1837129145860672, "kl": 0.05059814453125, "learning_rate": 1.4272325710305895e-06, "loss": 0.0005059130489826202, "memory(GiB)": 38.13, "reward": 0.5083275437355042, "reward_std": 0.06688527017831802, "rewards/VisualizationJSONCombinedORM/mean": 0.5083275437355042, "rewards/VisualizationJSONCombinedORM/std": 0.19980373978614807, "step": 2822, "train_speed(iter/s)": 0.183444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 296.125, "completions/min_length": 237.0, "epoch": 2.334987593052109, "grad_norm": 0.2563263773918152, "kl": 0.07958984375, "learning_rate": 1.423867501662934e-06, "loss": 0.0007956810295581818, "memory(GiB)": 38.13, "reward": 0.555321991443634, "reward_std": 0.08009251207113266, "rewards/VisualizationJSONCombinedORM/mean": 0.555321991443634, "rewards/VisualizationJSONCombinedORM/std": 0.14943228662014008, "step": 2823, "train_speed(iter/s)": 0.183211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 301.4375, "completions/min_length": 241.0, "epoch": 2.335814722911497, "grad_norm": 0.1520751416683197, "kl": 0.1044921875, "learning_rate": 1.420505745231024e-06, "loss": 0.0010453686118125916, "memory(GiB)": 38.13, "reward": 0.4554211497306824, "reward_std": 0.04452173411846161, "rewards/VisualizationJSONCombinedORM/mean": 0.4554211497306824, "rewards/VisualizationJSONCombinedORM/std": 0.2438160479068756, "step": 2824, "train_speed(iter/s)": 0.182919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 361.375, "completions/min_length": 306.0, "epoch": 2.336641852770885, "grad_norm": 0.1748993694782257, "kl": 0.035980224609375, "learning_rate": 1.417147304849199e-06, "loss": 0.00036059319972991943, "memory(GiB)": 38.13, "reward": 0.6805739402770996, "reward_std": 0.07949370890855789, "rewards/VisualizationJSONCombinedORM/mean": 0.6805739402770996, "rewards/VisualizationJSONCombinedORM/std": 0.14430369436740875, "step": 2825, "train_speed(iter/s)": 0.182633 }, { "epoch": 2.336641852770885, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 355.75, "eval_completions/mean_length": 301.96875, "eval_completions/min_length": 254.25, "eval_kl": 0.0806884765625, "eval_loss": 0.0008122597937472165, "eval_reward": 0.4706538661072652, "eval_reward_std": 0.06507711838154744, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4706538661072652, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06507712133073558, "eval_runtime": 305.5281, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 296.625, "completions/min_length": 242.0, "epoch": 2.337468982630273, "grad_norm": 0.2078356295824051, "kl": 0.1063232421875, "learning_rate": 1.4137921836287238e-06, "loss": 0.0010619685053825378, "memory(GiB)": 38.13, "reward": 0.5167495608329773, "reward_std": 0.09353785961866379, "rewards/VisualizationJSONCombinedORM/mean": 0.5167495608329773, "rewards/VisualizationJSONCombinedORM/std": 0.19479921460151672, "step": 2826, "train_speed(iter/s)": 0.178798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 272.8125, "completions/min_length": 213.0, "epoch": 2.338296112489661, "grad_norm": 0.16980932652950287, "kl": 0.04681396484375, "learning_rate": 1.410440384677791e-06, "loss": 0.00046667084097862244, "memory(GiB)": 38.13, "reward": 0.45349055528640747, "reward_std": 0.03895598649978638, "rewards/VisualizationJSONCombinedORM/mean": 0.45349055528640747, "rewards/VisualizationJSONCombinedORM/std": 0.07065670192241669, "step": 2827, "train_speed(iter/s)": 0.17856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 316.9375, "completions/min_length": 259.0, "epoch": 2.3391232423490487, "grad_norm": 0.18067097663879395, "kl": 0.06097412109375, "learning_rate": 1.4070919111015097e-06, "loss": 0.0006083771586418152, "memory(GiB)": 38.13, "reward": 0.509697437286377, "reward_std": 0.06366746127605438, "rewards/VisualizationJSONCombinedORM/mean": 0.509697437286377, "rewards/VisualizationJSONCombinedORM/std": 0.1914883404970169, "step": 2828, "train_speed(iter/s)": 0.178344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 328.125, "completions/min_length": 250.0, "epoch": 2.3399503722084365, "grad_norm": 0.22354306280612946, "kl": 0.05560302734375, "learning_rate": 1.4037467660019156e-06, "loss": 0.0005558021366596222, "memory(GiB)": 38.13, "reward": 0.6077160835266113, "reward_std": 0.11172345280647278, "rewards/VisualizationJSONCombinedORM/mean": 0.6077160835266113, "rewards/VisualizationJSONCombinedORM/std": 0.1575492024421692, "step": 2829, "train_speed(iter/s)": 0.178072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 304.4375, "completions/min_length": 254.0, "epoch": 2.3407775020678248, "grad_norm": 0.14883877336978912, "kl": 0.04302978515625, "learning_rate": 1.400404952477954e-06, "loss": 0.000430794432759285, "memory(GiB)": 38.13, "reward": 0.5594910383224487, "reward_std": 0.06950286775827408, "rewards/VisualizationJSONCombinedORM/mean": 0.5594910383224487, "rewards/VisualizationJSONCombinedORM/std": 0.190229594707489, "step": 2830, "train_speed(iter/s)": 0.177848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 278.75, "completions/min_length": 233.0, "epoch": 2.3416046319272126, "grad_norm": 0.2369309812784195, "kl": 0.0782470703125, "learning_rate": 1.3970664736254885e-06, "loss": 0.000783286988735199, "memory(GiB)": 38.13, "reward": 0.5342264175415039, "reward_std": 0.06591882556676865, "rewards/VisualizationJSONCombinedORM/mean": 0.5342264175415039, "rewards/VisualizationJSONCombinedORM/std": 0.25321468710899353, "step": 2831, "train_speed(iter/s)": 0.177644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 276.0, "completions/min_length": 236.0, "epoch": 2.3424317617866004, "grad_norm": 0.1548984795808792, "kl": 0.035736083984375, "learning_rate": 1.3937313325372919e-06, "loss": 0.00035772472620010376, "memory(GiB)": 38.13, "reward": 0.619879961013794, "reward_std": 0.06365266442298889, "rewards/VisualizationJSONCombinedORM/mean": 0.619879961013794, "rewards/VisualizationJSONCombinedORM/std": 0.07537787407636642, "step": 2832, "train_speed(iter/s)": 0.177433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 307.1875, "completions/min_length": 248.0, "epoch": 2.3432588916459887, "grad_norm": 0.17208930850028992, "kl": 0.05072021484375, "learning_rate": 1.3903995323030473e-06, "loss": 0.0005069039762020111, "memory(GiB)": 38.13, "reward": 0.5522753596305847, "reward_std": 0.04055941477417946, "rewards/VisualizationJSONCombinedORM/mean": 0.5522753596305847, "rewards/VisualizationJSONCombinedORM/std": 0.18078379333019257, "step": 2833, "train_speed(iter/s)": 0.177158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 311.9375, "completions/min_length": 250.0, "epoch": 2.3440860215053765, "grad_norm": 0.19006364047527313, "kl": 0.077880859375, "learning_rate": 1.387071076009337e-06, "loss": 0.0007785633206367493, "memory(GiB)": 38.13, "reward": 0.4700593650341034, "reward_std": 0.09806925058364868, "rewards/VisualizationJSONCombinedORM/mean": 0.4700593650341034, "rewards/VisualizationJSONCombinedORM/std": 0.11447808146476746, "step": 2834, "train_speed(iter/s)": 0.17694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 294.0625, "completions/min_length": 222.0, "epoch": 2.3449131513647643, "grad_norm": 0.24444663524627686, "kl": 0.08148193359375, "learning_rate": 1.383745966739652e-06, "loss": 0.0008149594068527222, "memory(GiB)": 38.13, "reward": 0.5226812362670898, "reward_std": 0.061685435473918915, "rewards/VisualizationJSONCombinedORM/mean": 0.5226812362670898, "rewards/VisualizationJSONCombinedORM/std": 0.10648655146360397, "step": 2835, "train_speed(iter/s)": 0.176684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 294.625, "completions/min_length": 237.0, "epoch": 2.345740281224152, "grad_norm": 0.2378869503736496, "kl": 0.07208251953125, "learning_rate": 1.3804242075743751e-06, "loss": 0.0007190462201833725, "memory(GiB)": 38.13, "reward": 0.46107131242752075, "reward_std": 0.08065693825483322, "rewards/VisualizationJSONCombinedORM/mean": 0.46107131242752075, "rewards/VisualizationJSONCombinedORM/std": 0.14826150238513947, "step": 2836, "train_speed(iter/s)": 0.176461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 285.75, "completions/min_length": 230.0, "epoch": 2.34656741108354, "grad_norm": 0.2415211796760559, "kl": 0.03729248046875, "learning_rate": 1.3771058015907967e-06, "loss": 0.00037229806184768677, "memory(GiB)": 38.13, "reward": 0.302722692489624, "reward_std": 0.05980876460671425, "rewards/VisualizationJSONCombinedORM/mean": 0.302722692489624, "rewards/VisualizationJSONCombinedORM/std": 0.07277563959360123, "step": 2837, "train_speed(iter/s)": 0.176298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 304.125, "completions/min_length": 245.0, "epoch": 2.347394540942928, "grad_norm": 0.22494202852249146, "kl": 0.086669921875, "learning_rate": 1.37379075186309e-06, "loss": 0.0008637867867946625, "memory(GiB)": 38.13, "reward": 0.5338248014450073, "reward_std": 0.07004792988300323, "rewards/VisualizationJSONCombinedORM/mean": 0.5338248014450073, "rewards/VisualizationJSONCombinedORM/std": 0.0751357302069664, "step": 2838, "train_speed(iter/s)": 0.176078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 304.6875, "completions/min_length": 248.0, "epoch": 2.348221670802316, "grad_norm": 0.2300439029932022, "kl": 0.1142578125, "learning_rate": 1.3704790614623248e-06, "loss": 0.001143038272857666, "memory(GiB)": 38.13, "reward": 0.6057237386703491, "reward_std": 0.05175945162773132, "rewards/VisualizationJSONCombinedORM/mean": 0.6057237386703491, "rewards/VisualizationJSONCombinedORM/std": 0.07819075137376785, "step": 2839, "train_speed(iter/s)": 0.17589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 320.125, "completions/min_length": 244.0, "epoch": 2.349048800661704, "grad_norm": 0.20059870183467865, "kl": 0.0645751953125, "learning_rate": 1.3671707334564549e-06, "loss": 0.0006463862955570221, "memory(GiB)": 38.13, "reward": 0.5606991648674011, "reward_std": 0.054902561008930206, "rewards/VisualizationJSONCombinedORM/mean": 0.5606991648674011, "rewards/VisualizationJSONCombinedORM/std": 0.07363928109407425, "step": 2840, "train_speed(iter/s)": 0.175662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 323.375, "completions/min_length": 247.0, "epoch": 2.3498759305210917, "grad_norm": 0.16899803280830383, "kl": 0.0606689453125, "learning_rate": 1.3638657709103238e-06, "loss": 0.000605948269367218, "memory(GiB)": 38.13, "reward": 0.6954441070556641, "reward_std": 0.07529542595148087, "rewards/VisualizationJSONCombinedORM/mean": 0.6954441070556641, "rewards/VisualizationJSONCombinedORM/std": 0.0867544487118721, "step": 2841, "train_speed(iter/s)": 0.175491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 308.625, "completions/min_length": 266.0, "epoch": 2.3507030603804795, "grad_norm": 0.16954955458641052, "kl": 0.0667724609375, "learning_rate": 1.3605641768856536e-06, "loss": 0.000668209046125412, "memory(GiB)": 38.13, "reward": 0.6516867876052856, "reward_std": 0.0735340267419815, "rewards/VisualizationJSONCombinedORM/mean": 0.6516867876052856, "rewards/VisualizationJSONCombinedORM/std": 0.08990868180990219, "step": 2842, "train_speed(iter/s)": 0.175317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 300.0625, "completions/min_length": 257.0, "epoch": 2.3515301902398678, "grad_norm": 0.18072862923145294, "kl": 0.04510498046875, "learning_rate": 1.3572659544410493e-06, "loss": 0.00045068562030792236, "memory(GiB)": 38.13, "reward": 0.5272249579429626, "reward_std": 0.0598071813583374, "rewards/VisualizationJSONCombinedORM/mean": 0.5272249579429626, "rewards/VisualizationJSONCombinedORM/std": 0.08984388411045074, "step": 2843, "train_speed(iter/s)": 0.175117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 273.4375, "completions/min_length": 234.0, "epoch": 2.3523573200992556, "grad_norm": 0.1938200294971466, "kl": 0.114990234375, "learning_rate": 1.3539711066319873e-06, "loss": 0.0011494457721710205, "memory(GiB)": 38.13, "reward": 0.658004641532898, "reward_std": 0.08893747627735138, "rewards/VisualizationJSONCombinedORM/mean": 0.658004641532898, "rewards/VisualizationJSONCombinedORM/std": 0.12141714990139008, "step": 2844, "train_speed(iter/s)": 0.174944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 303.1875, "completions/min_length": 254.0, "epoch": 2.3531844499586434, "grad_norm": 0.2001419961452484, "kl": 0.07476806640625, "learning_rate": 1.3506796365108232e-06, "loss": 0.0007477402687072754, "memory(GiB)": 38.13, "reward": 0.5529730916023254, "reward_std": 0.051708586513996124, "rewards/VisualizationJSONCombinedORM/mean": 0.5529730916023254, "rewards/VisualizationJSONCombinedORM/std": 0.19307619333267212, "step": 2845, "train_speed(iter/s)": 0.174685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 282.375, "completions/min_length": 227.0, "epoch": 2.3540115798180317, "grad_norm": 0.18346838653087616, "kl": 0.07659912109375, "learning_rate": 1.3473915471267785e-06, "loss": 0.0007666610181331635, "memory(GiB)": 38.13, "reward": 0.40659570693969727, "reward_std": 0.024521246552467346, "rewards/VisualizationJSONCombinedORM/mean": 0.40659570693969727, "rewards/VisualizationJSONCombinedORM/std": 0.190602645277977, "step": 2846, "train_speed(iter/s)": 0.174464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 298.3125, "completions/min_length": 251.0, "epoch": 2.3548387096774195, "grad_norm": 0.25584787130355835, "kl": 0.04766845703125, "learning_rate": 1.3441068415259462e-06, "loss": 0.0004754886031150818, "memory(GiB)": 38.13, "reward": 0.5285072326660156, "reward_std": 0.04354521259665489, "rewards/VisualizationJSONCombinedORM/mean": 0.5285072326660156, "rewards/VisualizationJSONCombinedORM/std": 0.2862483561038971, "step": 2847, "train_speed(iter/s)": 0.174238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 336.125, "completions/min_length": 245.0, "epoch": 2.3556658395368073, "grad_norm": 0.19522880017757416, "kl": 0.066650390625, "learning_rate": 1.3408255227512845e-06, "loss": 0.0006673373281955719, "memory(GiB)": 38.13, "reward": 0.5345566868782043, "reward_std": 0.06984901428222656, "rewards/VisualizationJSONCombinedORM/mean": 0.5345566868782043, "rewards/VisualizationJSONCombinedORM/std": 0.11181282997131348, "step": 2848, "train_speed(iter/s)": 0.173975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 323.3125, "completions/min_length": 245.0, "epoch": 2.356492969396195, "grad_norm": 0.20358680188655853, "kl": 0.04547119140625, "learning_rate": 1.337547593842614e-06, "loss": 0.0004544667899608612, "memory(GiB)": 38.13, "reward": 0.5432318449020386, "reward_std": 0.08070333302021027, "rewards/VisualizationJSONCombinedORM/mean": 0.5432318449020386, "rewards/VisualizationJSONCombinedORM/std": 0.22537735104560852, "step": 2849, "train_speed(iter/s)": 0.173764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 294.1875, "completions/min_length": 243.0, "epoch": 2.357320099255583, "grad_norm": 0.1650225669145584, "kl": 0.05181884765625, "learning_rate": 1.334273057836611e-06, "loss": 0.0005182307213544846, "memory(GiB)": 38.13, "reward": 0.6814152002334595, "reward_std": 0.08879151940345764, "rewards/VisualizationJSONCombinedORM/mean": 0.6814152002334595, "rewards/VisualizationJSONCombinedORM/std": 0.09060318768024445, "step": 2850, "train_speed(iter/s)": 0.173512 }, { "epoch": 2.357320099255583, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 375.125, "eval_completions/mean_length": 305.609375, "eval_completions/min_length": 257.0416666666667, "eval_kl": 0.07155354817708333, "eval_loss": 0.0007229174370877445, "eval_reward": 0.45623987292249996, "eval_reward_std": 0.0600370035196344, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45623987292249996, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06003700441215187, "eval_runtime": 316.7449, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 300.875, "completions/min_length": 247.0, "epoch": 2.358147229114971, "grad_norm": 0.15444138646125793, "kl": 0.033782958984375, "learning_rate": 1.3310019177668154e-06, "loss": 0.00033814460039138794, "memory(GiB)": 38.13, "reward": 0.5388714075088501, "reward_std": 0.04780685156583786, "rewards/VisualizationJSONCombinedORM/mean": 0.5388714075088501, "rewards/VisualizationJSONCombinedORM/std": 0.14101099967956543, "step": 2851, "train_speed(iter/s)": 0.17004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 334.0625, "completions/min_length": 247.0, "epoch": 2.358974358974359, "grad_norm": 0.16655300557613373, "kl": 0.09381103515625, "learning_rate": 1.327734176663612e-06, "loss": 0.0009382069110870361, "memory(GiB)": 38.13, "reward": 0.5358072519302368, "reward_std": 0.05022966116666794, "rewards/VisualizationJSONCombinedORM/mean": 0.5358072519302368, "rewards/VisualizationJSONCombinedORM/std": 0.17026755213737488, "step": 2852, "train_speed(iter/s)": 0.169844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 272.3125, "completions/min_length": 235.0, "epoch": 2.359801488833747, "grad_norm": 0.1535353809595108, "kl": 0.03582763671875, "learning_rate": 1.3244698375542492e-06, "loss": 0.00035842880606651306, "memory(GiB)": 38.13, "reward": 0.7035324573516846, "reward_std": 0.09360907971858978, "rewards/VisualizationJSONCombinedORM/mean": 0.7035324573516846, "rewards/VisualizationJSONCombinedORM/std": 0.13931475579738617, "step": 2853, "train_speed(iter/s)": 0.169655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 322.8125, "completions/min_length": 240.0, "epoch": 2.3606286186931347, "grad_norm": 0.1778256893157959, "kl": 0.144775390625, "learning_rate": 1.3212089034628118e-06, "loss": 0.0014432594180107117, "memory(GiB)": 38.13, "reward": 0.5008976459503174, "reward_std": 0.08834857493638992, "rewards/VisualizationJSONCombinedORM/mean": 0.5008976459503174, "rewards/VisualizationJSONCombinedORM/std": 0.1706175059080124, "step": 2854, "train_speed(iter/s)": 0.169409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 303.4375, "completions/min_length": 238.0, "epoch": 2.361455748552523, "grad_norm": 0.17131739854812622, "kl": 0.0399169921875, "learning_rate": 1.3179513774102376e-06, "loss": 0.0003988537937402725, "memory(GiB)": 38.13, "reward": 0.4976934790611267, "reward_std": 0.03598566725850105, "rewards/VisualizationJSONCombinedORM/mean": 0.4976934790611267, "rewards/VisualizationJSONCombinedORM/std": 0.05072652921080589, "step": 2855, "train_speed(iter/s)": 0.169233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 314.8125, "completions/min_length": 263.0, "epoch": 2.3622828784119108, "grad_norm": 0.18130169808864594, "kl": 0.03155517578125, "learning_rate": 1.3146972624143024e-06, "loss": 0.00031570345163345337, "memory(GiB)": 38.13, "reward": 0.6164261698722839, "reward_std": 0.06662164628505707, "rewards/VisualizationJSONCombinedORM/mean": 0.6164261698722839, "rewards/VisualizationJSONCombinedORM/std": 0.08174562454223633, "step": 2856, "train_speed(iter/s)": 0.169047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 281.875, "completions/min_length": 218.0, "epoch": 2.3631100082712986, "grad_norm": 0.21695935726165771, "kl": 0.06768798828125, "learning_rate": 1.311446561489626e-06, "loss": 0.0006767362356185913, "memory(GiB)": 38.13, "reward": 0.7540139555931091, "reward_std": 0.05222966521978378, "rewards/VisualizationJSONCombinedORM/mean": 0.7540139555931091, "rewards/VisualizationJSONCombinedORM/std": 0.09129410237073898, "step": 2857, "train_speed(iter/s)": 0.168864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 328.4375, "completions/min_length": 261.0, "epoch": 2.3639371381306864, "grad_norm": 0.18241946399211884, "kl": 0.052001953125, "learning_rate": 1.3081992776476633e-06, "loss": 0.0005197077989578247, "memory(GiB)": 38.13, "reward": 0.5243282318115234, "reward_std": 0.07306428998708725, "rewards/VisualizationJSONCombinedORM/mean": 0.5243282318115234, "rewards/VisualizationJSONCombinedORM/std": 0.10803937166929245, "step": 2858, "train_speed(iter/s)": 0.168602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 285.4375, "completions/min_length": 232.0, "epoch": 2.3647642679900747, "grad_norm": 0.17157670855522156, "kl": 0.1163330078125, "learning_rate": 1.3049554138967052e-06, "loss": 0.0011619199067354202, "memory(GiB)": 38.13, "reward": 0.6317805051803589, "reward_std": 0.06490626931190491, "rewards/VisualizationJSONCombinedORM/mean": 0.6317805051803589, "rewards/VisualizationJSONCombinedORM/std": 0.16791824996471405, "step": 2859, "train_speed(iter/s)": 0.168455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 304.8125, "completions/min_length": 260.0, "epoch": 2.3655913978494625, "grad_norm": 0.15411469340324402, "kl": 0.026580810546875, "learning_rate": 1.301714973241871e-06, "loss": 0.0002653971314430237, "memory(GiB)": 38.13, "reward": 0.5441323518753052, "reward_std": 0.023522935807704926, "rewards/VisualizationJSONCombinedORM/mean": 0.5441323518753052, "rewards/VisualizationJSONCombinedORM/std": 0.25340354442596436, "step": 2860, "train_speed(iter/s)": 0.168257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 313.3125, "completions/min_length": 247.0, "epoch": 2.3664185277088503, "grad_norm": 0.22197076678276062, "kl": 0.05322265625, "learning_rate": 1.298477958685113e-06, "loss": 0.0005323793739080429, "memory(GiB)": 38.13, "reward": 0.4966868758201599, "reward_std": 0.069365493953228, "rewards/VisualizationJSONCombinedORM/mean": 0.4966868758201599, "rewards/VisualizationJSONCombinedORM/std": 0.2199033796787262, "step": 2861, "train_speed(iter/s)": 0.168068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 313.8125, "completions/min_length": 229.0, "epoch": 2.367245657568238, "grad_norm": 0.19622907042503357, "kl": 0.06854248046875, "learning_rate": 1.2952443732252058e-06, "loss": 0.0006850920617580414, "memory(GiB)": 38.13, "reward": 0.466377317905426, "reward_std": 0.06464187800884247, "rewards/VisualizationJSONCombinedORM/mean": 0.466377317905426, "rewards/VisualizationJSONCombinedORM/std": 0.2342173457145691, "step": 2862, "train_speed(iter/s)": 0.167894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 291.6875, "completions/min_length": 243.0, "epoch": 2.368072787427626, "grad_norm": 0.2110562026500702, "kl": 0.06219482421875, "learning_rate": 1.2920142198577484e-06, "loss": 0.0006214678287506104, "memory(GiB)": 38.13, "reward": 0.47645819187164307, "reward_std": 0.055704474449157715, "rewards/VisualizationJSONCombinedORM/mean": 0.47645819187164307, "rewards/VisualizationJSONCombinedORM/std": 0.056474313139915466, "step": 2863, "train_speed(iter/s)": 0.167654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 277.0625, "completions/min_length": 214.0, "epoch": 2.368899917287014, "grad_norm": 0.17380373179912567, "kl": 0.04833984375, "learning_rate": 1.2887875015751628e-06, "loss": 0.00048325955867767334, "memory(GiB)": 38.13, "reward": 0.45288628339767456, "reward_std": 0.0534696951508522, "rewards/VisualizationJSONCombinedORM/mean": 0.45288628339767456, "rewards/VisualizationJSONCombinedORM/std": 0.14249955117702484, "step": 2864, "train_speed(iter/s)": 0.167469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 282.625, "completions/min_length": 225.0, "epoch": 2.369727047146402, "grad_norm": 0.16236178576946259, "kl": 0.0491943359375, "learning_rate": 1.2855642213666858e-06, "loss": 0.0004919357597827911, "memory(GiB)": 38.13, "reward": 0.5419660210609436, "reward_std": 0.05259404331445694, "rewards/VisualizationJSONCombinedORM/mean": 0.5419660210609436, "rewards/VisualizationJSONCombinedORM/std": 0.1321467012166977, "step": 2865, "train_speed(iter/s)": 0.167238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 311.8125, "completions/min_length": 219.0, "epoch": 2.37055417700579, "grad_norm": 0.16232351958751678, "kl": 0.09814453125, "learning_rate": 1.2823443822183717e-06, "loss": 0.0009815804660320282, "memory(GiB)": 38.13, "reward": 0.5743129253387451, "reward_std": 0.06451545655727386, "rewards/VisualizationJSONCombinedORM/mean": 0.5743129253387451, "rewards/VisualizationJSONCombinedORM/std": 0.11903271824121475, "step": 2866, "train_speed(iter/s)": 0.167032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 291.4375, "completions/min_length": 231.0, "epoch": 2.3713813068651777, "grad_norm": 0.2036031037569046, "kl": 0.105712890625, "learning_rate": 1.2791279871130824e-06, "loss": 0.0010579749941825867, "memory(GiB)": 38.13, "reward": 0.5086696743965149, "reward_std": 0.08477439731359482, "rewards/VisualizationJSONCombinedORM/mean": 0.5086696743965149, "rewards/VisualizationJSONCombinedORM/std": 0.08381354063749313, "step": 2867, "train_speed(iter/s)": 0.166823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 332.75, "completions/min_length": 246.0, "epoch": 2.372208436724566, "grad_norm": 0.20563620328903198, "kl": 0.0623779296875, "learning_rate": 1.2759150390304953e-06, "loss": 0.0006251782178878784, "memory(GiB)": 38.13, "reward": 0.4889494478702545, "reward_std": 0.03262626379728317, "rewards/VisualizationJSONCombinedORM/mean": 0.4889494478702545, "rewards/VisualizationJSONCombinedORM/std": 0.263395220041275, "step": 2868, "train_speed(iter/s)": 0.166611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 315.6875, "completions/min_length": 244.0, "epoch": 2.3730355665839538, "grad_norm": 0.19719989597797394, "kl": 0.07611083984375, "learning_rate": 1.2727055409470873e-06, "loss": 0.0007626079022884369, "memory(GiB)": 38.13, "reward": 0.38044992089271545, "reward_std": 0.04221522808074951, "rewards/VisualizationJSONCombinedORM/mean": 0.38044992089271545, "rewards/VisualizationJSONCombinedORM/std": 0.13169324398040771, "step": 2869, "train_speed(iter/s)": 0.166456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 347.8125, "completions/min_length": 281.0, "epoch": 2.3738626964433416, "grad_norm": 0.17071731388568878, "kl": 0.142333984375, "learning_rate": 1.269499495836149e-06, "loss": 0.0014252196997404099, "memory(GiB)": 38.13, "reward": 0.29134511947631836, "reward_std": 0.030233751982450485, "rewards/VisualizationJSONCombinedORM/mean": 0.29134511947631836, "rewards/VisualizationJSONCombinedORM/std": 0.06655392795801163, "step": 2870, "train_speed(iter/s)": 0.166273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 267.375, "completions/min_length": 229.0, "epoch": 2.3746898263027294, "grad_norm": 0.1831071525812149, "kl": 0.0489501953125, "learning_rate": 1.266296906667762e-06, "loss": 0.0004900507628917694, "memory(GiB)": 38.13, "reward": 0.5071338415145874, "reward_std": 0.0763780027627945, "rewards/VisualizationJSONCombinedORM/mean": 0.5071338415145874, "rewards/VisualizationJSONCombinedORM/std": 0.10757091641426086, "step": 2871, "train_speed(iter/s)": 0.166055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 289.4375, "completions/min_length": 248.0, "epoch": 2.3755169561621177, "grad_norm": 0.2582298517227173, "kl": 0.317138671875, "learning_rate": 1.2630977764088143e-06, "loss": 0.0031741932034492493, "memory(GiB)": 38.13, "reward": 0.3526865243911743, "reward_std": 0.06295598298311234, "rewards/VisualizationJSONCombinedORM/mean": 0.3526865243911743, "rewards/VisualizationJSONCombinedORM/std": 0.08397134393453598, "step": 2872, "train_speed(iter/s)": 0.165922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 286.125, "completions/min_length": 226.0, "epoch": 2.3763440860215055, "grad_norm": 0.18897265195846558, "kl": 0.063232421875, "learning_rate": 1.2599021080229824e-06, "loss": 0.0006336066871881485, "memory(GiB)": 38.13, "reward": 0.5351150035858154, "reward_std": 0.0815453827381134, "rewards/VisualizationJSONCombinedORM/mean": 0.5351150035858154, "rewards/VisualizationJSONCombinedORM/std": 0.20088298618793488, "step": 2873, "train_speed(iter/s)": 0.165727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/mean_length": 332.375, "completions/min_length": 236.0, "epoch": 2.3771712158808933, "grad_norm": 0.16510677337646484, "kl": 0.08282470703125, "learning_rate": 1.256709904470741e-06, "loss": 0.0008279383182525635, "memory(GiB)": 38.13, "reward": 0.6452863216400146, "reward_std": 0.06563891470432281, "rewards/VisualizationJSONCombinedORM/mean": 0.6452863216400146, "rewards/VisualizationJSONCombinedORM/std": 0.12710176408290863, "step": 2874, "train_speed(iter/s)": 0.165495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 313.3125, "completions/min_length": 250.0, "epoch": 2.377998345740281, "grad_norm": 0.19256554543972015, "kl": 0.0679931640625, "learning_rate": 1.2535211687093535e-06, "loss": 0.0006808340549468994, "memory(GiB)": 38.13, "reward": 0.7123028635978699, "reward_std": 0.09694145619869232, "rewards/VisualizationJSONCombinedORM/mean": 0.7123028635978699, "rewards/VisualizationJSONCombinedORM/std": 0.09467830508947372, "step": 2875, "train_speed(iter/s)": 0.165313 }, { "epoch": 2.377998345740281, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 368.5416666666667, "eval_completions/mean_length": 310.6666666666667, "eval_completions/min_length": 258.9166666666667, "eval_kl": 0.07267252604166667, "eval_loss": 0.0007306790794245899, "eval_reward": 0.46294254437088966, "eval_reward_std": 0.056918257381767035, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46294254437088966, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05691826017573476, "eval_runtime": 313.5297, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 322.1875, "completions/min_length": 252.0, "epoch": 2.378825475599669, "grad_norm": 0.1789189577102661, "kl": 0.03497314453125, "learning_rate": 1.250335903692872e-06, "loss": 0.00035053491592407227, "memory(GiB)": 38.13, "reward": 0.5390991568565369, "reward_std": 0.028691615909337997, "rewards/VisualizationJSONCombinedORM/mean": 0.5390991568565369, "rewards/VisualizationJSONCombinedORM/std": 0.07181566953659058, "step": 2876, "train_speed(iter/s)": 0.162228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/mean_length": 322.0625, "completions/min_length": 240.0, "epoch": 2.379652605459057, "grad_norm": 0.19828687608242035, "kl": 0.04022216796875, "learning_rate": 1.2471541123721292e-06, "loss": 0.00040302053093910217, "memory(GiB)": 38.13, "reward": 0.45029109716415405, "reward_std": 0.059499986469745636, "rewards/VisualizationJSONCombinedORM/mean": 0.45029109716415405, "rewards/VisualizationJSONCombinedORM/std": 0.058120258152484894, "step": 2877, "train_speed(iter/s)": 0.162029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 297.875, "completions/min_length": 242.0, "epoch": 2.380479735318445, "grad_norm": 0.2225465178489685, "kl": 0.1092529296875, "learning_rate": 1.2439757976947454e-06, "loss": 0.0010909661650657654, "memory(GiB)": 38.13, "reward": 0.5449920296669006, "reward_std": 0.08010777831077576, "rewards/VisualizationJSONCombinedORM/mean": 0.5449920296669006, "rewards/VisualizationJSONCombinedORM/std": 0.1084422767162323, "step": 2878, "train_speed(iter/s)": 0.161829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 342.25, "completions/min_length": 284.0, "epoch": 2.381306865177833, "grad_norm": 0.1811494082212448, "kl": 0.10205078125, "learning_rate": 1.2408009626051137e-06, "loss": 0.0010202638804912567, "memory(GiB)": 38.13, "reward": 0.4403771758079529, "reward_std": 0.06567607820034027, "rewards/VisualizationJSONCombinedORM/mean": 0.4403771758079529, "rewards/VisualizationJSONCombinedORM/std": 0.07672132551670074, "step": 2879, "train_speed(iter/s)": 0.161643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 334.75, "completions/min_length": 256.0, "epoch": 2.3821339950372207, "grad_norm": 0.18190297484397888, "kl": 0.1251220703125, "learning_rate": 1.2376296100444092e-06, "loss": 0.0012501180171966553, "memory(GiB)": 38.13, "reward": 0.4310462474822998, "reward_std": 0.06265757977962494, "rewards/VisualizationJSONCombinedORM/mean": 0.4310462474822998, "rewards/VisualizationJSONCombinedORM/std": 0.061298470944166183, "step": 2880, "train_speed(iter/s)": 0.161467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 311.75, "completions/min_length": 237.0, "epoch": 2.382961124896609, "grad_norm": 0.2077929824590683, "kl": 0.1109619140625, "learning_rate": 1.2344617429505784e-06, "loss": 0.001107737421989441, "memory(GiB)": 38.13, "reward": 0.660256564617157, "reward_std": 0.09280036389827728, "rewards/VisualizationJSONCombinedORM/mean": 0.660256564617157, "rewards/VisualizationJSONCombinedORM/std": 0.13863764703273773, "step": 2881, "train_speed(iter/s)": 0.161291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 315.875, "completions/min_length": 234.0, "epoch": 2.3837882547559968, "grad_norm": 0.20660215616226196, "kl": 0.095947265625, "learning_rate": 1.2312973642583414e-06, "loss": 0.0009599123150110245, "memory(GiB)": 38.13, "reward": 0.4299890100955963, "reward_std": 0.04467377811670303, "rewards/VisualizationJSONCombinedORM/mean": 0.4299890100955963, "rewards/VisualizationJSONCombinedORM/std": 0.12243455648422241, "step": 2882, "train_speed(iter/s)": 0.161106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 300.5, "completions/min_length": 243.0, "epoch": 2.3846153846153846, "grad_norm": 0.18082132935523987, "kl": 0.0635986328125, "learning_rate": 1.2281364768991804e-06, "loss": 0.0006348416209220886, "memory(GiB)": 38.13, "reward": 0.432349294424057, "reward_std": 0.04871103912591934, "rewards/VisualizationJSONCombinedORM/mean": 0.432349294424057, "rewards/VisualizationJSONCombinedORM/std": 0.0472775362432003, "step": 2883, "train_speed(iter/s)": 0.160946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 292.0, "completions/min_length": 236.0, "epoch": 2.3854425144747724, "grad_norm": 0.2164687067270279, "kl": 0.14306640625, "learning_rate": 1.2249790838013514e-06, "loss": 0.0014320164918899536, "memory(GiB)": 38.13, "reward": 0.5759074091911316, "reward_std": 0.06482377648353577, "rewards/VisualizationJSONCombinedORM/mean": 0.5759074091911316, "rewards/VisualizationJSONCombinedORM/std": 0.1975826472043991, "step": 2884, "train_speed(iter/s)": 0.160784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 325.0625, "completions/min_length": 282.0, "epoch": 2.3862696443341607, "grad_norm": 0.17519143223762512, "kl": 0.06024169921875, "learning_rate": 1.2218251878898641e-06, "loss": 0.0006044358015060425, "memory(GiB)": 38.13, "reward": 0.6493857502937317, "reward_std": 0.055067066103219986, "rewards/VisualizationJSONCombinedORM/mean": 0.6493857502937317, "rewards/VisualizationJSONCombinedORM/std": 0.15741564333438873, "step": 2885, "train_speed(iter/s)": 0.160597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 294.5, "completions/min_length": 236.0, "epoch": 2.3870967741935485, "grad_norm": 0.1900510936975479, "kl": 0.082672119140625, "learning_rate": 1.2186747920864993e-06, "loss": 0.0008280538022518158, "memory(GiB)": 38.13, "reward": 0.5554831624031067, "reward_std": 0.08277442306280136, "rewards/VisualizationJSONCombinedORM/mean": 0.5554831624031067, "rewards/VisualizationJSONCombinedORM/std": 0.0864565446972847, "step": 2886, "train_speed(iter/s)": 0.160435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 281.8125, "completions/min_length": 230.0, "epoch": 2.3879239040529363, "grad_norm": 0.1600506454706192, "kl": 0.06243896484375, "learning_rate": 1.2155278993097853e-06, "loss": 0.0006242990493774414, "memory(GiB)": 38.13, "reward": 0.6838945150375366, "reward_std": 0.07892028987407684, "rewards/VisualizationJSONCombinedORM/mean": 0.6838945150375366, "rewards/VisualizationJSONCombinedORM/std": 0.09300028532743454, "step": 2887, "train_speed(iter/s)": 0.160298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 293.625, "completions/min_length": 245.0, "epoch": 2.388751033912324, "grad_norm": 0.23463894426822662, "kl": 0.0328369140625, "learning_rate": 1.2123845124750122e-06, "loss": 0.00032766908407211304, "memory(GiB)": 38.13, "reward": 0.5578991174697876, "reward_std": 0.08121666312217712, "rewards/VisualizationJSONCombinedORM/mean": 0.5578991174697876, "rewards/VisualizationJSONCombinedORM/std": 0.0819324478507042, "step": 2888, "train_speed(iter/s)": 0.16011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 342.0, "completions/min_length": 268.0, "epoch": 2.389578163771712, "grad_norm": 0.18065321445465088, "kl": 0.05596923828125, "learning_rate": 1.2092446344942165e-06, "loss": 0.0005593057721853256, "memory(GiB)": 38.13, "reward": 0.5079395771026611, "reward_std": 0.061434730887413025, "rewards/VisualizationJSONCombinedORM/mean": 0.5079395771026611, "rewards/VisualizationJSONCombinedORM/std": 0.17719417810440063, "step": 2889, "train_speed(iter/s)": 0.159944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 326.0, "completions/min_length": 242.0, "epoch": 2.3904052936311, "grad_norm": 0.18984222412109375, "kl": 0.0819091796875, "learning_rate": 1.2061082682761888e-06, "loss": 0.0008200109004974365, "memory(GiB)": 38.13, "reward": 0.42844998836517334, "reward_std": 0.05029799044132233, "rewards/VisualizationJSONCombinedORM/mean": 0.42844998836517334, "rewards/VisualizationJSONCombinedORM/std": 0.06015424057841301, "step": 2890, "train_speed(iter/s)": 0.159728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 299.625, "completions/min_length": 243.0, "epoch": 2.391232423490488, "grad_norm": 0.17185290157794952, "kl": 0.03704833984375, "learning_rate": 1.202975416726464e-06, "loss": 0.000370219349861145, "memory(GiB)": 38.13, "reward": 0.5261320471763611, "reward_std": 0.05213724821805954, "rewards/VisualizationJSONCombinedORM/mean": 0.5261320471763611, "rewards/VisualizationJSONCombinedORM/std": 0.094598188996315, "step": 2891, "train_speed(iter/s)": 0.159551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 307.8125, "completions/min_length": 241.0, "epoch": 2.392059553349876, "grad_norm": 0.17129366099834442, "kl": 0.080078125, "learning_rate": 1.199846082747323e-06, "loss": 0.0007992088794708252, "memory(GiB)": 38.13, "reward": 0.4678792655467987, "reward_std": 0.04823624715209007, "rewards/VisualizationJSONCombinedORM/mean": 0.4678792655467987, "rewards/VisualizationJSONCombinedORM/std": 0.20126333832740784, "step": 2892, "train_speed(iter/s)": 0.159419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 314.25, "completions/min_length": 266.0, "epoch": 2.3928866832092637, "grad_norm": 0.18005575239658356, "kl": 0.131103515625, "learning_rate": 1.1967202692377844e-06, "loss": 0.00131254643201828, "memory(GiB)": 38.13, "reward": 0.5299656987190247, "reward_std": 0.03468657657504082, "rewards/VisualizationJSONCombinedORM/mean": 0.5299656987190247, "rewards/VisualizationJSONCombinedORM/std": 0.2655297517776489, "step": 2893, "train_speed(iter/s)": 0.159262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 334.0, "completions/min_length": 251.0, "epoch": 2.393713813068652, "grad_norm": 0.1706504076719284, "kl": 0.07275390625, "learning_rate": 1.1935979790936093e-06, "loss": 0.0007279030978679657, "memory(GiB)": 38.13, "reward": 0.6733537912368774, "reward_std": 0.0620303750038147, "rewards/VisualizationJSONCombinedORM/mean": 0.6733537912368774, "rewards/VisualizationJSONCombinedORM/std": 0.12843722105026245, "step": 2894, "train_speed(iter/s)": 0.159082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 289.25, "completions/min_length": 216.0, "epoch": 2.3945409429280398, "grad_norm": 0.18611258268356323, "kl": 0.139404296875, "learning_rate": 1.1904792152072914e-06, "loss": 0.0013918578624725342, "memory(GiB)": 38.13, "reward": 0.5260463953018188, "reward_std": 0.07560998946428299, "rewards/VisualizationJSONCombinedORM/mean": 0.5260463953018188, "rewards/VisualizationJSONCombinedORM/std": 0.09993410855531693, "step": 2895, "train_speed(iter/s)": 0.158904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 319.875, "completions/min_length": 246.0, "epoch": 2.3953680727874276, "grad_norm": 0.17912395298480988, "kl": 0.127197265625, "learning_rate": 1.1873639804680599e-06, "loss": 0.0012757070362567902, "memory(GiB)": 38.13, "reward": 0.6092246174812317, "reward_std": 0.07177848368883133, "rewards/VisualizationJSONCombinedORM/mean": 0.6092246174812317, "rewards/VisualizationJSONCombinedORM/std": 0.16782091557979584, "step": 2896, "train_speed(iter/s)": 0.158697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 313.75, "completions/min_length": 245.0, "epoch": 2.3961952026468154, "grad_norm": 0.18005281686782837, "kl": 0.06683349609375, "learning_rate": 1.1842522777618742e-06, "loss": 0.0006678029894828796, "memory(GiB)": 38.13, "reward": 0.639201819896698, "reward_std": 0.05686786770820618, "rewards/VisualizationJSONCombinedORM/mean": 0.639201819896698, "rewards/VisualizationJSONCombinedORM/std": 0.1696770042181015, "step": 2897, "train_speed(iter/s)": 0.15851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 285.5625, "completions/min_length": 236.0, "epoch": 2.3970223325062037, "grad_norm": 0.22764550149440765, "kl": 0.0830078125, "learning_rate": 1.1811441099714232e-06, "loss": 0.0008286163210868835, "memory(GiB)": 38.13, "reward": 0.6005830764770508, "reward_std": 0.058283731341362, "rewards/VisualizationJSONCombinedORM/mean": 0.6005830764770508, "rewards/VisualizationJSONCombinedORM/std": 0.06055651232600212, "step": 2898, "train_speed(iter/s)": 0.15831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 293.25, "completions/min_length": 226.0, "epoch": 2.3978494623655915, "grad_norm": 0.20759759843349457, "kl": 0.1280517578125, "learning_rate": 1.1780394799761163e-06, "loss": 0.0012807734310626984, "memory(GiB)": 38.13, "reward": 0.5359499454498291, "reward_std": 0.06076754629611969, "rewards/VisualizationJSONCombinedORM/mean": 0.5359499454498291, "rewards/VisualizationJSONCombinedORM/std": 0.25595900416374207, "step": 2899, "train_speed(iter/s)": 0.158172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 315.6875, "completions/min_length": 264.0, "epoch": 2.3986765922249793, "grad_norm": 0.18597835302352905, "kl": 0.0523681640625, "learning_rate": 1.1749383906520912e-06, "loss": 0.0005238503217697144, "memory(GiB)": 38.13, "reward": 0.43503886461257935, "reward_std": 0.0536976084113121, "rewards/VisualizationJSONCombinedORM/mean": 0.43503886461257935, "rewards/VisualizationJSONCombinedORM/std": 0.12971048057079315, "step": 2900, "train_speed(iter/s)": 0.158003 }, { "epoch": 2.3986765922249793, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 372.7916666666667, "eval_completions/mean_length": 311.1197916666667, "eval_completions/min_length": 257.9583333333333, "eval_kl": 0.08275349934895833, "eval_loss": 0.0008280997280962765, "eval_reward": 0.4649560072769721, "eval_reward_std": 0.06402119660439591, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4649560072769721, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0640211986998717, "eval_runtime": 315.8387, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 342.625, "completions/min_length": 262.0, "epoch": 2.399503722084367, "grad_norm": 0.22309647500514984, "kl": 0.0543212890625, "learning_rate": 1.171840844872198e-06, "loss": 0.0005417242646217346, "memory(GiB)": 38.13, "reward": 0.7063939571380615, "reward_std": 0.08273869007825851, "rewards/VisualizationJSONCombinedORM/mean": 0.7063939571380615, "rewards/VisualizationJSONCombinedORM/std": 0.08043789118528366, "step": 2901, "train_speed(iter/s)": 0.155172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 298.125, "completions/min_length": 251.0, "epoch": 2.400330851943755, "grad_norm": 0.19543446600437164, "kl": 0.065673828125, "learning_rate": 1.1687468455060157e-06, "loss": 0.0006582308560609818, "memory(GiB)": 38.13, "reward": 0.6003671884536743, "reward_std": 0.0671098604798317, "rewards/VisualizationJSONCombinedORM/mean": 0.6003671884536743, "rewards/VisualizationJSONCombinedORM/std": 0.17361347377300262, "step": 2902, "train_speed(iter/s)": 0.15502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 294.75, "completions/min_length": 238.0, "epoch": 2.401157981803143, "grad_norm": 0.22042162716388702, "kl": 0.1009521484375, "learning_rate": 1.1656563954198258e-06, "loss": 0.0010090246796607971, "memory(GiB)": 38.13, "reward": 0.4812985062599182, "reward_std": 0.06341642141342163, "rewards/VisualizationJSONCombinedORM/mean": 0.4812985062599182, "rewards/VisualizationJSONCombinedORM/std": 0.06538157165050507, "step": 2903, "train_speed(iter/s)": 0.154879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 297.125, "completions/min_length": 225.0, "epoch": 2.401985111662531, "grad_norm": 0.2207091897726059, "kl": 0.0438232421875, "learning_rate": 1.16256949747663e-06, "loss": 0.0004380419850349426, "memory(GiB)": 38.13, "reward": 0.53339684009552, "reward_std": 0.0726218968629837, "rewards/VisualizationJSONCombinedORM/mean": 0.53339684009552, "rewards/VisualizationJSONCombinedORM/std": 0.16973994672298431, "step": 2904, "train_speed(iter/s)": 0.154711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 309.0, "completions/min_length": 251.0, "epoch": 2.402812241521919, "grad_norm": 0.18472160398960114, "kl": 0.0989990234375, "learning_rate": 1.1594861545361336e-06, "loss": 0.0009908182546496391, "memory(GiB)": 38.13, "reward": 0.6031419038772583, "reward_std": 0.07933516800403595, "rewards/VisualizationJSONCombinedORM/mean": 0.6031419038772583, "rewards/VisualizationJSONCombinedORM/std": 0.2119949609041214, "step": 2905, "train_speed(iter/s)": 0.154543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 306.375, "completions/min_length": 226.0, "epoch": 2.4036393713813067, "grad_norm": 0.1645716428756714, "kl": 0.05902099609375, "learning_rate": 1.1564063694547523e-06, "loss": 0.0005906596779823303, "memory(GiB)": 38.13, "reward": 0.6454906463623047, "reward_std": 0.0826745480298996, "rewards/VisualizationJSONCombinedORM/mean": 0.6454906463623047, "rewards/VisualizationJSONCombinedORM/std": 0.12333757430315018, "step": 2906, "train_speed(iter/s)": 0.154395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 275.5625, "completions/min_length": 224.0, "epoch": 2.404466501240695, "grad_norm": 0.1795801818370819, "kl": 0.07391357421875, "learning_rate": 1.1533301450856054e-06, "loss": 0.0007410645484924316, "memory(GiB)": 38.13, "reward": 0.463136225938797, "reward_std": 0.049793586134910583, "rewards/VisualizationJSONCombinedORM/mean": 0.463136225938797, "rewards/VisualizationJSONCombinedORM/std": 0.048927657306194305, "step": 2907, "train_speed(iter/s)": 0.154234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 300.75, "completions/min_length": 239.0, "epoch": 2.4052936311000828, "grad_norm": 0.16376852989196777, "kl": 0.0728759765625, "learning_rate": 1.1502574842785135e-06, "loss": 0.0007302574813365936, "memory(GiB)": 38.13, "reward": 0.3954463005065918, "reward_std": 0.07962831854820251, "rewards/VisualizationJSONCombinedORM/mean": 0.3954463005065918, "rewards/VisualizationJSONCombinedORM/std": 0.09315838664770126, "step": 2908, "train_speed(iter/s)": 0.154068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 298.9375, "completions/min_length": 249.0, "epoch": 2.4061207609594706, "grad_norm": 0.26855358481407166, "kl": 0.0682373046875, "learning_rate": 1.1471883898799924e-06, "loss": 0.0006829798221588135, "memory(GiB)": 38.13, "reward": 0.5119291543960571, "reward_std": 0.06589734554290771, "rewards/VisualizationJSONCombinedORM/mean": 0.5119291543960571, "rewards/VisualizationJSONCombinedORM/std": 0.16073808073997498, "step": 2909, "train_speed(iter/s)": 0.153902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 282.0625, "completions/min_length": 243.0, "epoch": 2.4069478908188584, "grad_norm": 0.176192507147789, "kl": 0.0477294921875, "learning_rate": 1.1441228647332602e-06, "loss": 0.0004771128296852112, "memory(GiB)": 38.13, "reward": 0.6394366025924683, "reward_std": 0.0511382520198822, "rewards/VisualizationJSONCombinedORM/mean": 0.6394366025924683, "rewards/VisualizationJSONCombinedORM/std": 0.16411560773849487, "step": 2910, "train_speed(iter/s)": 0.153746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 308.0, "completions/min_length": 222.0, "epoch": 2.4077750206782467, "grad_norm": 0.2369493544101715, "kl": 0.05548095703125, "learning_rate": 1.1410609116782218e-06, "loss": 0.0005553029477596283, "memory(GiB)": 38.13, "reward": 0.5052388906478882, "reward_std": 0.07583293318748474, "rewards/VisualizationJSONCombinedORM/mean": 0.5052388906478882, "rewards/VisualizationJSONCombinedORM/std": 0.1143159568309784, "step": 2911, "train_speed(iter/s)": 0.153541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 351.125, "completions/min_length": 295.0, "epoch": 2.4086021505376345, "grad_norm": 0.17014771699905396, "kl": 0.0706787109375, "learning_rate": 1.1380025335514777e-06, "loss": 0.0007069781422615051, "memory(GiB)": 38.13, "reward": 0.5119103193283081, "reward_std": 0.07572883367538452, "rewards/VisualizationJSONCombinedORM/mean": 0.5119103193283081, "rewards/VisualizationJSONCombinedORM/std": 0.13595207035541534, "step": 2912, "train_speed(iter/s)": 0.153368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 310.4375, "completions/min_length": 241.0, "epoch": 2.4094292803970223, "grad_norm": 0.17808601260185242, "kl": 0.15576171875, "learning_rate": 1.134947733186315e-06, "loss": 0.001555517315864563, "memory(GiB)": 38.13, "reward": 0.5567141175270081, "reward_std": 0.05865253135561943, "rewards/VisualizationJSONCombinedORM/mean": 0.5567141175270081, "rewards/VisualizationJSONCombinedORM/std": 0.15220387279987335, "step": 2913, "train_speed(iter/s)": 0.153205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 306.75, "completions/min_length": 242.0, "epoch": 2.41025641025641, "grad_norm": 0.1778394728899002, "kl": 0.10400390625, "learning_rate": 1.131896513412707e-06, "loss": 0.001041613519191742, "memory(GiB)": 38.13, "reward": 0.49063509702682495, "reward_std": 0.05172112584114075, "rewards/VisualizationJSONCombinedORM/mean": 0.49063509702682495, "rewards/VisualizationJSONCombinedORM/std": 0.1612440049648285, "step": 2914, "train_speed(iter/s)": 0.153056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 297.625, "completions/min_length": 239.0, "epoch": 2.411083540115798, "grad_norm": 0.24258649349212646, "kl": 0.07501220703125, "learning_rate": 1.1288488770573097e-06, "loss": 0.000751301646232605, "memory(GiB)": 38.13, "reward": 0.5538489818572998, "reward_std": 0.11683662235736847, "rewards/VisualizationJSONCombinedORM/mean": 0.5538489818572998, "rewards/VisualizationJSONCombinedORM/std": 0.12958748638629913, "step": 2915, "train_speed(iter/s)": 0.152917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 309.375, "completions/min_length": 257.0, "epoch": 2.411910669975186, "grad_norm": 0.19188544154167175, "kl": 0.071533203125, "learning_rate": 1.1258048269434569e-06, "loss": 0.0007151737809181213, "memory(GiB)": 38.13, "reward": 0.6685011982917786, "reward_std": 0.11993597447872162, "rewards/VisualizationJSONCombinedORM/mean": 0.6685011982917786, "rewards/VisualizationJSONCombinedORM/std": 0.1171216368675232, "step": 2916, "train_speed(iter/s)": 0.152761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 302.9375, "completions/min_length": 238.0, "epoch": 2.412737799834574, "grad_norm": 0.20417335629463196, "kl": 0.0557861328125, "learning_rate": 1.1227643658911647e-06, "loss": 0.0005573686212301254, "memory(GiB)": 38.13, "reward": 0.539294421672821, "reward_std": 0.05881313607096672, "rewards/VisualizationJSONCombinedORM/mean": 0.539294421672821, "rewards/VisualizationJSONCombinedORM/std": 0.15218518674373627, "step": 2917, "train_speed(iter/s)": 0.152553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 312.875, "completions/min_length": 218.0, "epoch": 2.413564929693962, "grad_norm": 0.17417781054973602, "kl": 0.0849609375, "learning_rate": 1.1197274967171178e-06, "loss": 0.0008506346493959427, "memory(GiB)": 38.13, "reward": 0.5085127353668213, "reward_std": 0.045588452368974686, "rewards/VisualizationJSONCombinedORM/mean": 0.5085127353668213, "rewards/VisualizationJSONCombinedORM/std": 0.263304203748703, "step": 2918, "train_speed(iter/s)": 0.152399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 296.0625, "completions/min_length": 249.0, "epoch": 2.41439205955335, "grad_norm": 0.17408502101898193, "kl": 0.04803466796875, "learning_rate": 1.1166942222346828e-06, "loss": 0.00047993846237659454, "memory(GiB)": 38.13, "reward": 0.7499625086784363, "reward_std": 0.0840015783905983, "rewards/VisualizationJSONCombinedORM/mean": 0.7499625086784363, "rewards/VisualizationJSONCombinedORM/std": 0.1250186562538147, "step": 2919, "train_speed(iter/s)": 0.152239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 328.4375, "completions/min_length": 288.0, "epoch": 2.415219189412738, "grad_norm": 0.20184166729450226, "kl": 0.096435546875, "learning_rate": 1.113664545253887e-06, "loss": 0.0009643640369176865, "memory(GiB)": 38.13, "reward": 0.48059970140457153, "reward_std": 0.06856127828359604, "rewards/VisualizationJSONCombinedORM/mean": 0.48059970140457153, "rewards/VisualizationJSONCombinedORM/std": 0.10532744973897934, "step": 2920, "train_speed(iter/s)": 0.152051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 315.3125, "completions/min_length": 236.0, "epoch": 2.4160463192721258, "grad_norm": 0.16754181683063507, "kl": 0.0465087890625, "learning_rate": 1.1106384685814314e-06, "loss": 0.0004659779369831085, "memory(GiB)": 38.13, "reward": 0.503486156463623, "reward_std": 0.052042946219444275, "rewards/VisualizationJSONCombinedORM/mean": 0.503486156463623, "rewards/VisualizationJSONCombinedORM/std": 0.13836659491062164, "step": 2921, "train_speed(iter/s)": 0.151897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 257.4375, "completions/min_length": 202.0, "epoch": 2.4168734491315136, "grad_norm": 0.16953299939632416, "kl": 0.0733642578125, "learning_rate": 1.1076159950206762e-06, "loss": 0.0007339585572481155, "memory(GiB)": 38.13, "reward": 0.40426599979400635, "reward_std": 0.058540504425764084, "rewards/VisualizationJSONCombinedORM/mean": 0.40426599979400635, "rewards/VisualizationJSONCombinedORM/std": 0.10681053251028061, "step": 2922, "train_speed(iter/s)": 0.15178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 321.625, "completions/min_length": 241.0, "epoch": 2.4177005789909014, "grad_norm": 0.18597936630249023, "kl": 0.092041015625, "learning_rate": 1.1045971273716476e-06, "loss": 0.0009210929274559021, "memory(GiB)": 38.13, "reward": 0.5934990644454956, "reward_std": 0.09089609235525131, "rewards/VisualizationJSONCombinedORM/mean": 0.5934990644454956, "rewards/VisualizationJSONCombinedORM/std": 0.10863100737333298, "step": 2923, "train_speed(iter/s)": 0.151616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 278.25, "completions/min_length": 242.0, "epoch": 2.4185277088502897, "grad_norm": 0.19437605142593384, "kl": 0.08807373046875, "learning_rate": 1.101581868431031e-06, "loss": 0.0008805990219116211, "memory(GiB)": 38.13, "reward": 0.5282281637191772, "reward_std": 0.059048768132925034, "rewards/VisualizationJSONCombinedORM/mean": 0.5282281637191772, "rewards/VisualizationJSONCombinedORM/std": 0.2733737528324127, "step": 2924, "train_speed(iter/s)": 0.151516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 322.4375, "completions/min_length": 226.0, "epoch": 2.4193548387096775, "grad_norm": 0.20330239832401276, "kl": 0.05401611328125, "learning_rate": 1.0985702209921677e-06, "loss": 0.0005394220352172852, "memory(GiB)": 38.13, "reward": 0.6657045483589172, "reward_std": 0.0658310204744339, "rewards/VisualizationJSONCombinedORM/mean": 0.6657045483589172, "rewards/VisualizationJSONCombinedORM/std": 0.158987894654274, "step": 2925, "train_speed(iter/s)": 0.151381 }, { "epoch": 2.4193548387096775, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 380.625, "eval_completions/mean_length": 310.1875, "eval_completions/min_length": 254.20833333333334, "eval_kl": 0.06801859537760417, "eval_loss": 0.0006846289034001529, "eval_reward": 0.4453268988678853, "eval_reward_std": 0.05701310567868253, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4453268988678853, "eval_rewards/VisualizationJSONCombinedORM/std": 0.057013108977116644, "eval_runtime": 319.9861, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 327.9375, "completions/min_length": 260.0, "epoch": 2.4201819685690653, "grad_norm": 0.1702013909816742, "kl": 0.04620361328125, "learning_rate": 1.0955621878450523e-06, "loss": 0.0004620775580406189, "memory(GiB)": 38.13, "reward": 0.6886396408081055, "reward_std": 0.07220466434955597, "rewards/VisualizationJSONCombinedORM/mean": 0.6886396408081055, "rewards/VisualizationJSONCombinedORM/std": 0.07381679117679596, "step": 2926, "train_speed(iter/s)": 0.148759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 286.75, "completions/min_length": 245.0, "epoch": 2.421009098428453, "grad_norm": 0.16236147284507751, "kl": 0.12353515625, "learning_rate": 1.0925577717763342e-06, "loss": 0.0012385919690132141, "memory(GiB)": 38.13, "reward": 0.7135647535324097, "reward_std": 0.05789776146411896, "rewards/VisualizationJSONCombinedORM/mean": 0.7135647535324097, "rewards/VisualizationJSONCombinedORM/std": 0.06467188894748688, "step": 2927, "train_speed(iter/s)": 0.148626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 301.5625, "completions/min_length": 250.0, "epoch": 2.421836228287841, "grad_norm": 0.29849886894226074, "kl": 0.1527099609375, "learning_rate": 1.0895569755693076e-06, "loss": 0.0015239734202623367, "memory(GiB)": 38.13, "reward": 0.4988800883293152, "reward_std": 0.07035941630601883, "rewards/VisualizationJSONCombinedORM/mean": 0.4988800883293152, "rewards/VisualizationJSONCombinedORM/std": 0.3154941499233246, "step": 2928, "train_speed(iter/s)": 0.14848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 312.3125, "completions/min_length": 248.0, "epoch": 2.422663358147229, "grad_norm": 0.1759268194437027, "kl": 0.0360107421875, "learning_rate": 1.0865598020039158e-06, "loss": 0.0003601834177970886, "memory(GiB)": 38.13, "reward": 0.3511214852333069, "reward_std": 0.023197414353489876, "rewards/VisualizationJSONCombinedORM/mean": 0.3511214852333069, "rewards/VisualizationJSONCombinedORM/std": 0.08823433518409729, "step": 2929, "train_speed(iter/s)": 0.148344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 316.875, "completions/min_length": 268.0, "epoch": 2.423490488006617, "grad_norm": 0.20409435033798218, "kl": 0.0894775390625, "learning_rate": 1.0835662538567482e-06, "loss": 0.0008942438289523125, "memory(GiB)": 38.13, "reward": 0.5318557024002075, "reward_std": 0.028466438874602318, "rewards/VisualizationJSONCombinedORM/mean": 0.5318557024002075, "rewards/VisualizationJSONCombinedORM/std": 0.21560858190059662, "step": 2930, "train_speed(iter/s)": 0.148181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 320.0625, "completions/min_length": 239.0, "epoch": 2.424317617866005, "grad_norm": 0.243710458278656, "kl": 0.12060546875, "learning_rate": 1.0805763339010329e-06, "loss": 0.0012032687664031982, "memory(GiB)": 38.13, "reward": 0.4845491051673889, "reward_std": 0.050371602177619934, "rewards/VisualizationJSONCombinedORM/mean": 0.4845491051673889, "rewards/VisualizationJSONCombinedORM/std": 0.23646800220012665, "step": 2931, "train_speed(iter/s)": 0.148065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 341.625, "completions/min_length": 296.0, "epoch": 2.425144747725393, "grad_norm": 0.197129487991333, "kl": 0.05706787109375, "learning_rate": 1.0775900449066346e-06, "loss": 0.0005717948079109192, "memory(GiB)": 38.13, "reward": 0.4466049373149872, "reward_std": 0.05288359150290489, "rewards/VisualizationJSONCombinedORM/mean": 0.4466049373149872, "rewards/VisualizationJSONCombinedORM/std": 0.17030291259288788, "step": 2932, "train_speed(iter/s)": 0.147929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 285.25, "completions/min_length": 206.0, "epoch": 2.425971877584781, "grad_norm": 0.20537911355495453, "kl": 0.052001953125, "learning_rate": 1.0746073896400605e-06, "loss": 0.0005205683410167694, "memory(GiB)": 38.13, "reward": 0.6060512065887451, "reward_std": 0.07284460216760635, "rewards/VisualizationJSONCombinedORM/mean": 0.6060512065887451, "rewards/VisualizationJSONCombinedORM/std": 0.07076744735240936, "step": 2933, "train_speed(iter/s)": 0.147795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 323.375, "completions/min_length": 262.0, "epoch": 2.4267990074441688, "grad_norm": 0.16924786567687988, "kl": 0.0577392578125, "learning_rate": 1.0716283708644431e-06, "loss": 0.0005775019526481628, "memory(GiB)": 38.13, "reward": 0.3595888614654541, "reward_std": 0.03201065957546234, "rewards/VisualizationJSONCombinedORM/mean": 0.3595888614654541, "rewards/VisualizationJSONCombinedORM/std": 0.06615714728832245, "step": 2934, "train_speed(iter/s)": 0.147656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 270.5, "completions/min_length": 242.0, "epoch": 2.4276261373035566, "grad_norm": 0.20510034263134003, "kl": 0.04608154296875, "learning_rate": 1.0686529913395572e-06, "loss": 0.0004606805741786957, "memory(GiB)": 38.13, "reward": 0.5514423251152039, "reward_std": 0.06697729229927063, "rewards/VisualizationJSONCombinedORM/mean": 0.5514423251152039, "rewards/VisualizationJSONCombinedORM/std": 0.18125039339065552, "step": 2935, "train_speed(iter/s)": 0.147538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 318.125, "completions/min_length": 249.0, "epoch": 2.4284532671629444, "grad_norm": 0.1930101066827774, "kl": 0.07452392578125, "learning_rate": 1.0656812538217953e-06, "loss": 0.0007454212754964828, "memory(GiB)": 38.13, "reward": 0.4041706919670105, "reward_std": 0.1366981863975525, "rewards/VisualizationJSONCombinedORM/mean": 0.4041706919670105, "rewards/VisualizationJSONCombinedORM/std": 0.17520268261432648, "step": 2936, "train_speed(iter/s)": 0.147377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 311.5, "completions/min_length": 223.0, "epoch": 2.4292803970223327, "grad_norm": 0.17949184775352478, "kl": 0.10150146484375, "learning_rate": 1.0627131610641829e-06, "loss": 0.0010137483477592468, "memory(GiB)": 38.13, "reward": 0.6657885909080505, "reward_std": 0.07623157650232315, "rewards/VisualizationJSONCombinedORM/mean": 0.6657885909080505, "rewards/VisualizationJSONCombinedORM/std": 0.10429783910512924, "step": 2937, "train_speed(iter/s)": 0.14719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 310.4375, "completions/min_length": 224.0, "epoch": 2.4301075268817205, "grad_norm": 0.17936857044696808, "kl": 0.02899169921875, "learning_rate": 1.0597487158163644e-06, "loss": 0.00028984248638153076, "memory(GiB)": 38.13, "reward": 0.46913981437683105, "reward_std": 0.05147411301732063, "rewards/VisualizationJSONCombinedORM/mean": 0.46913981437683105, "rewards/VisualizationJSONCombinedORM/std": 0.2535932958126068, "step": 2938, "train_speed(iter/s)": 0.147059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 294.9375, "completions/min_length": 227.0, "epoch": 2.4309346567411083, "grad_norm": 0.29698050022125244, "kl": 0.04632568359375, "learning_rate": 1.0567879208246084e-06, "loss": 0.00046272575855255127, "memory(GiB)": 38.13, "reward": 0.4882761538028717, "reward_std": 0.07232071459293365, "rewards/VisualizationJSONCombinedORM/mean": 0.4882761538028717, "rewards/VisualizationJSONCombinedORM/std": 0.2699123024940491, "step": 2939, "train_speed(iter/s)": 0.146922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 307.8125, "completions/min_length": 242.0, "epoch": 2.431761786600496, "grad_norm": 0.1632246971130371, "kl": 0.04132080078125, "learning_rate": 1.0538307788318014e-06, "loss": 0.00041434168815612793, "memory(GiB)": 38.13, "reward": 0.5316950082778931, "reward_std": 0.05011191964149475, "rewards/VisualizationJSONCombinedORM/mean": 0.5316950082778931, "rewards/VisualizationJSONCombinedORM/std": 0.15055352449417114, "step": 2940, "train_speed(iter/s)": 0.146773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 308.125, "completions/min_length": 248.0, "epoch": 2.432588916459884, "grad_norm": 0.3736061453819275, "kl": 0.1343994140625, "learning_rate": 1.0508772925774458e-06, "loss": 0.0013433843851089478, "memory(GiB)": 38.13, "reward": 0.4994492530822754, "reward_std": 0.0908481702208519, "rewards/VisualizationJSONCombinedORM/mean": 0.4994492530822754, "rewards/VisualizationJSONCombinedORM/std": 0.1979772448539734, "step": 2941, "train_speed(iter/s)": 0.146616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 334.375, "completions/min_length": 213.0, "epoch": 2.433416046319272, "grad_norm": 0.16024541854858398, "kl": 0.060546875, "learning_rate": 1.0479274647976546e-06, "loss": 0.0006048977375030518, "memory(GiB)": 38.13, "reward": 0.5984420776367188, "reward_std": 0.05428147315979004, "rewards/VisualizationJSONCombinedORM/mean": 0.5984420776367188, "rewards/VisualizationJSONCombinedORM/std": 0.17153088748455048, "step": 2942, "train_speed(iter/s)": 0.146456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 317.125, "completions/min_length": 261.0, "epoch": 2.43424317617866, "grad_norm": 0.19747185707092285, "kl": 0.054443359375, "learning_rate": 1.0449812982251556e-06, "loss": 0.0005452055484056473, "memory(GiB)": 38.13, "reward": 0.3883321285247803, "reward_std": 0.05617062374949455, "rewards/VisualizationJSONCombinedORM/mean": 0.3883321285247803, "rewards/VisualizationJSONCombinedORM/std": 0.08827699720859528, "step": 2943, "train_speed(iter/s)": 0.146312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 313.6875, "completions/min_length": 236.0, "epoch": 2.435070306038048, "grad_norm": 0.1688700169324875, "kl": 0.08203125, "learning_rate": 1.0420387955892814e-06, "loss": 0.0008211061358451843, "memory(GiB)": 38.13, "reward": 0.3967565894126892, "reward_std": 0.052732452750205994, "rewards/VisualizationJSONCombinedORM/mean": 0.3967565894126892, "rewards/VisualizationJSONCombinedORM/std": 0.07041123509407043, "step": 2944, "train_speed(iter/s)": 0.146155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 307.0, "completions/min_length": 247.0, "epoch": 2.435897435897436, "grad_norm": 0.17772932350635529, "kl": 0.08782958984375, "learning_rate": 1.0390999596159724e-06, "loss": 0.0008773505687713623, "memory(GiB)": 38.13, "reward": 0.5422438979148865, "reward_std": 0.04316449910402298, "rewards/VisualizationJSONCombinedORM/mean": 0.5422438979148865, "rewards/VisualizationJSONCombinedORM/std": 0.19743099808692932, "step": 2945, "train_speed(iter/s)": 0.145991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 294.1875, "completions/min_length": 234.0, "epoch": 2.436724565756824, "grad_norm": 0.1940850019454956, "kl": 0.06585693359375, "learning_rate": 1.0361647930277719e-06, "loss": 0.0006590932607650757, "memory(GiB)": 38.13, "reward": 0.524627685546875, "reward_std": 0.09003743529319763, "rewards/VisualizationJSONCombinedORM/mean": 0.524627685546875, "rewards/VisualizationJSONCombinedORM/std": 0.22818304598331451, "step": 2946, "train_speed(iter/s)": 0.145879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 299.625, "completions/min_length": 238.0, "epoch": 2.4375516956162118, "grad_norm": 0.17383886873722076, "kl": 0.04693603515625, "learning_rate": 1.0332332985438248e-06, "loss": 0.0004691481590270996, "memory(GiB)": 38.13, "reward": 0.5576443076133728, "reward_std": 0.07112924754619598, "rewards/VisualizationJSONCombinedORM/mean": 0.5576443076133728, "rewards/VisualizationJSONCombinedORM/std": 0.215776726603508, "step": 2947, "train_speed(iter/s)": 0.145741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 276.6875, "completions/min_length": 230.0, "epoch": 2.4383788254755996, "grad_norm": 0.1707073301076889, "kl": 0.0941162109375, "learning_rate": 1.030305478879871e-06, "loss": 0.0009394586086273193, "memory(GiB)": 38.13, "reward": 0.648506760597229, "reward_std": 0.1098029613494873, "rewards/VisualizationJSONCombinedORM/mean": 0.648506760597229, "rewards/VisualizationJSONCombinedORM/std": 0.11775278300046921, "step": 2948, "train_speed(iter/s)": 0.14558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 298.1875, "completions/min_length": 238.0, "epoch": 2.4392059553349874, "grad_norm": 0.20144078135490417, "kl": 0.05615234375, "learning_rate": 1.02738133674825e-06, "loss": 0.0005603116005659103, "memory(GiB)": 38.13, "reward": 0.33326369524002075, "reward_std": 0.03076372854411602, "rewards/VisualizationJSONCombinedORM/mean": 0.33326369524002075, "rewards/VisualizationJSONCombinedORM/std": 0.13386698067188263, "step": 2949, "train_speed(iter/s)": 0.145405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 321.6875, "completions/min_length": 248.0, "epoch": 2.4400330851943757, "grad_norm": 0.1788010597229004, "kl": 0.079345703125, "learning_rate": 1.0244608748578884e-06, "loss": 0.0007943958044052124, "memory(GiB)": 38.13, "reward": 0.39750924706459045, "reward_std": 0.04824011027812958, "rewards/VisualizationJSONCombinedORM/mean": 0.39750924706459045, "rewards/VisualizationJSONCombinedORM/std": 0.08067508041858673, "step": 2950, "train_speed(iter/s)": 0.145287 }, { "epoch": 2.4400330851943757, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 359.0, "eval_completions/mean_length": 301.0520833333333, "eval_completions/min_length": 250.0, "eval_kl": 0.0714569091796875, "eval_loss": 0.0007202737033367157, "eval_reward": 0.44263729577263194, "eval_reward_std": 0.05514186678919941, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44263729577263194, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0551418693891416, "eval_runtime": 307.5431, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 298.875, "completions/min_length": 259.0, "epoch": 2.4408602150537635, "grad_norm": 0.2287103831768036, "kl": 0.054931640625, "learning_rate": 1.0215440959143137e-06, "loss": 0.0005507078021764755, "memory(GiB)": 38.13, "reward": 0.5201225280761719, "reward_std": 0.08717092871665955, "rewards/VisualizationJSONCombinedORM/mean": 0.5201225280761719, "rewards/VisualizationJSONCombinedORM/std": 0.0983569324016571, "step": 2951, "train_speed(iter/s)": 0.143014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 277.75, "completions/min_length": 224.0, "epoch": 2.4416873449131513, "grad_norm": 0.13955159485340118, "kl": 0.1866455078125, "learning_rate": 1.01863100261963e-06, "loss": 0.0018626600503921509, "memory(GiB)": 38.13, "reward": 0.5714848637580872, "reward_std": 0.08555548638105392, "rewards/VisualizationJSONCombinedORM/mean": 0.5714848637580872, "rewards/VisualizationJSONCombinedORM/std": 0.08961783349514008, "step": 2952, "train_speed(iter/s)": 0.142858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 291.4375, "completions/min_length": 227.0, "epoch": 2.442514474772539, "grad_norm": 0.19140280783176422, "kl": 0.06451416015625, "learning_rate": 1.015721597672536e-06, "loss": 0.0006456747651100159, "memory(GiB)": 38.13, "reward": 0.5764929056167603, "reward_std": 0.08737426996231079, "rewards/VisualizationJSONCombinedORM/mean": 0.5764929056167603, "rewards/VisualizationJSONCombinedORM/std": 0.1362665444612503, "step": 2953, "train_speed(iter/s)": 0.142731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 286.3125, "completions/min_length": 226.0, "epoch": 2.4433416046319274, "grad_norm": 0.2412799447774887, "kl": 0.0572509765625, "learning_rate": 1.0128158837683071e-06, "loss": 0.0005720332264900208, "memory(GiB)": 38.13, "reward": 0.4826202094554901, "reward_std": 0.08226819336414337, "rewards/VisualizationJSONCombinedORM/mean": 0.4826202094554901, "rewards/VisualizationJSONCombinedORM/std": 0.08884495496749878, "step": 2954, "train_speed(iter/s)": 0.142599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 276.375, "completions/min_length": 229.0, "epoch": 2.444168734491315, "grad_norm": 0.16864463686943054, "kl": 0.10595703125, "learning_rate": 1.0099138635988026e-06, "loss": 0.0010603517293930054, "memory(GiB)": 38.13, "reward": 0.5070414543151855, "reward_std": 0.053518619388341904, "rewards/VisualizationJSONCombinedORM/mean": 0.5070414543151855, "rewards/VisualizationJSONCombinedORM/std": 0.24364377558231354, "step": 2955, "train_speed(iter/s)": 0.142495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 340.5, "completions/min_length": 293.0, "epoch": 2.444995864350703, "grad_norm": 0.15502414107322693, "kl": 0.134765625, "learning_rate": 1.00701553985246e-06, "loss": 0.0013462454080581665, "memory(GiB)": 38.13, "reward": 0.6916218996047974, "reward_std": 0.06971211731433868, "rewards/VisualizationJSONCombinedORM/mean": 0.6916218996047974, "rewards/VisualizationJSONCombinedORM/std": 0.07131397724151611, "step": 2956, "train_speed(iter/s)": 0.142325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 307.25, "completions/min_length": 236.0, "epoch": 2.445822994210091, "grad_norm": 0.22571934759616852, "kl": 0.04718017578125, "learning_rate": 1.0041209152142934e-06, "loss": 0.0004713311791419983, "memory(GiB)": 38.13, "reward": 0.5870405435562134, "reward_std": 0.0893951952457428, "rewards/VisualizationJSONCombinedORM/mean": 0.5870405435562134, "rewards/VisualizationJSONCombinedORM/std": 0.10788021981716156, "step": 2957, "train_speed(iter/s)": 0.142196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 305.875, "completions/min_length": 236.0, "epoch": 2.446650124069479, "grad_norm": 0.16662664711475372, "kl": 0.0592041015625, "learning_rate": 1.0012299923658848e-06, "loss": 0.0005907630547881126, "memory(GiB)": 38.13, "reward": 0.3816058337688446, "reward_std": 0.03443758934736252, "rewards/VisualizationJSONCombinedORM/mean": 0.3816058337688446, "rewards/VisualizationJSONCombinedORM/std": 0.06392265856266022, "step": 2958, "train_speed(iter/s)": 0.142072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 282.5, "completions/min_length": 234.0, "epoch": 2.447477253928867, "grad_norm": 0.19316726922988892, "kl": 0.0523681640625, "learning_rate": 9.983427739853935e-07, "loss": 0.0005228146910667419, "memory(GiB)": 38.13, "reward": 0.4598885178565979, "reward_std": 0.06257229298353195, "rewards/VisualizationJSONCombinedORM/mean": 0.4598885178565979, "rewards/VisualizationJSONCombinedORM/std": 0.06726911664009094, "step": 2959, "train_speed(iter/s)": 0.141949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 300.5, "completions/min_length": 264.0, "epoch": 2.4483043837882548, "grad_norm": 0.201650470495224, "kl": 0.0447998046875, "learning_rate": 9.954592627475412e-07, "loss": 0.0004477947950363159, "memory(GiB)": 38.13, "reward": 0.5723230838775635, "reward_std": 0.07715156674385071, "rewards/VisualizationJSONCombinedORM/mean": 0.5723230838775635, "rewards/VisualizationJSONCombinedORM/std": 0.10660667717456818, "step": 2960, "train_speed(iter/s)": 0.141823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 291.125, "completions/min_length": 238.0, "epoch": 2.4491315136476426, "grad_norm": 0.20047566294670105, "kl": 0.04815673828125, "learning_rate": 9.925794613236201e-07, "loss": 0.0004812180995941162, "memory(GiB)": 38.13, "reward": 0.6246466636657715, "reward_std": 0.09046381711959839, "rewards/VisualizationJSONCombinedORM/mean": 0.6246466636657715, "rewards/VisualizationJSONCombinedORM/std": 0.10307124257087708, "step": 2961, "train_speed(iter/s)": 0.141695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 308.75, "completions/min_length": 251.0, "epoch": 2.4499586435070304, "grad_norm": 0.2055104523897171, "kl": 0.08282470703125, "learning_rate": 9.897033723814824e-07, "loss": 0.0008314624428749084, "memory(GiB)": 38.13, "reward": 0.4422847628593445, "reward_std": 0.053692661225795746, "rewards/VisualizationJSONCombinedORM/mean": 0.4422847628593445, "rewards/VisualizationJSONCombinedORM/std": 0.05757415294647217, "step": 2962, "train_speed(iter/s)": 0.14156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 303.6875, "completions/min_length": 258.0, "epoch": 2.4507857733664187, "grad_norm": 0.23755981028079987, "kl": 0.056640625, "learning_rate": 9.868309985855446e-07, "loss": 0.0005665943026542664, "memory(GiB)": 38.13, "reward": 0.677216649055481, "reward_std": 0.08180109411478043, "rewards/VisualizationJSONCombinedORM/mean": 0.677216649055481, "rewards/VisualizationJSONCombinedORM/std": 0.08086023479700089, "step": 2963, "train_speed(iter/s)": 0.141454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 290.875, "completions/min_length": 231.0, "epoch": 2.4516129032258065, "grad_norm": 0.1712740659713745, "kl": 0.13427734375, "learning_rate": 9.83962342596776e-07, "loss": 0.0013416558504104614, "memory(GiB)": 38.13, "reward": 0.5427117347717285, "reward_std": 0.07406660914421082, "rewards/VisualizationJSONCombinedORM/mean": 0.5427117347717285, "rewards/VisualizationJSONCombinedORM/std": 0.12571217119693756, "step": 2964, "train_speed(iter/s)": 0.141286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 278.1875, "completions/min_length": 224.0, "epoch": 2.4524400330851943, "grad_norm": 0.2651214003562927, "kl": 0.033294677734375, "learning_rate": 9.810974070727057e-07, "loss": 0.0003332570195198059, "memory(GiB)": 38.13, "reward": 0.36213982105255127, "reward_std": 0.08572734892368317, "rewards/VisualizationJSONCombinedORM/mean": 0.36213982105255127, "rewards/VisualizationJSONCombinedORM/std": 0.15107321739196777, "step": 2965, "train_speed(iter/s)": 0.141151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 324.6875, "completions/min_length": 242.0, "epoch": 2.453267162944582, "grad_norm": 0.18376439809799194, "kl": 0.05670166015625, "learning_rate": 9.782361946674173e-07, "loss": 0.0005676820874214172, "memory(GiB)": 38.13, "reward": 0.48847463726997375, "reward_std": 0.07216699421405792, "rewards/VisualizationJSONCombinedORM/mean": 0.48847463726997375, "rewards/VisualizationJSONCombinedORM/std": 0.2386666238307953, "step": 2966, "train_speed(iter/s)": 0.141019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 287.9375, "completions/min_length": 227.0, "epoch": 2.4540942928039704, "grad_norm": 0.19329199194908142, "kl": 0.09576416015625, "learning_rate": 9.753787080315385e-07, "loss": 0.00095352903008461, "memory(GiB)": 38.13, "reward": 0.6862945556640625, "reward_std": 0.06226731836795807, "rewards/VisualizationJSONCombinedORM/mean": 0.6862945556640625, "rewards/VisualizationJSONCombinedORM/std": 0.11203581839799881, "step": 2967, "train_speed(iter/s)": 0.140905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 318.125, "completions/min_length": 237.0, "epoch": 2.454921422663358, "grad_norm": 0.2109985500574112, "kl": 0.07305908203125, "learning_rate": 9.725249498122563e-07, "loss": 0.000730026513338089, "memory(GiB)": 38.13, "reward": 0.4571612477302551, "reward_std": 0.10089505463838577, "rewards/VisualizationJSONCombinedORM/mean": 0.4571612477302551, "rewards/VisualizationJSONCombinedORM/std": 0.21690115332603455, "step": 2968, "train_speed(iter/s)": 0.140769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 310.25, "completions/min_length": 224.0, "epoch": 2.455748552522746, "grad_norm": 0.1757606565952301, "kl": 0.042724609375, "learning_rate": 9.696749226532937e-07, "loss": 0.00042753666639328003, "memory(GiB)": 38.13, "reward": 0.6494609713554382, "reward_std": 0.08368102461099625, "rewards/VisualizationJSONCombinedORM/mean": 0.6494609713554382, "rewards/VisualizationJSONCombinedORM/std": 0.08791268616914749, "step": 2969, "train_speed(iter/s)": 0.140647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 277.4375, "completions/min_length": 233.0, "epoch": 2.456575682382134, "grad_norm": 0.2210398018360138, "kl": 0.1160888671875, "learning_rate": 9.668286291949224e-07, "loss": 0.0011623506434261799, "memory(GiB)": 38.13, "reward": 0.5544067621231079, "reward_std": 0.0721065104007721, "rewards/VisualizationJSONCombinedORM/mean": 0.5544067621231079, "rewards/VisualizationJSONCombinedORM/std": 0.20138947665691376, "step": 2970, "train_speed(iter/s)": 0.140523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 319.4375, "completions/min_length": 249.0, "epoch": 2.457402812241522, "grad_norm": 0.17447902262210846, "kl": 0.1011962890625, "learning_rate": 9.639860720739524e-07, "loss": 0.0010140538215637207, "memory(GiB)": 38.13, "reward": 0.43238404393196106, "reward_std": 0.041850388050079346, "rewards/VisualizationJSONCombinedORM/mean": 0.43238404393196106, "rewards/VisualizationJSONCombinedORM/std": 0.12863942980766296, "step": 2971, "train_speed(iter/s)": 0.140387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 305.8125, "completions/min_length": 237.0, "epoch": 2.45822994210091, "grad_norm": 0.1961081475019455, "kl": 0.075439453125, "learning_rate": 9.611472539237354e-07, "loss": 0.0007537975907325745, "memory(GiB)": 38.13, "reward": 0.4508674442768097, "reward_std": 0.05132141336798668, "rewards/VisualizationJSONCombinedORM/mean": 0.4508674442768097, "rewards/VisualizationJSONCombinedORM/std": 0.07352671027183533, "step": 2972, "train_speed(iter/s)": 0.140276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 294.75, "completions/min_length": 249.0, "epoch": 2.4590570719602978, "grad_norm": 0.19065910577774048, "kl": 0.06622314453125, "learning_rate": 9.583121773741571e-07, "loss": 0.0006622243672609329, "memory(GiB)": 38.13, "reward": 0.5030311942100525, "reward_std": 0.06970586627721786, "rewards/VisualizationJSONCombinedORM/mean": 0.5030311942100525, "rewards/VisualizationJSONCombinedORM/std": 0.211721271276474, "step": 2973, "train_speed(iter/s)": 0.140154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 311.125, "completions/min_length": 252.0, "epoch": 2.4598842018196856, "grad_norm": 0.16743682324886322, "kl": 0.0814208984375, "learning_rate": 9.55480845051639e-07, "loss": 0.0008115582168102264, "memory(GiB)": 38.13, "reward": 0.4608269929885864, "reward_std": 0.0483819879591465, "rewards/VisualizationJSONCombinedORM/mean": 0.4608269929885864, "rewards/VisualizationJSONCombinedORM/std": 0.3070509433746338, "step": 2974, "train_speed(iter/s)": 0.140023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 304.6875, "completions/min_length": 248.0, "epoch": 2.4607113316790734, "grad_norm": 0.27581652998924255, "kl": 0.052734375, "learning_rate": 9.526532595791305e-07, "loss": 0.0005274489521980286, "memory(GiB)": 38.13, "reward": 0.4284970164299011, "reward_std": 0.04808808118104935, "rewards/VisualizationJSONCombinedORM/mean": 0.4284970164299011, "rewards/VisualizationJSONCombinedORM/std": 0.1862124651670456, "step": 2975, "train_speed(iter/s)": 0.139896 }, { "epoch": 2.4607113316790734, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 364.6666666666667, "eval_completions/mean_length": 308.09375, "eval_completions/min_length": 252.20833333333334, "eval_kl": 0.08851114908854167, "eval_loss": 0.0008960685809142888, "eval_reward": 0.46413589765628177, "eval_reward_std": 0.06305726788317163, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46413589765628177, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06305726997864743, "eval_runtime": 311.1698, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 296.9375, "completions/min_length": 253.0, "epoch": 2.4615384615384617, "grad_norm": 0.24990713596343994, "kl": 0.08746337890625, "learning_rate": 9.498294235761141e-07, "loss": 0.0008734613656997681, "memory(GiB)": 38.13, "reward": 0.3913986086845398, "reward_std": 0.046911824494600296, "rewards/VisualizationJSONCombinedORM/mean": 0.3913986086845398, "rewards/VisualizationJSONCombinedORM/std": 0.061769407242536545, "step": 2976, "train_speed(iter/s)": 0.137742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 291.75, "completions/min_length": 229.0, "epoch": 2.4623655913978495, "grad_norm": 0.17209340631961823, "kl": 0.0550537109375, "learning_rate": 9.470093396585945e-07, "loss": 0.0005498062819242477, "memory(GiB)": 38.13, "reward": 0.48776865005493164, "reward_std": 0.05407359451055527, "rewards/VisualizationJSONCombinedORM/mean": 0.48776865005493164, "rewards/VisualizationJSONCombinedORM/std": 0.30277204513549805, "step": 2977, "train_speed(iter/s)": 0.137633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 297.125, "completions/min_length": 244.0, "epoch": 2.4631927212572373, "grad_norm": 0.16617333889007568, "kl": 0.06072998046875, "learning_rate": 9.441930104391034e-07, "loss": 0.000607931986451149, "memory(GiB)": 38.13, "reward": 0.7127804756164551, "reward_std": 0.05705709010362625, "rewards/VisualizationJSONCombinedORM/mean": 0.7127804756164551, "rewards/VisualizationJSONCombinedORM/std": 0.0800456553697586, "step": 2978, "train_speed(iter/s)": 0.137506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 309.5625, "completions/min_length": 215.0, "epoch": 2.464019851116625, "grad_norm": 0.19845585525035858, "kl": 0.0469970703125, "learning_rate": 9.41380438526694e-07, "loss": 0.0004699304699897766, "memory(GiB)": 38.13, "reward": 0.7215426564216614, "reward_std": 0.0864911898970604, "rewards/VisualizationJSONCombinedORM/mean": 0.7215426564216614, "rewards/VisualizationJSONCombinedORM/std": 0.08876141160726547, "step": 2979, "train_speed(iter/s)": 0.137378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 320.5625, "completions/min_length": 252.0, "epoch": 2.4648469809760134, "grad_norm": 0.2046106606721878, "kl": 0.06689453125, "learning_rate": 9.385716265269384e-07, "loss": 0.0006703436374664307, "memory(GiB)": 38.13, "reward": 0.5348284840583801, "reward_std": 0.05872560665011406, "rewards/VisualizationJSONCombinedORM/mean": 0.5348284840583801, "rewards/VisualizationJSONCombinedORM/std": 0.24501945078372955, "step": 2980, "train_speed(iter/s)": 0.137261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 319.25, "completions/min_length": 259.0, "epoch": 2.4656741108354012, "grad_norm": 0.23430530726909637, "kl": 0.0997314453125, "learning_rate": 9.357665770419244e-07, "loss": 0.0009977594017982483, "memory(GiB)": 38.13, "reward": 0.595772922039032, "reward_std": 0.0793510377407074, "rewards/VisualizationJSONCombinedORM/mean": 0.595772922039032, "rewards/VisualizationJSONCombinedORM/std": 0.15985049307346344, "step": 2981, "train_speed(iter/s)": 0.137136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 283.375, "completions/min_length": 211.0, "epoch": 2.466501240694789, "grad_norm": 0.2201472967863083, "kl": 0.08935546875, "learning_rate": 9.329652926702559e-07, "loss": 0.0008937716484069824, "memory(GiB)": 38.13, "reward": 0.5182389616966248, "reward_std": 0.08505110442638397, "rewards/VisualizationJSONCombinedORM/mean": 0.5182389616966248, "rewards/VisualizationJSONCombinedORM/std": 0.15061695873737335, "step": 2982, "train_speed(iter/s)": 0.136996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 300.1875, "completions/min_length": 256.0, "epoch": 2.467328370554177, "grad_norm": 0.2551766037940979, "kl": 0.07562255859375, "learning_rate": 9.301677760070449e-07, "loss": 0.0007551182061433792, "memory(GiB)": 38.13, "reward": 0.32644134759902954, "reward_std": 0.03859843313694, "rewards/VisualizationJSONCombinedORM/mean": 0.32644134759902954, "rewards/VisualizationJSONCombinedORM/std": 0.049529027193784714, "step": 2983, "train_speed(iter/s)": 0.136896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 297.375, "completions/min_length": 235.0, "epoch": 2.468155500413565, "grad_norm": 0.1756693720817566, "kl": 0.07635498046875, "learning_rate": 9.273740296439204e-07, "loss": 0.0007641501724720001, "memory(GiB)": 38.13, "reward": 0.5345067381858826, "reward_std": 0.06298024952411652, "rewards/VisualizationJSONCombinedORM/mean": 0.5345067381858826, "rewards/VisualizationJSONCombinedORM/std": 0.2759883403778076, "step": 2984, "train_speed(iter/s)": 0.136783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 323.875, "completions/min_length": 257.0, "epoch": 2.468982630272953, "grad_norm": 0.18282312154769897, "kl": 0.0992431640625, "learning_rate": 9.245840561690117e-07, "loss": 0.0009927228093147278, "memory(GiB)": 38.13, "reward": 0.6294248104095459, "reward_std": 0.07117894291877747, "rewards/VisualizationJSONCombinedORM/mean": 0.6294248104095459, "rewards/VisualizationJSONCombinedORM/std": 0.08626634627580643, "step": 2985, "train_speed(iter/s)": 0.136699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 317.8125, "completions/min_length": 229.0, "epoch": 2.4698097601323408, "grad_norm": 0.16589070856571198, "kl": 0.0643310546875, "learning_rate": 9.217978581669562e-07, "loss": 0.0006435513496398926, "memory(GiB)": 38.13, "reward": 0.6696349382400513, "reward_std": 0.09241096675395966, "rewards/VisualizationJSONCombinedORM/mean": 0.6696349382400513, "rewards/VisualizationJSONCombinedORM/std": 0.09745781868696213, "step": 2986, "train_speed(iter/s)": 0.136605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 313.125, "completions/min_length": 254.0, "epoch": 2.4706368899917286, "grad_norm": 0.23690004646778107, "kl": 0.1185302734375, "learning_rate": 9.190154382188921e-07, "loss": 0.0011855252087116241, "memory(GiB)": 38.13, "reward": 0.4543570578098297, "reward_std": 0.05411927402019501, "rewards/VisualizationJSONCombinedORM/mean": 0.4543570578098297, "rewards/VisualizationJSONCombinedORM/std": 0.055502768605947495, "step": 2987, "train_speed(iter/s)": 0.136491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 314.875, "completions/min_length": 256.0, "epoch": 2.4714640198511164, "grad_norm": 0.18648512661457062, "kl": 0.0504150390625, "learning_rate": 9.162367989024584e-07, "loss": 0.0005040615797042847, "memory(GiB)": 38.13, "reward": 0.5321052670478821, "reward_std": 0.08034457266330719, "rewards/VisualizationJSONCombinedORM/mean": 0.5321052670478821, "rewards/VisualizationJSONCombinedORM/std": 0.16791915893554688, "step": 2988, "train_speed(iter/s)": 0.136367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 305.6875, "completions/min_length": 227.0, "epoch": 2.4722911497105047, "grad_norm": 0.17597903311252594, "kl": 0.09552001953125, "learning_rate": 9.134619427917918e-07, "loss": 0.000957077369093895, "memory(GiB)": 38.13, "reward": 0.7876731753349304, "reward_std": 0.06829411536455154, "rewards/VisualizationJSONCombinedORM/mean": 0.7876731753349304, "rewards/VisualizationJSONCombinedORM/std": 0.09429998695850372, "step": 2989, "train_speed(iter/s)": 0.136254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 298.0625, "completions/min_length": 233.0, "epoch": 2.4731182795698925, "grad_norm": 0.16173022985458374, "kl": 0.0672607421875, "learning_rate": 9.106908724575258e-07, "loss": 0.0006729811429977417, "memory(GiB)": 38.13, "reward": 0.7074184417724609, "reward_std": 0.08291084319353104, "rewards/VisualizationJSONCombinedORM/mean": 0.7074184417724609, "rewards/VisualizationJSONCombinedORM/std": 0.1055794358253479, "step": 2990, "train_speed(iter/s)": 0.136114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 286.3125, "completions/min_length": 229.0, "epoch": 2.4739454094292803, "grad_norm": 0.21810618042945862, "kl": 0.076416015625, "learning_rate": 9.079235904667826e-07, "loss": 0.0007654409855604172, "memory(GiB)": 38.13, "reward": 0.6872261166572571, "reward_std": 0.09534355998039246, "rewards/VisualizationJSONCombinedORM/mean": 0.6872261166572571, "rewards/VisualizationJSONCombinedORM/std": 0.09416095912456512, "step": 2991, "train_speed(iter/s)": 0.136004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 311.1875, "completions/min_length": 254.0, "epoch": 2.474772539288668, "grad_norm": 0.1782972812652588, "kl": 0.0386962890625, "learning_rate": 9.051600993831799e-07, "loss": 0.0003869384527206421, "memory(GiB)": 38.13, "reward": 0.6912422180175781, "reward_std": 0.0720672458410263, "rewards/VisualizationJSONCombinedORM/mean": 0.6912422180175781, "rewards/VisualizationJSONCombinedORM/std": 0.09937077015638351, "step": 2992, "train_speed(iter/s)": 0.135892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 288.3125, "completions/min_length": 226.0, "epoch": 2.4755996691480564, "grad_norm": 0.202856183052063, "kl": 0.033599853515625, "learning_rate": 9.024004017668181e-07, "loss": 0.00033583492040634155, "memory(GiB)": 38.13, "reward": 0.5146247148513794, "reward_std": 0.08482513576745987, "rewards/VisualizationJSONCombinedORM/mean": 0.5146247148513794, "rewards/VisualizationJSONCombinedORM/std": 0.2288023978471756, "step": 2993, "train_speed(iter/s)": 0.135785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 338.6875, "completions/min_length": 285.0, "epoch": 2.4764267990074442, "grad_norm": 0.16721242666244507, "kl": 0.0821533203125, "learning_rate": 8.996445001742871e-07, "loss": 0.0008218586444854736, "memory(GiB)": 38.13, "reward": 0.5424091815948486, "reward_std": 0.05180782824754715, "rewards/VisualizationJSONCombinedORM/mean": 0.5424091815948486, "rewards/VisualizationJSONCombinedORM/std": 0.1923762708902359, "step": 2994, "train_speed(iter/s)": 0.135634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 331.0625, "completions/min_length": 245.0, "epoch": 2.477253928866832, "grad_norm": 0.18541546165943146, "kl": 0.07269287109375, "learning_rate": 8.968923971586596e-07, "loss": 0.0007265433669090271, "memory(GiB)": 38.13, "reward": 0.6180801391601562, "reward_std": 0.06093835085630417, "rewards/VisualizationJSONCombinedORM/mean": 0.6180801391601562, "rewards/VisualizationJSONCombinedORM/std": 0.1149192750453949, "step": 2995, "train_speed(iter/s)": 0.135518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 296.375, "completions/min_length": 246.0, "epoch": 2.47808105872622, "grad_norm": 0.23008686304092407, "kl": 0.04302978515625, "learning_rate": 8.941440952694907e-07, "loss": 0.00042898207902908325, "memory(GiB)": 38.13, "reward": 0.4858834743499756, "reward_std": 0.016321629285812378, "rewards/VisualizationJSONCombinedORM/mean": 0.4858834743499756, "rewards/VisualizationJSONCombinedORM/std": 0.19477646052837372, "step": 2996, "train_speed(iter/s)": 0.13541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 324.0, "completions/min_length": 231.0, "epoch": 2.478908188585608, "grad_norm": 0.18375170230865479, "kl": 0.07574462890625, "learning_rate": 8.913995970528089e-07, "loss": 0.0007584318518638611, "memory(GiB)": 38.13, "reward": 0.6449266672134399, "reward_std": 0.0791582390666008, "rewards/VisualizationJSONCombinedORM/mean": 0.6449266672134399, "rewards/VisualizationJSONCombinedORM/std": 0.12219615280628204, "step": 2997, "train_speed(iter/s)": 0.135296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 290.1875, "completions/min_length": 238.0, "epoch": 2.479735318444996, "grad_norm": 0.17942671477794647, "kl": 0.1263427734375, "learning_rate": 8.886589050511257e-07, "loss": 0.0012616068124771118, "memory(GiB)": 38.13, "reward": 0.5284091234207153, "reward_std": 0.057168230414390564, "rewards/VisualizationJSONCombinedORM/mean": 0.5284091234207153, "rewards/VisualizationJSONCombinedORM/std": 0.2559683620929718, "step": 2998, "train_speed(iter/s)": 0.135172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 330.5625, "completions/min_length": 225.0, "epoch": 2.4805624483043838, "grad_norm": 0.17197301983833313, "kl": 0.13323974609375, "learning_rate": 8.859220218034192e-07, "loss": 0.0013301968574523926, "memory(GiB)": 38.13, "reward": 0.24303141236305237, "reward_std": 0.03101116046309471, "rewards/VisualizationJSONCombinedORM/mean": 0.24303141236305237, "rewards/VisualizationJSONCombinedORM/std": 0.03745996952056885, "step": 2999, "train_speed(iter/s)": 0.135035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 308.8125, "completions/min_length": 232.0, "epoch": 2.4813895781637716, "grad_norm": 0.1912061870098114, "kl": 0.049072265625, "learning_rate": 8.831889498451474e-07, "loss": 0.0004900135099887848, "memory(GiB)": 38.13, "reward": 0.34265589714050293, "reward_std": 0.046804480254650116, "rewards/VisualizationJSONCombinedORM/mean": 0.34265589714050293, "rewards/VisualizationJSONCombinedORM/std": 0.0463431142270565, "step": 3000, "train_speed(iter/s)": 0.134891 }, { "epoch": 2.4813895781637716, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 367.0416666666667, "eval_completions/mean_length": 305.6354166666667, "eval_completions/min_length": 256.25, "eval_kl": 0.08631388346354167, "eval_loss": 0.0008629883523099124, "eval_reward": 0.4739684046556552, "eval_reward_std": 0.059530892448189356, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4739684046556552, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05953089454366515, "eval_runtime": 312.341, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 307.3125, "completions/min_length": 257.0, "epoch": 2.4822167080231594, "grad_norm": 0.1514851152896881, "kl": 0.05072021484375, "learning_rate": 8.804596917082309e-07, "loss": 0.0005074813961982727, "memory(GiB)": 38.13, "reward": 0.5319541692733765, "reward_std": 0.03785104304552078, "rewards/VisualizationJSONCombinedORM/mean": 0.5319541692733765, "rewards/VisualizationJSONCombinedORM/std": 0.12515997886657715, "step": 3001, "train_speed(iter/s)": 0.13293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 301.5, "completions/min_length": 239.0, "epoch": 2.4830438378825477, "grad_norm": 0.169532909989357, "kl": 0.04315185546875, "learning_rate": 8.777342499210606e-07, "loss": 0.0004303343594074249, "memory(GiB)": 38.13, "reward": 0.4838907718658447, "reward_std": 0.0602528415620327, "rewards/VisualizationJSONCombinedORM/mean": 0.4838907718658447, "rewards/VisualizationJSONCombinedORM/std": 0.07810796797275543, "step": 3002, "train_speed(iter/s)": 0.132842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 295.375, "completions/min_length": 224.0, "epoch": 2.4838709677419355, "grad_norm": 0.19166915118694305, "kl": 0.0726318359375, "learning_rate": 8.750126270084891e-07, "loss": 0.0007242709398269653, "memory(GiB)": 38.13, "reward": 0.5880197882652283, "reward_std": 0.10272425413131714, "rewards/VisualizationJSONCombinedORM/mean": 0.5880197882652283, "rewards/VisualizationJSONCombinedORM/std": 0.11586980521678925, "step": 3003, "train_speed(iter/s)": 0.13271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 317.25, "completions/min_length": 249.0, "epoch": 2.4846980976013233, "grad_norm": 0.17536386847496033, "kl": 0.0791015625, "learning_rate": 8.72294825491834e-07, "loss": 0.0007901415228843689, "memory(GiB)": 38.13, "reward": 0.4986664950847626, "reward_std": 0.06736347079277039, "rewards/VisualizationJSONCombinedORM/mean": 0.4986664950847626, "rewards/VisualizationJSONCombinedORM/std": 0.06767010688781738, "step": 3004, "train_speed(iter/s)": 0.132587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 318.25, "completions/min_length": 251.0, "epoch": 2.485525227460711, "grad_norm": 0.22067125141620636, "kl": 0.04327392578125, "learning_rate": 8.695808478888712e-07, "loss": 0.0004328489303588867, "memory(GiB)": 38.13, "reward": 0.5246899127960205, "reward_std": 0.08718196302652359, "rewards/VisualizationJSONCombinedORM/mean": 0.5246899127960205, "rewards/VisualizationJSONCombinedORM/std": 0.10536009073257446, "step": 3005, "train_speed(iter/s)": 0.132474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 305.125, "completions/min_length": 246.0, "epoch": 2.4863523573200994, "grad_norm": 0.18359264731407166, "kl": 0.048553466796875, "learning_rate": 8.668706967138363e-07, "loss": 0.00048551708459854126, "memory(GiB)": 38.13, "reward": 0.6070671081542969, "reward_std": 0.08231740444898605, "rewards/VisualizationJSONCombinedORM/mean": 0.6070671081542969, "rewards/VisualizationJSONCombinedORM/std": 0.08241405338048935, "step": 3006, "train_speed(iter/s)": 0.132372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 316.25, "completions/min_length": 257.0, "epoch": 2.4871794871794872, "grad_norm": 0.1978289783000946, "kl": 0.104736328125, "learning_rate": 8.641643744774159e-07, "loss": 0.0010487958788871765, "memory(GiB)": 38.13, "reward": 0.6225265860557556, "reward_std": 0.09954468905925751, "rewards/VisualizationJSONCombinedORM/mean": 0.6225265860557556, "rewards/VisualizationJSONCombinedORM/std": 0.10051438212394714, "step": 3007, "train_speed(iter/s)": 0.132259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 326.5, "completions/min_length": 247.0, "epoch": 2.488006617038875, "grad_norm": 0.19228532910346985, "kl": 0.0479736328125, "learning_rate": 8.614618836867539e-07, "loss": 0.0004796236753463745, "memory(GiB)": 38.13, "reward": 0.4313603639602661, "reward_std": 0.05973702296614647, "rewards/VisualizationJSONCombinedORM/mean": 0.4313603639602661, "rewards/VisualizationJSONCombinedORM/std": 0.058236148208379745, "step": 3008, "train_speed(iter/s)": 0.132156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 321.9375, "completions/min_length": 234.0, "epoch": 2.488833746898263, "grad_norm": 0.17407339811325073, "kl": 0.041412353515625, "learning_rate": 8.587632268454405e-07, "loss": 0.00041473284363746643, "memory(GiB)": 38.13, "reward": 0.7549903392791748, "reward_std": 0.053290486335754395, "rewards/VisualizationJSONCombinedORM/mean": 0.7549903392791748, "rewards/VisualizationJSONCombinedORM/std": 0.06224840134382248, "step": 3009, "train_speed(iter/s)": 0.132038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 357.375, "completions/min_length": 257.0, "epoch": 2.489660876757651, "grad_norm": 0.22468647360801697, "kl": 0.1077880859375, "learning_rate": 8.56068406453518e-07, "loss": 0.0010807253420352936, "memory(GiB)": 38.13, "reward": 0.5349999666213989, "reward_std": 0.07919555902481079, "rewards/VisualizationJSONCombinedORM/mean": 0.5349999666213989, "rewards/VisualizationJSONCombinedORM/std": 0.1396992802619934, "step": 3010, "train_speed(iter/s)": 0.131906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 328.6875, "completions/min_length": 258.0, "epoch": 2.490488006617039, "grad_norm": 0.17051327228546143, "kl": 0.110107421875, "learning_rate": 8.533774250074727e-07, "loss": 0.0010996232740581036, "memory(GiB)": 38.13, "reward": 0.5234788060188293, "reward_std": 0.06619996577501297, "rewards/VisualizationJSONCombinedORM/mean": 0.5234788060188293, "rewards/VisualizationJSONCombinedORM/std": 0.11756075918674469, "step": 3011, "train_speed(iter/s)": 0.131811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 292.4375, "completions/min_length": 245.0, "epoch": 2.4913151364764268, "grad_norm": 0.1965027153491974, "kl": 0.088623046875, "learning_rate": 8.506902850002358e-07, "loss": 0.0008851960301399231, "memory(GiB)": 38.13, "reward": 0.7122308015823364, "reward_std": 0.06750067323446274, "rewards/VisualizationJSONCombinedORM/mean": 0.7122308015823364, "rewards/VisualizationJSONCombinedORM/std": 0.08716153353452682, "step": 3012, "train_speed(iter/s)": 0.131689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 293.9375, "completions/min_length": 235.0, "epoch": 2.4921422663358146, "grad_norm": 0.333593487739563, "kl": 0.1978759765625, "learning_rate": 8.480069889211767e-07, "loss": 0.001978740096092224, "memory(GiB)": 38.13, "reward": 0.5410192012786865, "reward_std": 0.11005011200904846, "rewards/VisualizationJSONCombinedORM/mean": 0.5410192012786865, "rewards/VisualizationJSONCombinedORM/std": 0.11479071527719498, "step": 3013, "train_speed(iter/s)": 0.131593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 309.125, "completions/min_length": 239.0, "epoch": 2.4929693961952024, "grad_norm": 0.20015107095241547, "kl": 0.0816650390625, "learning_rate": 8.453275392561083e-07, "loss": 0.0008171088993549347, "memory(GiB)": 38.13, "reward": 0.44362154603004456, "reward_std": 0.06886116415262222, "rewards/VisualizationJSONCombinedORM/mean": 0.44362154603004456, "rewards/VisualizationJSONCombinedORM/std": 0.18795478343963623, "step": 3014, "train_speed(iter/s)": 0.131495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 294.0, "completions/min_length": 229.0, "epoch": 2.4937965260545907, "grad_norm": 0.2140573412179947, "kl": 0.0826416015625, "learning_rate": 8.426519384872733e-07, "loss": 0.0008242540061473846, "memory(GiB)": 38.13, "reward": 0.3582513928413391, "reward_std": 0.03263291344046593, "rewards/VisualizationJSONCombinedORM/mean": 0.3582513928413391, "rewards/VisualizationJSONCombinedORM/std": 0.14712271094322205, "step": 3015, "train_speed(iter/s)": 0.131401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 327.375, "completions/min_length": 255.0, "epoch": 2.4946236559139785, "grad_norm": 0.15716125071048737, "kl": 0.10205078125, "learning_rate": 8.399801890933579e-07, "loss": 0.0010219179093837738, "memory(GiB)": 38.13, "reward": 0.6073334217071533, "reward_std": 0.07323621958494186, "rewards/VisualizationJSONCombinedORM/mean": 0.6073334217071533, "rewards/VisualizationJSONCombinedORM/std": 0.19364866614341736, "step": 3016, "train_speed(iter/s)": 0.131321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 296.0, "completions/min_length": 218.0, "epoch": 2.4954507857733663, "grad_norm": 0.19148023426532745, "kl": 0.0491943359375, "learning_rate": 8.373122935494749e-07, "loss": 0.0004918128252029419, "memory(GiB)": 38.13, "reward": 0.385658860206604, "reward_std": 0.05340081453323364, "rewards/VisualizationJSONCombinedORM/mean": 0.385658860206604, "rewards/VisualizationJSONCombinedORM/std": 0.09661880135536194, "step": 3017, "train_speed(iter/s)": 0.131216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 307.25, "completions/min_length": 238.0, "epoch": 2.4962779156327546, "grad_norm": 0.22938983142375946, "kl": 0.11297607421875, "learning_rate": 8.346482543271656e-07, "loss": 0.0011296272277832031, "memory(GiB)": 38.13, "reward": 0.5478272438049316, "reward_std": 0.10507111251354218, "rewards/VisualizationJSONCombinedORM/mean": 0.5478272438049316, "rewards/VisualizationJSONCombinedORM/std": 0.13146989047527313, "step": 3018, "train_speed(iter/s)": 0.131085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 317.6875, "completions/min_length": 217.0, "epoch": 2.4971050454921424, "grad_norm": 0.18368886411190033, "kl": 0.0911865234375, "learning_rate": 8.31988073894403e-07, "loss": 0.0009137727320194244, "memory(GiB)": 38.13, "reward": 0.49892693758010864, "reward_std": 0.05103904753923416, "rewards/VisualizationJSONCombinedORM/mean": 0.49892693758010864, "rewards/VisualizationJSONCombinedORM/std": 0.2091047465801239, "step": 3019, "train_speed(iter/s)": 0.130978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 286.0625, "completions/min_length": 234.0, "epoch": 2.4979321753515302, "grad_norm": 0.19222965836524963, "kl": 0.046630859375, "learning_rate": 8.293317547155816e-07, "loss": 0.00046701356768608093, "memory(GiB)": 38.13, "reward": 0.4076310396194458, "reward_std": 0.05374070629477501, "rewards/VisualizationJSONCombinedORM/mean": 0.4076310396194458, "rewards/VisualizationJSONCombinedORM/std": 0.1302127093076706, "step": 3020, "train_speed(iter/s)": 0.130881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 309.875, "completions/min_length": 213.0, "epoch": 2.498759305210918, "grad_norm": 0.18352963030338287, "kl": 0.0966796875, "learning_rate": 8.266792992515199e-07, "loss": 0.0009676367044448853, "memory(GiB)": 38.13, "reward": 0.5701190233230591, "reward_std": 0.06752505898475647, "rewards/VisualizationJSONCombinedORM/mean": 0.5701190233230591, "rewards/VisualizationJSONCombinedORM/std": 0.1422225534915924, "step": 3021, "train_speed(iter/s)": 0.130791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 291.625, "completions/min_length": 237.0, "epoch": 2.499586435070306, "grad_norm": 0.1942024677991867, "kl": 0.08721923828125, "learning_rate": 8.240307099594591e-07, "loss": 0.0008721053600311279, "memory(GiB)": 38.13, "reward": 0.38925328850746155, "reward_std": 0.04940987750887871, "rewards/VisualizationJSONCombinedORM/mean": 0.38925328850746155, "rewards/VisualizationJSONCombinedORM/std": 0.10832951217889786, "step": 3022, "train_speed(iter/s)": 0.130709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 308.3125, "completions/min_length": 228.0, "epoch": 2.500413564929694, "grad_norm": 0.16536319255828857, "kl": 0.07220458984375, "learning_rate": 8.213859892930581e-07, "loss": 0.0007219240069389343, "memory(GiB)": 38.13, "reward": 0.7539508938789368, "reward_std": 0.0789370983839035, "rewards/VisualizationJSONCombinedORM/mean": 0.7539508938789368, "rewards/VisualizationJSONCombinedORM/std": 0.08397993445396423, "step": 3023, "train_speed(iter/s)": 0.130589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 296.3125, "completions/min_length": 244.0, "epoch": 2.501240694789082, "grad_norm": 0.16340866684913635, "kl": 0.0465087890625, "learning_rate": 8.187451397023877e-07, "loss": 0.00046388059854507446, "memory(GiB)": 38.13, "reward": 0.3649495840072632, "reward_std": 0.03517830744385719, "rewards/VisualizationJSONCombinedORM/mean": 0.3649495840072632, "rewards/VisualizationJSONCombinedORM/std": 0.11113394051790237, "step": 3024, "train_speed(iter/s)": 0.130506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 303.875, "completions/min_length": 222.0, "epoch": 2.5020678246484698, "grad_norm": 0.16867037117481232, "kl": 0.100830078125, "learning_rate": 8.161081636339397e-07, "loss": 0.0010120980441570282, "memory(GiB)": 38.13, "reward": 0.33715692162513733, "reward_std": 0.03468047082424164, "rewards/VisualizationJSONCombinedORM/mean": 0.33715692162513733, "rewards/VisualizationJSONCombinedORM/std": 0.12614315748214722, "step": 3025, "train_speed(iter/s)": 0.1304 }, { "epoch": 2.5020678246484698, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 349.9166666666667, "eval_completions/mean_length": 293.7708333333333, "eval_completions/min_length": 247.20833333333334, "eval_kl": 0.0765380859375, "eval_loss": 0.0007697927649132907, "eval_reward": 0.46936014605065185, "eval_reward_std": 0.06264603304831932, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46936014605065185, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06264603498857468, "eval_runtime": 301.9029, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 3025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 316.375, "completions/min_length": 241.0, "epoch": 2.5028949545078576, "grad_norm": 0.15046440064907074, "kl": 0.0377197265625, "learning_rate": 8.134750635306099e-07, "loss": 0.00037739798426628113, "memory(GiB)": 38.13, "reward": 0.6602951288223267, "reward_std": 0.02630225196480751, "rewards/VisualizationJSONCombinedORM/mean": 0.6602951288223267, "rewards/VisualizationJSONCombinedORM/std": 0.15675939619541168, "step": 3026, "train_speed(iter/s)": 0.128624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 305.25, "completions/min_length": 233.0, "epoch": 2.5037220843672454, "grad_norm": 0.18624377250671387, "kl": 0.1375732421875, "learning_rate": 8.108458418317089e-07, "loss": 0.0013736635446548462, "memory(GiB)": 38.13, "reward": 0.4757688641548157, "reward_std": 0.061813294887542725, "rewards/VisualizationJSONCombinedORM/mean": 0.4757688641548157, "rewards/VisualizationJSONCombinedORM/std": 0.06593462079763412, "step": 3027, "train_speed(iter/s)": 0.128507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 308.875, "completions/min_length": 239.0, "epoch": 2.5045492142266337, "grad_norm": 0.18652723729610443, "kl": 0.0396728515625, "learning_rate": 8.082205009729521e-07, "loss": 0.0003971010446548462, "memory(GiB)": 38.13, "reward": 0.5530470609664917, "reward_std": 0.05213797837495804, "rewards/VisualizationJSONCombinedORM/mean": 0.5530470609664917, "rewards/VisualizationJSONCombinedORM/std": 0.07058722525835037, "step": 3028, "train_speed(iter/s)": 0.128421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 296.625, "completions/min_length": 264.0, "epoch": 2.5053763440860215, "grad_norm": 0.18104249238967896, "kl": 0.047119140625, "learning_rate": 8.055990433864618e-07, "loss": 0.0004719756543636322, "memory(GiB)": 38.13, "reward": 0.5814958214759827, "reward_std": 0.06134096533060074, "rewards/VisualizationJSONCombinedORM/mean": 0.5814958214759827, "rewards/VisualizationJSONCombinedORM/std": 0.295352965593338, "step": 3029, "train_speed(iter/s)": 0.128318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 309.0, "completions/min_length": 230.0, "epoch": 2.5062034739454093, "grad_norm": 0.20483918488025665, "kl": 0.1234130859375, "learning_rate": 8.029814715007589e-07, "loss": 0.0012378618121147156, "memory(GiB)": 38.13, "reward": 0.3717941641807556, "reward_std": 0.04268704354763031, "rewards/VisualizationJSONCombinedORM/mean": 0.3717941641807556, "rewards/VisualizationJSONCombinedORM/std": 0.1030203327536583, "step": 3030, "train_speed(iter/s)": 0.128215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 296.3125, "completions/min_length": 260.0, "epoch": 2.5070306038047976, "grad_norm": 0.1927289217710495, "kl": 0.10107421875, "learning_rate": 8.00367787740769e-07, "loss": 0.001010395586490631, "memory(GiB)": 38.13, "reward": 0.43831902742385864, "reward_std": 0.06283735483884811, "rewards/VisualizationJSONCombinedORM/mean": 0.43831902742385864, "rewards/VisualizationJSONCombinedORM/std": 0.21259677410125732, "step": 3031, "train_speed(iter/s)": 0.128133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 322.1875, "completions/min_length": 245.0, "epoch": 2.5078577336641854, "grad_norm": 0.18206948041915894, "kl": 0.09100341796875, "learning_rate": 7.977579945278091e-07, "loss": 0.000908244401216507, "memory(GiB)": 38.13, "reward": 0.5261138081550598, "reward_std": 0.05428388714790344, "rewards/VisualizationJSONCombinedORM/mean": 0.5261138081550598, "rewards/VisualizationJSONCombinedORM/std": 0.05529317259788513, "step": 3032, "train_speed(iter/s)": 0.128057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 293.125, "completions/min_length": 228.0, "epoch": 2.5086848635235732, "grad_norm": 0.1688825637102127, "kl": 0.059326171875, "learning_rate": 7.951520942796026e-07, "loss": 0.000593092292547226, "memory(GiB)": 38.13, "reward": 0.6470600962638855, "reward_std": 0.0790875256061554, "rewards/VisualizationJSONCombinedORM/mean": 0.6470600962638855, "rewards/VisualizationJSONCombinedORM/std": 0.14015065133571625, "step": 3033, "train_speed(iter/s)": 0.127957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 315.75, "completions/min_length": 241.0, "epoch": 2.509511993382961, "grad_norm": 0.21428923308849335, "kl": 0.06158447265625, "learning_rate": 7.925500894102561e-07, "loss": 0.0006156377494335175, "memory(GiB)": 38.13, "reward": 0.6020129323005676, "reward_std": 0.0487801656126976, "rewards/VisualizationJSONCombinedORM/mean": 0.6020129323005676, "rewards/VisualizationJSONCombinedORM/std": 0.16483396291732788, "step": 3034, "train_speed(iter/s)": 0.127862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 285.5625, "completions/min_length": 228.0, "epoch": 2.510339123242349, "grad_norm": 0.2095198631286621, "kl": 0.07586669921875, "learning_rate": 7.899519823302743e-07, "loss": 0.0007596816867589951, "memory(GiB)": 38.13, "reward": 0.4117669463157654, "reward_std": 0.03686852753162384, "rewards/VisualizationJSONCombinedORM/mean": 0.4117669463157654, "rewards/VisualizationJSONCombinedORM/std": 0.14497524499893188, "step": 3035, "train_speed(iter/s)": 0.127786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 303.25, "completions/min_length": 265.0, "epoch": 2.511166253101737, "grad_norm": 0.22635748982429504, "kl": 0.07122802734375, "learning_rate": 7.873577754465456e-07, "loss": 0.0007119178771972656, "memory(GiB)": 38.13, "reward": 0.3424927592277527, "reward_std": 0.03770359233021736, "rewards/VisualizationJSONCombinedORM/mean": 0.3424927592277527, "rewards/VisualizationJSONCombinedORM/std": 0.03997686132788658, "step": 3036, "train_speed(iter/s)": 0.127687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 311.875, "completions/min_length": 246.0, "epoch": 2.511993382961125, "grad_norm": 0.17394275963306427, "kl": 0.0421142578125, "learning_rate": 7.847674711623499e-07, "loss": 0.00042116641998291016, "memory(GiB)": 38.13, "reward": 0.5693755745887756, "reward_std": 0.06618370115756989, "rewards/VisualizationJSONCombinedORM/mean": 0.5693755745887756, "rewards/VisualizationJSONCombinedORM/std": 0.07081955671310425, "step": 3037, "train_speed(iter/s)": 0.127593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 297.625, "completions/min_length": 249.0, "epoch": 2.5128205128205128, "grad_norm": 0.18758833408355713, "kl": 0.07122802734375, "learning_rate": 7.821810718773493e-07, "loss": 0.0007117427885532379, "memory(GiB)": 38.13, "reward": 0.61566162109375, "reward_std": 0.09151868522167206, "rewards/VisualizationJSONCombinedORM/mean": 0.61566162109375, "rewards/VisualizationJSONCombinedORM/std": 0.12058570981025696, "step": 3038, "train_speed(iter/s)": 0.1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 320.8125, "completions/min_length": 247.0, "epoch": 2.5136476426799006, "grad_norm": 0.2064148187637329, "kl": 0.0994873046875, "learning_rate": 7.7959857998759e-07, "loss": 0.0009945966303348541, "memory(GiB)": 38.13, "reward": 0.568882942199707, "reward_std": 0.07377800345420837, "rewards/VisualizationJSONCombinedORM/mean": 0.568882942199707, "rewards/VisualizationJSONCombinedORM/std": 0.10361846536397934, "step": 3039, "train_speed(iter/s)": 0.127396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 318.5, "completions/min_length": 261.0, "epoch": 2.5144747725392884, "grad_norm": 0.19416017830371857, "kl": 0.045196533203125, "learning_rate": 7.770199978854947e-07, "loss": 0.00045168399810791016, "memory(GiB)": 38.13, "reward": 0.4943969249725342, "reward_std": 0.05173877254128456, "rewards/VisualizationJSONCombinedORM/mean": 0.4943969249725342, "rewards/VisualizationJSONCombinedORM/std": 0.1217930018901825, "step": 3040, "train_speed(iter/s)": 0.127314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 283.1875, "completions/min_length": 219.0, "epoch": 2.5153019023986767, "grad_norm": 0.17422370612621307, "kl": 0.1173095703125, "learning_rate": 7.744453279598702e-07, "loss": 0.0011747181415557861, "memory(GiB)": 38.13, "reward": 0.38924622535705566, "reward_std": 0.04691290855407715, "rewards/VisualizationJSONCombinedORM/mean": 0.38924622535705566, "rewards/VisualizationJSONCombinedORM/std": 0.061008863151073456, "step": 3041, "train_speed(iter/s)": 0.127196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 314.0, "completions/min_length": 244.0, "epoch": 2.5161290322580645, "grad_norm": 0.18521441519260406, "kl": 0.07928466796875, "learning_rate": 7.718745725958914e-07, "loss": 0.0007927343249320984, "memory(GiB)": 38.13, "reward": 0.5313810110092163, "reward_std": 0.053720131516456604, "rewards/VisualizationJSONCombinedORM/mean": 0.5313810110092163, "rewards/VisualizationJSONCombinedORM/std": 0.16578751802444458, "step": 3042, "train_speed(iter/s)": 0.127089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 331.625, "completions/min_length": 275.0, "epoch": 2.5169561621174523, "grad_norm": 0.20285053551197052, "kl": 0.1361083984375, "learning_rate": 7.693077341751138e-07, "loss": 0.0013615228235721588, "memory(GiB)": 38.13, "reward": 0.6878267526626587, "reward_std": 0.0756077915430069, "rewards/VisualizationJSONCombinedORM/mean": 0.6878267526626587, "rewards/VisualizationJSONCombinedORM/std": 0.08820472657680511, "step": 3043, "train_speed(iter/s)": 0.126987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 302.875, "completions/min_length": 247.0, "epoch": 2.5177832919768406, "grad_norm": 0.1742679327726364, "kl": 0.043212890625, "learning_rate": 7.667448150754598e-07, "loss": 0.0004319809377193451, "memory(GiB)": 38.13, "reward": 0.6071762442588806, "reward_std": 0.05158127844333649, "rewards/VisualizationJSONCombinedORM/mean": 0.6071762442588806, "rewards/VisualizationJSONCombinedORM/std": 0.12773984670639038, "step": 3044, "train_speed(iter/s)": 0.126898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 293.1875, "completions/min_length": 235.0, "epoch": 2.5186104218362284, "grad_norm": 0.20140092074871063, "kl": 0.082763671875, "learning_rate": 7.641858176712241e-07, "loss": 0.0008283518254756927, "memory(GiB)": 38.13, "reward": 0.4988086521625519, "reward_std": 0.0752129852771759, "rewards/VisualizationJSONCombinedORM/mean": 0.4988086521625519, "rewards/VisualizationJSONCombinedORM/std": 0.14236709475517273, "step": 3045, "train_speed(iter/s)": 0.126791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 290.375, "completions/min_length": 239.0, "epoch": 2.5194375516956162, "grad_norm": 0.20959730446338654, "kl": 0.08209228515625, "learning_rate": 7.616307443330645e-07, "loss": 0.0008195266127586365, "memory(GiB)": 38.13, "reward": 0.5340718030929565, "reward_std": 0.07514229416847229, "rewards/VisualizationJSONCombinedORM/mean": 0.5340718030929565, "rewards/VisualizationJSONCombinedORM/std": 0.16291803121566772, "step": 3046, "train_speed(iter/s)": 0.126699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 323.625, "completions/min_length": 260.0, "epoch": 2.520264681555004, "grad_norm": 0.27947998046875, "kl": 0.0975341796875, "learning_rate": 7.590795974280079e-07, "loss": 0.0009761489927768707, "memory(GiB)": 38.13, "reward": 0.7223668098449707, "reward_std": 0.1108781099319458, "rewards/VisualizationJSONCombinedORM/mean": 0.7223668098449707, "rewards/VisualizationJSONCombinedORM/std": 0.11087346076965332, "step": 3047, "train_speed(iter/s)": 0.126599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 311.125, "completions/min_length": 239.0, "epoch": 2.521091811414392, "grad_norm": 0.18154789507389069, "kl": 0.06134033203125, "learning_rate": 7.565323793194373e-07, "loss": 0.0006127241067588329, "memory(GiB)": 38.13, "reward": 0.5158703327178955, "reward_std": 0.037872932851314545, "rewards/VisualizationJSONCombinedORM/mean": 0.5158703327178955, "rewards/VisualizationJSONCombinedORM/std": 0.055463362485170364, "step": 3048, "train_speed(iter/s)": 0.126496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 301.125, "completions/min_length": 230.0, "epoch": 2.52191894127378, "grad_norm": 0.18348093330860138, "kl": 0.06524658203125, "learning_rate": 7.539890923671061e-07, "loss": 0.0006526224315166473, "memory(GiB)": 38.13, "reward": 0.5706345438957214, "reward_std": 0.04070284590125084, "rewards/VisualizationJSONCombinedORM/mean": 0.5706345438957214, "rewards/VisualizationJSONCombinedORM/std": 0.07498958706855774, "step": 3049, "train_speed(iter/s)": 0.126379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 324.5, "completions/min_length": 274.0, "epoch": 2.522746071133168, "grad_norm": 0.20872896909713745, "kl": 0.050048828125, "learning_rate": 7.514497389271153e-07, "loss": 0.0005002263933420181, "memory(GiB)": 38.13, "reward": 0.5755210518836975, "reward_std": 0.05043283477425575, "rewards/VisualizationJSONCombinedORM/mean": 0.5755210518836975, "rewards/VisualizationJSONCombinedORM/std": 0.19942854344844818, "step": 3050, "train_speed(iter/s)": 0.126248 }, { "epoch": 2.522746071133168, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 356.2916666666667, "eval_completions/mean_length": 295.9791666666667, "eval_completions/min_length": 245.75, "eval_kl": 0.07121785481770833, "eval_loss": 0.000711654603946954, "eval_reward": 0.46737113781273365, "eval_reward_std": 0.0670681893825531, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46737113781273365, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06706819202130039, "eval_runtime": 307.458, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 314.8125, "completions/min_length": 229.0, "epoch": 2.5235732009925558, "grad_norm": 0.20545552670955658, "kl": 0.09423828125, "learning_rate": 7.489143213519301e-07, "loss": 0.0009424630552530289, "memory(GiB)": 38.13, "reward": 0.5801428556442261, "reward_std": 0.08382071554660797, "rewards/VisualizationJSONCombinedORM/mean": 0.5801428556442261, "rewards/VisualizationJSONCombinedORM/std": 0.08528873324394226, "step": 3051, "train_speed(iter/s)": 0.124565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 299.8125, "completions/min_length": 247.0, "epoch": 2.5244003308519436, "grad_norm": 0.18967758119106293, "kl": 0.0604248046875, "learning_rate": 7.46382841990363e-07, "loss": 0.000603795051574707, "memory(GiB)": 38.13, "reward": 0.7446874976158142, "reward_std": 0.09078555554151535, "rewards/VisualizationJSONCombinedORM/mean": 0.7446874976158142, "rewards/VisualizationJSONCombinedORM/std": 0.0953134298324585, "step": 3052, "train_speed(iter/s)": 0.124481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 288.25, "completions/min_length": 219.0, "epoch": 2.5252274607113314, "grad_norm": 0.20761863887310028, "kl": 0.06610107421875, "learning_rate": 7.438553031875823e-07, "loss": 0.000661022961139679, "memory(GiB)": 38.13, "reward": 0.7392174005508423, "reward_std": 0.1230730414390564, "rewards/VisualizationJSONCombinedORM/mean": 0.7392174005508423, "rewards/VisualizationJSONCombinedORM/std": 0.12202911078929901, "step": 3053, "train_speed(iter/s)": 0.124374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 312.0, "completions/min_length": 235.0, "epoch": 2.5260545905707197, "grad_norm": 0.17298485338687897, "kl": 0.0926513671875, "learning_rate": 7.413317072851051e-07, "loss": 0.0009285062551498413, "memory(GiB)": 38.13, "reward": 0.6221084594726562, "reward_std": 0.0688522458076477, "rewards/VisualizationJSONCombinedORM/mean": 0.6221084594726562, "rewards/VisualizationJSONCombinedORM/std": 0.10876400768756866, "step": 3054, "train_speed(iter/s)": 0.124253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 309.875, "completions/min_length": 261.0, "epoch": 2.5268817204301075, "grad_norm": 0.17646150290966034, "kl": 0.048095703125, "learning_rate": 7.388120566207957e-07, "loss": 0.00048182159662246704, "memory(GiB)": 38.13, "reward": 0.3884798288345337, "reward_std": 0.027932751923799515, "rewards/VisualizationJSONCombinedORM/mean": 0.3884798288345337, "rewards/VisualizationJSONCombinedORM/std": 0.048316530883312225, "step": 3055, "train_speed(iter/s)": 0.124141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 305.0, "completions/min_length": 235.0, "epoch": 2.5277088502894953, "grad_norm": 0.18561400473117828, "kl": 0.0716552734375, "learning_rate": 7.362963535288614e-07, "loss": 0.0007165968418121338, "memory(GiB)": 38.13, "reward": 0.6044408082962036, "reward_std": 0.08274942636489868, "rewards/VisualizationJSONCombinedORM/mean": 0.6044408082962036, "rewards/VisualizationJSONCombinedORM/std": 0.08411922305822372, "step": 3056, "train_speed(iter/s)": 0.124062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 289.0625, "completions/min_length": 224.0, "epoch": 2.5285359801488836, "grad_norm": 0.18567654490470886, "kl": 0.1195068359375, "learning_rate": 7.337846003398568e-07, "loss": 0.0011944454163312912, "memory(GiB)": 38.13, "reward": 0.32528144121170044, "reward_std": 0.03757733851671219, "rewards/VisualizationJSONCombinedORM/mean": 0.32528144121170044, "rewards/VisualizationJSONCombinedORM/std": 0.06468884646892548, "step": 3057, "train_speed(iter/s)": 0.123947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 323.0, "completions/min_length": 260.0, "epoch": 2.5293631100082714, "grad_norm": 0.17616333067417145, "kl": 0.028900146484375, "learning_rate": 7.312767993806713e-07, "loss": 0.0002888590097427368, "memory(GiB)": 38.13, "reward": 0.6752192974090576, "reward_std": 0.060947269201278687, "rewards/VisualizationJSONCombinedORM/mean": 0.6752192974090576, "rewards/VisualizationJSONCombinedORM/std": 0.11344955861568451, "step": 3058, "train_speed(iter/s)": 0.123835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 293.875, "completions/min_length": 228.0, "epoch": 2.5301902398676592, "grad_norm": 0.18350689113140106, "kl": 0.0936279296875, "learning_rate": 7.287729529745386e-07, "loss": 0.0009368881583213806, "memory(GiB)": 38.13, "reward": 0.547511875629425, "reward_std": 0.05551927536725998, "rewards/VisualizationJSONCombinedORM/mean": 0.547511875629425, "rewards/VisualizationJSONCombinedORM/std": 0.14107176661491394, "step": 3059, "train_speed(iter/s)": 0.123755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 307.8125, "completions/min_length": 251.0, "epoch": 2.531017369727047, "grad_norm": 0.1869773119688034, "kl": 0.07342529296875, "learning_rate": 7.262730634410259e-07, "loss": 0.0007344186305999756, "memory(GiB)": 38.13, "reward": 0.46975845098495483, "reward_std": 0.04836561903357506, "rewards/VisualizationJSONCombinedORM/mean": 0.46975845098495483, "rewards/VisualizationJSONCombinedORM/std": 0.196226105093956, "step": 3060, "train_speed(iter/s)": 0.123638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 322.6875, "completions/min_length": 242.0, "epoch": 2.531844499586435, "grad_norm": 0.21480712294578552, "kl": 0.1126708984375, "learning_rate": 7.237771330960369e-07, "loss": 0.0011255815625190735, "memory(GiB)": 38.13, "reward": 0.3252108097076416, "reward_std": 0.03666134178638458, "rewards/VisualizationJSONCombinedORM/mean": 0.3252108097076416, "rewards/VisualizationJSONCombinedORM/std": 0.03802530840039253, "step": 3061, "train_speed(iter/s)": 0.123564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 297.75, "completions/min_length": 239.0, "epoch": 2.532671629445823, "grad_norm": 0.20197054743766785, "kl": 0.06097412109375, "learning_rate": 7.212851642518043e-07, "loss": 0.0006094127893447876, "memory(GiB)": 38.13, "reward": 0.6256784796714783, "reward_std": 0.11082576215267181, "rewards/VisualizationJSONCombinedORM/mean": 0.6256784796714783, "rewards/VisualizationJSONCombinedORM/std": 0.12533977627754211, "step": 3062, "train_speed(iter/s)": 0.123481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 311.875, "completions/min_length": 227.0, "epoch": 2.533498759305211, "grad_norm": 0.1952148675918579, "kl": 0.032867431640625, "learning_rate": 7.187971592168936e-07, "loss": 0.0003284178674221039, "memory(GiB)": 38.13, "reward": 0.6742585301399231, "reward_std": 0.08287197351455688, "rewards/VisualizationJSONCombinedORM/mean": 0.6742585301399231, "rewards/VisualizationJSONCombinedORM/std": 0.09900148957967758, "step": 3063, "train_speed(iter/s)": 0.123399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 260.1875, "completions/min_length": 193.0, "epoch": 2.5343258891645988, "grad_norm": 0.18297356367111206, "kl": 0.07470703125, "learning_rate": 7.163131202961948e-07, "loss": 0.0007494315505027771, "memory(GiB)": 38.13, "reward": 0.6817008852958679, "reward_std": 0.07773127406835556, "rewards/VisualizationJSONCombinedORM/mean": 0.6817008852958679, "rewards/VisualizationJSONCombinedORM/std": 0.12502175569534302, "step": 3064, "train_speed(iter/s)": 0.123295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 285.1875, "completions/min_length": 237.0, "epoch": 2.535153019023987, "grad_norm": 0.19820961356163025, "kl": 0.0526123046875, "learning_rate": 7.138330497909308e-07, "loss": 0.0005251839756965637, "memory(GiB)": 38.13, "reward": 0.6135650277137756, "reward_std": 0.06748431921005249, "rewards/VisualizationJSONCombinedORM/mean": 0.6135650277137756, "rewards/VisualizationJSONCombinedORM/std": 0.15940625965595245, "step": 3065, "train_speed(iter/s)": 0.123206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 298.9375, "completions/min_length": 250.0, "epoch": 2.5359801488833744, "grad_norm": 0.1457854062318802, "kl": 0.06005859375, "learning_rate": 7.113569499986401e-07, "loss": 0.0006015300750732422, "memory(GiB)": 38.13, "reward": 0.3510572612285614, "reward_std": 0.035545870661735535, "rewards/VisualizationJSONCombinedORM/mean": 0.3510572612285614, "rewards/VisualizationJSONCombinedORM/std": 0.0473928339779377, "step": 3066, "train_speed(iter/s)": 0.123124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 326.0, "completions/min_length": 253.0, "epoch": 2.5368072787427627, "grad_norm": 0.26803845167160034, "kl": 0.0877685546875, "learning_rate": 7.088848232131862e-07, "loss": 0.0008772611618041992, "memory(GiB)": 38.13, "reward": 0.45463159680366516, "reward_std": 0.07678211480379105, "rewards/VisualizationJSONCombinedORM/mean": 0.45463159680366516, "rewards/VisualizationJSONCombinedORM/std": 0.08431360125541687, "step": 3067, "train_speed(iter/s)": 0.123028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 277.0, "completions/min_length": 219.0, "epoch": 2.5376344086021505, "grad_norm": 0.23526310920715332, "kl": 0.0933837890625, "learning_rate": 7.064166717247545e-07, "loss": 0.0009349361062049866, "memory(GiB)": 38.13, "reward": 0.5228421688079834, "reward_std": 0.10871659964323044, "rewards/VisualizationJSONCombinedORM/mean": 0.5228421688079834, "rewards/VisualizationJSONCombinedORM/std": 0.14595438539981842, "step": 3068, "train_speed(iter/s)": 0.122962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 276.3125, "completions/min_length": 227.0, "epoch": 2.5384615384615383, "grad_norm": 0.217523455619812, "kl": 0.06585693359375, "learning_rate": 7.039524978198414e-07, "loss": 0.0006578750908374786, "memory(GiB)": 38.13, "reward": 0.7311602234840393, "reward_std": 0.09987017512321472, "rewards/VisualizationJSONCombinedORM/mean": 0.7311602234840393, "rewards/VisualizationJSONCombinedORM/std": 0.10928382724523544, "step": 3069, "train_speed(iter/s)": 0.122867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 282.875, "completions/min_length": 202.0, "epoch": 2.5392886683209266, "grad_norm": 0.20015063881874084, "kl": 0.06463623046875, "learning_rate": 7.014923037812649e-07, "loss": 0.0006468892097473145, "memory(GiB)": 38.13, "reward": 0.43718570470809937, "reward_std": 0.08909761905670166, "rewards/VisualizationJSONCombinedORM/mean": 0.43718570470809937, "rewards/VisualizationJSONCombinedORM/std": 0.12401842325925827, "step": 3070, "train_speed(iter/s)": 0.122768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 327.9375, "completions/min_length": 263.0, "epoch": 2.5401157981803144, "grad_norm": 0.1792948991060257, "kl": 0.05078125, "learning_rate": 6.990360918881517e-07, "loss": 0.000508161261677742, "memory(GiB)": 38.13, "reward": 0.6560584306716919, "reward_std": 0.06487612426280975, "rewards/VisualizationJSONCombinedORM/mean": 0.6560584306716919, "rewards/VisualizationJSONCombinedORM/std": 0.2438260167837143, "step": 3071, "train_speed(iter/s)": 0.122678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 268.4375, "completions/min_length": 216.0, "epoch": 2.5409429280397022, "grad_norm": 0.17397861182689667, "kl": 0.1474609375, "learning_rate": 6.965838644159434e-07, "loss": 0.0014742985367774963, "memory(GiB)": 38.13, "reward": 0.453814297914505, "reward_std": 0.039159130305051804, "rewards/VisualizationJSONCombinedORM/mean": 0.453814297914505, "rewards/VisualizationJSONCombinedORM/std": 0.22637997567653656, "step": 3072, "train_speed(iter/s)": 0.122598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 282.875, "completions/min_length": 232.0, "epoch": 2.54177005789909, "grad_norm": 0.18434055149555206, "kl": 0.049774169921875, "learning_rate": 6.94135623636385e-07, "loss": 0.0004972629249095917, "memory(GiB)": 38.13, "reward": 0.5476127862930298, "reward_std": 0.09260565787553787, "rewards/VisualizationJSONCombinedORM/mean": 0.5476127862930298, "rewards/VisualizationJSONCombinedORM/std": 0.15736442804336548, "step": 3073, "train_speed(iter/s)": 0.122534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 300.0, "completions/min_length": 222.0, "epoch": 2.542597187758478, "grad_norm": 0.1790745109319687, "kl": 0.033447265625, "learning_rate": 6.916913718175339e-07, "loss": 0.00033526867628097534, "memory(GiB)": 38.13, "reward": 0.6986610293388367, "reward_std": 0.07064193487167358, "rewards/VisualizationJSONCombinedORM/mean": 0.6986610293388367, "rewards/VisualizationJSONCombinedORM/std": 0.1696433573961258, "step": 3074, "train_speed(iter/s)": 0.122447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 289.875, "completions/min_length": 241.0, "epoch": 2.543424317617866, "grad_norm": 0.2980496287345886, "kl": 0.058349609375, "learning_rate": 6.892511112237472e-07, "loss": 0.0005830861628055573, "memory(GiB)": 38.13, "reward": 0.46429190039634705, "reward_std": 0.08643635362386703, "rewards/VisualizationJSONCombinedORM/mean": 0.46429190039634705, "rewards/VisualizationJSONCombinedORM/std": 0.08863846957683563, "step": 3075, "train_speed(iter/s)": 0.122335 }, { "epoch": 2.543424317617866, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 359.5416666666667, "eval_completions/mean_length": 298.4270833333333, "eval_completions/min_length": 247.79166666666666, "eval_kl": 0.07147216796875, "eval_loss": 0.0007158294320106506, "eval_reward": 0.45823727548122406, "eval_reward_std": 0.0617563568521291, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45823727548122406, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0617563568521291, "eval_runtime": 308.1682, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 3075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 326.3125, "completions/min_length": 259.0, "epoch": 2.544251447477254, "grad_norm": 0.1982925683259964, "kl": 0.04266357421875, "learning_rate": 6.868148441156874e-07, "loss": 0.0004264339804649353, "memory(GiB)": 38.13, "reward": 0.6582806706428528, "reward_std": 0.08013637363910675, "rewards/VisualizationJSONCombinedORM/mean": 0.6582806706428528, "rewards/VisualizationJSONCombinedORM/std": 0.07864483445882797, "step": 3076, "train_speed(iter/s)": 0.120765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 299.0625, "completions/min_length": 248.0, "epoch": 2.545078577336642, "grad_norm": 0.18808521330356598, "kl": 0.07244873046875, "learning_rate": 6.843825727503178e-07, "loss": 0.0007238313555717468, "memory(GiB)": 38.13, "reward": 0.6216821670532227, "reward_std": 0.05961858853697777, "rewards/VisualizationJSONCombinedORM/mean": 0.6216821670532227, "rewards/VisualizationJSONCombinedORM/std": 0.06261198222637177, "step": 3077, "train_speed(iter/s)": 0.120686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 286.4375, "completions/min_length": 212.0, "epoch": 2.54590570719603, "grad_norm": 0.18469493091106415, "kl": 0.0457763671875, "learning_rate": 6.819542993809003e-07, "loss": 0.0004585161805152893, "memory(GiB)": 38.13, "reward": 0.690483808517456, "reward_std": 0.0560455247759819, "rewards/VisualizationJSONCombinedORM/mean": 0.690483808517456, "rewards/VisualizationJSONCombinedORM/std": 0.09542800486087799, "step": 3078, "train_speed(iter/s)": 0.120594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 315.0, "completions/min_length": 262.0, "epoch": 2.546732837055418, "grad_norm": 0.1556977778673172, "kl": 0.04583740234375, "learning_rate": 6.795300262569882e-07, "loss": 0.00045797601342201233, "memory(GiB)": 38.13, "reward": 0.7502470016479492, "reward_std": 0.03489881008863449, "rewards/VisualizationJSONCombinedORM/mean": 0.7502470016479492, "rewards/VisualizationJSONCombinedORM/std": 0.03841917961835861, "step": 3079, "train_speed(iter/s)": 0.120497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 291.0625, "completions/min_length": 234.0, "epoch": 2.5475599669148057, "grad_norm": 0.1821805238723755, "kl": 0.0997314453125, "learning_rate": 6.77109755624436e-07, "loss": 0.0009962767362594604, "memory(GiB)": 38.13, "reward": 0.6104632616043091, "reward_std": 0.06764846295118332, "rewards/VisualizationJSONCombinedORM/mean": 0.6104632616043091, "rewards/VisualizationJSONCombinedORM/std": 0.14687426388263702, "step": 3080, "train_speed(iter/s)": 0.120414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 283.5625, "completions/min_length": 237.0, "epoch": 2.5483870967741935, "grad_norm": 0.20457981526851654, "kl": 0.1180419921875, "learning_rate": 6.746934897253832e-07, "loss": 0.0011839419603347778, "memory(GiB)": 38.13, "reward": 0.5654392242431641, "reward_std": 0.09691879153251648, "rewards/VisualizationJSONCombinedORM/mean": 0.5654392242431641, "rewards/VisualizationJSONCombinedORM/std": 0.20341508090496063, "step": 3081, "train_speed(iter/s)": 0.120309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 280.8125, "completions/min_length": 229.0, "epoch": 2.5492142266335813, "grad_norm": 0.17398984730243683, "kl": 0.030914306640625, "learning_rate": 6.722812307982674e-07, "loss": 0.0003095082938671112, "memory(GiB)": 38.13, "reward": 0.6163338422775269, "reward_std": 0.0452558696269989, "rewards/VisualizationJSONCombinedORM/mean": 0.6163338422775269, "rewards/VisualizationJSONCombinedORM/std": 0.14746209979057312, "step": 3082, "train_speed(iter/s)": 0.12022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 300.75, "completions/min_length": 244.0, "epoch": 2.5500413564929696, "grad_norm": 0.17120467126369476, "kl": 0.02789306640625, "learning_rate": 6.698729810778065e-07, "loss": 0.00027814507484436035, "memory(GiB)": 38.13, "reward": 0.4561872184276581, "reward_std": 0.044096045196056366, "rewards/VisualizationJSONCombinedORM/mean": 0.4561872184276581, "rewards/VisualizationJSONCombinedORM/std": 0.09048355370759964, "step": 3083, "train_speed(iter/s)": 0.120132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 309.125, "completions/min_length": 218.0, "epoch": 2.5508684863523574, "grad_norm": 0.20340268313884735, "kl": 0.0972900390625, "learning_rate": 6.6746874279501e-07, "loss": 0.0009743943810462952, "memory(GiB)": 38.13, "reward": 0.7324697971343994, "reward_std": 0.09172964841127396, "rewards/VisualizationJSONCombinedORM/mean": 0.7324697971343994, "rewards/VisualizationJSONCombinedORM/std": 0.14134113490581512, "step": 3084, "train_speed(iter/s)": 0.120024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 318.6875, "completions/min_length": 247.0, "epoch": 2.5516956162117452, "grad_norm": 0.18349649012088776, "kl": 0.07244873046875, "learning_rate": 6.650685181771654e-07, "loss": 0.0007250756025314331, "memory(GiB)": 38.13, "reward": 0.7397599220275879, "reward_std": 0.08921141177415848, "rewards/VisualizationJSONCombinedORM/mean": 0.7397599220275879, "rewards/VisualizationJSONCombinedORM/std": 0.0956176146864891, "step": 3085, "train_speed(iter/s)": 0.119934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 306.5, "completions/min_length": 256.0, "epoch": 2.552522746071133, "grad_norm": 0.20808497071266174, "kl": 0.114013671875, "learning_rate": 6.626723094478477e-07, "loss": 0.001143127679824829, "memory(GiB)": 38.13, "reward": 0.6249483227729797, "reward_std": 0.08982871472835541, "rewards/VisualizationJSONCombinedORM/mean": 0.6249483227729797, "rewards/VisualizationJSONCombinedORM/std": 0.1689741462469101, "step": 3086, "train_speed(iter/s)": 0.119845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 290.1875, "completions/min_length": 241.0, "epoch": 2.553349875930521, "grad_norm": 0.20355679094791412, "kl": 0.04632568359375, "learning_rate": 6.602801188269081e-07, "loss": 0.00046534091234207153, "memory(GiB)": 38.13, "reward": 0.75897216796875, "reward_std": 0.0690321996808052, "rewards/VisualizationJSONCombinedORM/mean": 0.75897216796875, "rewards/VisualizationJSONCombinedORM/std": 0.09162025153636932, "step": 3087, "train_speed(iter/s)": 0.119764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 291.875, "completions/min_length": 222.0, "epoch": 2.554177005789909, "grad_norm": 0.22416535019874573, "kl": 0.06378173828125, "learning_rate": 6.57891948530478e-07, "loss": 0.0006373990327119827, "memory(GiB)": 38.13, "reward": 0.37173688411712646, "reward_std": 0.050436437129974365, "rewards/VisualizationJSONCombinedORM/mean": 0.37173688411712646, "rewards/VisualizationJSONCombinedORM/std": 0.05660080164670944, "step": 3088, "train_speed(iter/s)": 0.119688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 341.4375, "completions/min_length": 237.0, "epoch": 2.555004135649297, "grad_norm": 0.1729494035243988, "kl": 0.08203125, "learning_rate": 6.555078007709603e-07, "loss": 0.000819813460111618, "memory(GiB)": 38.13, "reward": 0.3530132472515106, "reward_std": 0.05777183920145035, "rewards/VisualizationJSONCombinedORM/mean": 0.3530132472515106, "rewards/VisualizationJSONCombinedORM/std": 0.10026565194129944, "step": 3089, "train_speed(iter/s)": 0.119606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 301.125, "completions/min_length": 240.0, "epoch": 2.555831265508685, "grad_norm": 0.16404244303703308, "kl": 0.02740478515625, "learning_rate": 6.531276777570361e-07, "loss": 0.0002742856740951538, "memory(GiB)": 38.13, "reward": 0.5701751708984375, "reward_std": 0.05301051586866379, "rewards/VisualizationJSONCombinedORM/mean": 0.5701751708984375, "rewards/VisualizationJSONCombinedORM/std": 0.18037688732147217, "step": 3090, "train_speed(iter/s)": 0.119512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 308.5625, "completions/min_length": 235.0, "epoch": 2.556658395368073, "grad_norm": 0.18468032777309418, "kl": 0.06512451171875, "learning_rate": 6.507515816936538e-07, "loss": 0.0006519239395856857, "memory(GiB)": 38.13, "reward": 0.6527553796768188, "reward_std": 0.07900533080101013, "rewards/VisualizationJSONCombinedORM/mean": 0.6527553796768188, "rewards/VisualizationJSONCombinedORM/std": 0.08052269369363785, "step": 3091, "train_speed(iter/s)": 0.119415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 285.5, "completions/min_length": 218.0, "epoch": 2.557485525227461, "grad_norm": 0.2065132111310959, "kl": 0.0950927734375, "learning_rate": 6.483795147820343e-07, "loss": 0.0009501725435256958, "memory(GiB)": 38.13, "reward": 0.5340557098388672, "reward_std": 0.08760794252157211, "rewards/VisualizationJSONCombinedORM/mean": 0.5340557098388672, "rewards/VisualizationJSONCombinedORM/std": 0.18444029986858368, "step": 3092, "train_speed(iter/s)": 0.119322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 293.3125, "completions/min_length": 240.0, "epoch": 2.5583126550868487, "grad_norm": 0.17137938737869263, "kl": 0.089599609375, "learning_rate": 6.460114792196642e-07, "loss": 0.0008962899446487427, "memory(GiB)": 38.13, "reward": 0.6502361297607422, "reward_std": 0.0622883178293705, "rewards/VisualizationJSONCombinedORM/mean": 0.6502361297607422, "rewards/VisualizationJSONCombinedORM/std": 0.12256740778684616, "step": 3093, "train_speed(iter/s)": 0.11925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 294.25, "completions/min_length": 223.0, "epoch": 2.5591397849462365, "grad_norm": 0.2428397536277771, "kl": 0.06396484375, "learning_rate": 6.436474772002976e-07, "loss": 0.0006393790245056152, "memory(GiB)": 38.13, "reward": 0.6075164079666138, "reward_std": 0.09798905998468399, "rewards/VisualizationJSONCombinedORM/mean": 0.6075164079666138, "rewards/VisualizationJSONCombinedORM/std": 0.10897570848464966, "step": 3094, "train_speed(iter/s)": 0.119171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 286.0625, "completions/min_length": 230.0, "epoch": 2.5599669148056243, "grad_norm": 0.2645581364631653, "kl": 0.05609130859375, "learning_rate": 6.412875109139482e-07, "loss": 0.0005615800619125366, "memory(GiB)": 38.13, "reward": 0.590524435043335, "reward_std": 0.09612353146076202, "rewards/VisualizationJSONCombinedORM/mean": 0.590524435043335, "rewards/VisualizationJSONCombinedORM/std": 0.19803962111473083, "step": 3095, "train_speed(iter/s)": 0.119097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 308.25, "completions/min_length": 223.0, "epoch": 2.5607940446650126, "grad_norm": 0.17108996212482452, "kl": 0.07269287109375, "learning_rate": 6.38931582546895e-07, "loss": 0.0007272064685821533, "memory(GiB)": 38.13, "reward": 0.2560468912124634, "reward_std": 0.028195131570100784, "rewards/VisualizationJSONCombinedORM/mean": 0.2560468912124634, "rewards/VisualizationJSONCombinedORM/std": 0.07972617447376251, "step": 3096, "train_speed(iter/s)": 0.11901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 294.0, "completions/min_length": 225.0, "epoch": 2.5616211745244004, "grad_norm": 0.16890522837638855, "kl": 0.033447265625, "learning_rate": 6.365796942816716e-07, "loss": 0.00033384934067726135, "memory(GiB)": 38.13, "reward": 0.6890679001808167, "reward_std": 0.07971359044313431, "rewards/VisualizationJSONCombinedORM/mean": 0.6890679001808167, "rewards/VisualizationJSONCombinedORM/std": 0.11944824457168579, "step": 3097, "train_speed(iter/s)": 0.118915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 305.8125, "completions/min_length": 244.0, "epoch": 2.5624483043837882, "grad_norm": 0.22239559888839722, "kl": 0.04864501953125, "learning_rate": 6.342318482970755e-07, "loss": 0.00048629194498062134, "memory(GiB)": 38.13, "reward": 0.5246835350990295, "reward_std": 0.06184273213148117, "rewards/VisualizationJSONCombinedORM/mean": 0.5246835350990295, "rewards/VisualizationJSONCombinedORM/std": 0.06196078285574913, "step": 3098, "train_speed(iter/s)": 0.118853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 261.625, "completions/min_length": 229.0, "epoch": 2.563275434243176, "grad_norm": 0.15602731704711914, "kl": 0.0714111328125, "learning_rate": 6.318880467681527e-07, "loss": 0.0007156059145927429, "memory(GiB)": 38.13, "reward": 0.4943757653236389, "reward_std": 0.07942855358123779, "rewards/VisualizationJSONCombinedORM/mean": 0.4943757653236389, "rewards/VisualizationJSONCombinedORM/std": 0.2551257014274597, "step": 3099, "train_speed(iter/s)": 0.118787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 281.5, "completions/min_length": 223.0, "epoch": 2.564102564102564, "grad_norm": 0.1863626390695572, "kl": 0.046630859375, "learning_rate": 6.295482918662066e-07, "loss": 0.0004665479063987732, "memory(GiB)": 38.13, "reward": 0.678560197353363, "reward_std": 0.07231469452381134, "rewards/VisualizationJSONCombinedORM/mean": 0.678560197353363, "rewards/VisualizationJSONCombinedORM/std": 0.0842670276761055, "step": 3100, "train_speed(iter/s)": 0.118707 }, { "epoch": 2.564102564102564, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 365.4583333333333, "eval_completions/mean_length": 300.84375, "eval_completions/min_length": 254.875, "eval_kl": 0.06720479329427083, "eval_loss": 0.0006726036663167179, "eval_reward": 0.4531944468617439, "eval_reward_std": 0.05712210499526312, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4531944468617439, "eval_rewards/VisualizationJSONCombinedORM/std": 0.057122108507125326, "eval_runtime": 311.112, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 265.625, "completions/min_length": 229.0, "epoch": 2.564929693961952, "grad_norm": 0.48064613342285156, "kl": 0.0751953125, "learning_rate": 6.272125857587891e-07, "loss": 0.0007463321089744568, "memory(GiB)": 38.13, "reward": 0.5161312818527222, "reward_std": 0.020033812150359154, "rewards/VisualizationJSONCombinedORM/mean": 0.5161312818527222, "rewards/VisualizationJSONCombinedORM/std": 0.11345377564430237, "step": 3101, "train_speed(iter/s)": 0.117234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 316.375, "completions/min_length": 243.0, "epoch": 2.56575682382134, "grad_norm": 0.17363189160823822, "kl": 0.034698486328125, "learning_rate": 6.248809306097036e-07, "loss": 0.0003474801778793335, "memory(GiB)": 38.13, "reward": 0.7379703521728516, "reward_std": 0.05721345916390419, "rewards/VisualizationJSONCombinedORM/mean": 0.7379703521728516, "rewards/VisualizationJSONCombinedORM/std": 0.12100474536418915, "step": 3102, "train_speed(iter/s)": 0.117174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 272.8125, "completions/min_length": 224.0, "epoch": 2.566583953680728, "grad_norm": 0.18221105635166168, "kl": 0.06878662109375, "learning_rate": 6.225533285789997e-07, "loss": 0.0006878934800624847, "memory(GiB)": 38.13, "reward": 0.7325058579444885, "reward_std": 0.06019818037748337, "rewards/VisualizationJSONCombinedORM/mean": 0.7325058579444885, "rewards/VisualizationJSONCombinedORM/std": 0.05993511155247688, "step": 3103, "train_speed(iter/s)": 0.117087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 308.9375, "completions/min_length": 238.0, "epoch": 2.567411083540116, "grad_norm": 0.1818762719631195, "kl": 0.07916259765625, "learning_rate": 6.20229781822973e-07, "loss": 0.0007896721363067627, "memory(GiB)": 38.13, "reward": 0.5441964864730835, "reward_std": 0.08129115402698517, "rewards/VisualizationJSONCombinedORM/mean": 0.5441964864730835, "rewards/VisualizationJSONCombinedORM/std": 0.15218587219715118, "step": 3104, "train_speed(iter/s)": 0.117005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 272.0625, "completions/min_length": 249.0, "epoch": 2.568238213399504, "grad_norm": 0.15928679704666138, "kl": 0.0277099609375, "learning_rate": 6.179102924941599e-07, "loss": 0.0002763308584690094, "memory(GiB)": 38.13, "reward": 0.5813865661621094, "reward_std": 0.06343766301870346, "rewards/VisualizationJSONCombinedORM/mean": 0.5813865661621094, "rewards/VisualizationJSONCombinedORM/std": 0.08006592094898224, "step": 3105, "train_speed(iter/s)": 0.116938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 308.4375, "completions/min_length": 232.0, "epoch": 2.5690653432588917, "grad_norm": 0.20875288546085358, "kl": 0.08184814453125, "learning_rate": 6.155948627413411e-07, "loss": 0.000819183886051178, "memory(GiB)": 38.13, "reward": 0.7126063704490662, "reward_std": 0.07605001330375671, "rewards/VisualizationJSONCombinedORM/mean": 0.7126063704490662, "rewards/VisualizationJSONCombinedORM/std": 0.09002722054719925, "step": 3106, "train_speed(iter/s)": 0.116854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 310.5625, "completions/min_length": 237.0, "epoch": 2.5698924731182795, "grad_norm": 0.1561247557401657, "kl": 0.04949951171875, "learning_rate": 6.132834947095334e-07, "loss": 0.0004947781562805176, "memory(GiB)": 38.13, "reward": 0.5552332401275635, "reward_std": 0.048821888864040375, "rewards/VisualizationJSONCombinedORM/mean": 0.5552332401275635, "rewards/VisualizationJSONCombinedORM/std": 0.28739163279533386, "step": 3107, "train_speed(iter/s)": 0.116759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 273.625, "completions/min_length": 239.0, "epoch": 2.5707196029776673, "grad_norm": 0.20433928072452545, "kl": 0.1622314453125, "learning_rate": 6.10976190539993e-07, "loss": 0.0016241297125816345, "memory(GiB)": 38.13, "reward": 0.4668424427509308, "reward_std": 0.09607827663421631, "rewards/VisualizationJSONCombinedORM/mean": 0.4668424427509308, "rewards/VisualizationJSONCombinedORM/std": 0.20676545798778534, "step": 3108, "train_speed(iter/s)": 0.11667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 274.375, "completions/min_length": 228.0, "epoch": 2.5715467328370556, "grad_norm": 0.20411714911460876, "kl": 0.1134033203125, "learning_rate": 6.086729523702118e-07, "loss": 0.0011341944336891174, "memory(GiB)": 38.13, "reward": 0.26396146416664124, "reward_std": 0.03481718525290489, "rewards/VisualizationJSONCombinedORM/mean": 0.26396146416664124, "rewards/VisualizationJSONCombinedORM/std": 0.05500955879688263, "step": 3109, "train_speed(iter/s)": 0.116612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 295.4375, "completions/min_length": 235.0, "epoch": 2.5723738626964434, "grad_norm": 0.2243853211402893, "kl": 0.05865478515625, "learning_rate": 6.063737823339133e-07, "loss": 0.0005867313593626022, "memory(GiB)": 38.13, "reward": 0.513437032699585, "reward_std": 0.0743831992149353, "rewards/VisualizationJSONCombinedORM/mean": 0.513437032699585, "rewards/VisualizationJSONCombinedORM/std": 0.18495531380176544, "step": 3110, "train_speed(iter/s)": 0.116545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 266.375, "completions/min_length": 218.0, "epoch": 2.5732009925558312, "grad_norm": 0.17202329635620117, "kl": 0.03912353515625, "learning_rate": 6.040786825610518e-07, "loss": 0.00039099156856536865, "memory(GiB)": 38.13, "reward": 0.5178261399269104, "reward_std": 0.037918344140052795, "rewards/VisualizationJSONCombinedORM/mean": 0.5178261399269104, "rewards/VisualizationJSONCombinedORM/std": 0.09131336957216263, "step": 3111, "train_speed(iter/s)": 0.116482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 292.9375, "completions/min_length": 215.0, "epoch": 2.574028122415219, "grad_norm": 0.1985408365726471, "kl": 0.092529296875, "learning_rate": 6.01787655177814e-07, "loss": 0.0009252503514289856, "memory(GiB)": 38.13, "reward": 0.6158766150474548, "reward_std": 0.11048437654972076, "rewards/VisualizationJSONCombinedORM/mean": 0.6158766150474548, "rewards/VisualizationJSONCombinedORM/std": 0.1734015792608261, "step": 3112, "train_speed(iter/s)": 0.116383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 286.125, "completions/min_length": 238.0, "epoch": 2.574855252274607, "grad_norm": 0.18415310978889465, "kl": 0.04937744140625, "learning_rate": 5.995007023066085e-07, "loss": 0.0004936680197715759, "memory(GiB)": 38.13, "reward": 0.5763905048370361, "reward_std": 0.054879337549209595, "rewards/VisualizationJSONCombinedORM/mean": 0.5763905048370361, "rewards/VisualizationJSONCombinedORM/std": 0.224539652466774, "step": 3113, "train_speed(iter/s)": 0.116309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 282.125, "completions/min_length": 250.0, "epoch": 2.575682382133995, "grad_norm": 0.2613496482372284, "kl": 0.0614013671875, "learning_rate": 5.972178260660771e-07, "loss": 0.0006123483180999756, "memory(GiB)": 38.13, "reward": 0.45451730489730835, "reward_std": 0.05964197218418121, "rewards/VisualizationJSONCombinedORM/mean": 0.45451730489730835, "rewards/VisualizationJSONCombinedORM/std": 0.1434880793094635, "step": 3114, "train_speed(iter/s)": 0.116252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 296.375, "completions/min_length": 254.0, "epoch": 2.576509511993383, "grad_norm": 0.2020982801914215, "kl": 0.03167724609375, "learning_rate": 5.949390285710777e-07, "loss": 0.00031665340065956116, "memory(GiB)": 38.13, "reward": 0.6773995757102966, "reward_std": 0.06675386428833008, "rewards/VisualizationJSONCombinedORM/mean": 0.6773995757102966, "rewards/VisualizationJSONCombinedORM/std": 0.10093756020069122, "step": 3115, "train_speed(iter/s)": 0.116171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 302.875, "completions/min_length": 264.0, "epoch": 2.577336641852771, "grad_norm": 0.16798177361488342, "kl": 0.033721923828125, "learning_rate": 5.926643119326936e-07, "loss": 0.00033720582723617554, "memory(GiB)": 38.13, "reward": 0.7706823348999023, "reward_std": 0.06931640207767487, "rewards/VisualizationJSONCombinedORM/mean": 0.7706823348999023, "rewards/VisualizationJSONCombinedORM/std": 0.06785383820533752, "step": 3116, "train_speed(iter/s)": 0.116106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 318.9375, "completions/min_length": 253.0, "epoch": 2.578163771712159, "grad_norm": 0.22289302945137024, "kl": 0.06744384765625, "learning_rate": 5.903936782582253e-07, "loss": 0.0006727930158376694, "memory(GiB)": 38.13, "reward": 0.37488529086112976, "reward_std": 0.04715510457754135, "rewards/VisualizationJSONCombinedORM/mean": 0.37488529086112976, "rewards/VisualizationJSONCombinedORM/std": 0.06334098428487778, "step": 3117, "train_speed(iter/s)": 0.116039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 255.25, "completions/min_length": 212.0, "epoch": 2.578990901571547, "grad_norm": 0.18741585314273834, "kl": 0.04248046875, "learning_rate": 5.881271296511926e-07, "loss": 0.0004250332713127136, "memory(GiB)": 38.13, "reward": 0.6956526041030884, "reward_std": 0.08405837416648865, "rewards/VisualizationJSONCombinedORM/mean": 0.6956526041030884, "rewards/VisualizationJSONCombinedORM/std": 0.08615541458129883, "step": 3118, "train_speed(iter/s)": 0.11597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 309.9375, "completions/min_length": 234.0, "epoch": 2.5798180314309347, "grad_norm": 0.17406223714351654, "kl": 0.07598876953125, "learning_rate": 5.858646682113306e-07, "loss": 0.0007619708776473999, "memory(GiB)": 38.13, "reward": 0.4174354672431946, "reward_std": 0.03702305257320404, "rewards/VisualizationJSONCombinedORM/mean": 0.4174354672431946, "rewards/VisualizationJSONCombinedORM/std": 0.08819565922021866, "step": 3119, "train_speed(iter/s)": 0.115901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 302.3125, "completions/min_length": 245.0, "epoch": 2.5806451612903225, "grad_norm": 0.15085968375205994, "kl": 0.026275634765625, "learning_rate": 5.836062960345878e-07, "loss": 0.000262625515460968, "memory(GiB)": 38.13, "reward": 0.43069547414779663, "reward_std": 0.0254814475774765, "rewards/VisualizationJSONCombinedORM/mean": 0.43069547414779663, "rewards/VisualizationJSONCombinedORM/std": 0.05476325377821922, "step": 3120, "train_speed(iter/s)": 0.115825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 291.5, "completions/min_length": 242.0, "epoch": 2.5814722911497103, "grad_norm": 0.1856432557106018, "kl": 0.037841796875, "learning_rate": 5.813520152131252e-07, "loss": 0.00037886016070842743, "memory(GiB)": 38.13, "reward": 0.3565816879272461, "reward_std": 0.04221521317958832, "rewards/VisualizationJSONCombinedORM/mean": 0.3565816879272461, "rewards/VisualizationJSONCombinedORM/std": 0.0687914788722992, "step": 3121, "train_speed(iter/s)": 0.115762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 297.1875, "completions/min_length": 242.0, "epoch": 2.5822994210090986, "grad_norm": 0.15768767893314362, "kl": 0.0615234375, "learning_rate": 5.791018278353106e-07, "loss": 0.0006136298179626465, "memory(GiB)": 38.13, "reward": 0.5161242485046387, "reward_std": 0.039944589138031006, "rewards/VisualizationJSONCombinedORM/mean": 0.5161242485046387, "rewards/VisualizationJSONCombinedORM/std": 0.11728761345148087, "step": 3122, "train_speed(iter/s)": 0.115701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 306.6875, "completions/min_length": 253.0, "epoch": 2.5831265508684864, "grad_norm": 0.18221932649612427, "kl": 0.05859375, "learning_rate": 5.768557359857241e-07, "loss": 0.0005861073732376099, "memory(GiB)": 38.13, "reward": 0.5907638072967529, "reward_std": 0.04366660490632057, "rewards/VisualizationJSONCombinedORM/mean": 0.5907638072967529, "rewards/VisualizationJSONCombinedORM/std": 0.25884929299354553, "step": 3123, "train_speed(iter/s)": 0.115622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 273.75, "completions/min_length": 223.0, "epoch": 2.5839536807278742, "grad_norm": 0.16347606480121613, "kl": 0.14434814453125, "learning_rate": 5.746137417451464e-07, "loss": 0.0014417842030525208, "memory(GiB)": 38.13, "reward": 0.5230663418769836, "reward_std": 0.05884048715233803, "rewards/VisualizationJSONCombinedORM/mean": 0.5230663418769836, "rewards/VisualizationJSONCombinedORM/std": 0.13594037294387817, "step": 3124, "train_speed(iter/s)": 0.115574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 281.6875, "completions/min_length": 206.0, "epoch": 2.584780810587262, "grad_norm": 0.22958193719387054, "kl": 0.1026611328125, "learning_rate": 5.723758471905677e-07, "loss": 0.0010261833667755127, "memory(GiB)": 38.13, "reward": 0.6497842073440552, "reward_std": 0.10218930244445801, "rewards/VisualizationJSONCombinedORM/mean": 0.6497842073440552, "rewards/VisualizationJSONCombinedORM/std": 0.11394035071134567, "step": 3125, "train_speed(iter/s)": 0.115499 }, { "epoch": 2.584780810587262, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 363.5, "eval_completions/mean_length": 301.3697916666667, "eval_completions/min_length": 252.91666666666666, "eval_kl": 0.07948811848958333, "eval_loss": 0.000797634304035455, "eval_reward": 0.4555779676884413, "eval_reward_std": 0.060272248151401676, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4555779676884413, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06027224997524172, "eval_runtime": 311.3639, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 3125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 304.75, "completions/min_length": 243.0, "epoch": 2.58560794044665, "grad_norm": 0.23181074857711792, "kl": 0.09130859375, "learning_rate": 5.701420543951757e-07, "loss": 0.0009132847189903259, "memory(GiB)": 38.13, "reward": 0.5406632423400879, "reward_std": 0.06973478198051453, "rewards/VisualizationJSONCombinedORM/mean": 0.5406632423400879, "rewards/VisualizationJSONCombinedORM/std": 0.21828296780586243, "step": 3126, "train_speed(iter/s)": 0.11411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 290.3125, "completions/min_length": 230.0, "epoch": 2.586435070306038, "grad_norm": 0.1839340180158615, "kl": 0.176513671875, "learning_rate": 5.67912365428363e-07, "loss": 0.0017620474100112915, "memory(GiB)": 38.13, "reward": 0.4150509834289551, "reward_std": 0.06811123341321945, "rewards/VisualizationJSONCombinedORM/mean": 0.4150509834289551, "rewards/VisualizationJSONCombinedORM/std": 0.12477874755859375, "step": 3127, "train_speed(iter/s)": 0.114047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 332.0, "completions/min_length": 280.0, "epoch": 2.587262200165426, "grad_norm": 0.17160436511039734, "kl": 0.0625, "learning_rate": 5.656867823557144e-07, "loss": 0.0006250627338886261, "memory(GiB)": 38.13, "reward": 0.5250024199485779, "reward_std": 0.0410347543656826, "rewards/VisualizationJSONCombinedORM/mean": 0.5250024199485779, "rewards/VisualizationJSONCombinedORM/std": 0.1584199219942093, "step": 3128, "train_speed(iter/s)": 0.113986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 286.6875, "completions/min_length": 229.0, "epoch": 2.588089330024814, "grad_norm": 0.19660069048404694, "kl": 0.081298828125, "learning_rate": 5.634653072390167e-07, "loss": 0.0008113794028759003, "memory(GiB)": 38.13, "reward": 0.5521092414855957, "reward_std": 0.10143592953681946, "rewards/VisualizationJSONCombinedORM/mean": 0.5521092414855957, "rewards/VisualizationJSONCombinedORM/std": 0.1372368037700653, "step": 3129, "train_speed(iter/s)": 0.113893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 316.75, "completions/min_length": 243.0, "epoch": 2.588916459884202, "grad_norm": 0.20857203006744385, "kl": 0.08447265625, "learning_rate": 5.612479421362454e-07, "loss": 0.0008439794182777405, "memory(GiB)": 38.13, "reward": 0.586531400680542, "reward_std": 0.13670498132705688, "rewards/VisualizationJSONCombinedORM/mean": 0.586531400680542, "rewards/VisualizationJSONCombinedORM/std": 0.21646513044834137, "step": 3130, "train_speed(iter/s)": 0.113797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 309.5625, "completions/min_length": 261.0, "epoch": 2.58974358974359, "grad_norm": 0.16443495452404022, "kl": 0.0328369140625, "learning_rate": 5.590346891015758e-07, "loss": 0.00032832473516464233, "memory(GiB)": 38.13, "reward": 0.3561584949493408, "reward_std": 0.0379316546022892, "rewards/VisualizationJSONCombinedORM/mean": 0.3561584949493408, "rewards/VisualizationJSONCombinedORM/std": 0.06922315061092377, "step": 3131, "train_speed(iter/s)": 0.113731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 313.5625, "completions/min_length": 241.0, "epoch": 2.5905707196029777, "grad_norm": 0.193902388215065, "kl": 0.068359375, "learning_rate": 5.568255501853664e-07, "loss": 0.0006847269833087921, "memory(GiB)": 38.13, "reward": 0.4746978282928467, "reward_std": 0.030994880944490433, "rewards/VisualizationJSONCombinedORM/mean": 0.4746978282928467, "rewards/VisualizationJSONCombinedORM/std": 0.25741255283355713, "step": 3132, "train_speed(iter/s)": 0.113661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 292.25, "completions/min_length": 243.0, "epoch": 2.5913978494623655, "grad_norm": 0.1691707968711853, "kl": 0.1253662109375, "learning_rate": 5.546205274341693e-07, "loss": 0.0012542754411697388, "memory(GiB)": 38.13, "reward": 0.7639775276184082, "reward_std": 0.0891905277967453, "rewards/VisualizationJSONCombinedORM/mean": 0.7639775276184082, "rewards/VisualizationJSONCombinedORM/std": 0.09434331208467484, "step": 3133, "train_speed(iter/s)": 0.113573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 321.1875, "completions/min_length": 239.0, "epoch": 2.5922249793217533, "grad_norm": 0.19349926710128784, "kl": 0.26220703125, "learning_rate": 5.524196228907203e-07, "loss": 0.0026202909648418427, "memory(GiB)": 38.13, "reward": 0.5389207601547241, "reward_std": 0.06028645485639572, "rewards/VisualizationJSONCombinedORM/mean": 0.5389207601547241, "rewards/VisualizationJSONCombinedORM/std": 0.0826449766755104, "step": 3134, "train_speed(iter/s)": 0.113487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 291.8125, "completions/min_length": 238.0, "epoch": 2.5930521091811416, "grad_norm": 0.3246122896671295, "kl": 0.130126953125, "learning_rate": 5.502228385939418e-07, "loss": 0.0012978091835975647, "memory(GiB)": 38.13, "reward": 0.6359121203422546, "reward_std": 0.06403448432683945, "rewards/VisualizationJSONCombinedORM/mean": 0.6359121203422546, "rewards/VisualizationJSONCombinedORM/std": 0.2196921557188034, "step": 3135, "train_speed(iter/s)": 0.113419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/mean_length": 329.125, "completions/min_length": 245.0, "epoch": 2.5938792390405294, "grad_norm": 0.17613734304904938, "kl": 0.1123046875, "learning_rate": 5.480301765789392e-07, "loss": 0.0011245571076869965, "memory(GiB)": 38.13, "reward": 0.6198031902313232, "reward_std": 0.05971336364746094, "rewards/VisualizationJSONCombinedORM/mean": 0.6198031902313232, "rewards/VisualizationJSONCombinedORM/std": 0.16647091507911682, "step": 3136, "train_speed(iter/s)": 0.113326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 281.125, "completions/min_length": 232.0, "epoch": 2.5947063688999172, "grad_norm": 0.17487362027168274, "kl": 0.1094970703125, "learning_rate": 5.458416388769994e-07, "loss": 0.0010963156819343567, "memory(GiB)": 38.13, "reward": 0.533873975276947, "reward_std": 0.08871671557426453, "rewards/VisualizationJSONCombinedORM/mean": 0.533873975276947, "rewards/VisualizationJSONCombinedORM/std": 0.1867852360010147, "step": 3137, "train_speed(iter/s)": 0.113259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 306.6875, "completions/min_length": 207.0, "epoch": 2.595533498759305, "grad_norm": 0.21121430397033691, "kl": 0.1524658203125, "learning_rate": 5.43657227515586e-07, "loss": 0.0015275254845619202, "memory(GiB)": 38.13, "reward": 0.6537426114082336, "reward_std": 0.07493278384208679, "rewards/VisualizationJSONCombinedORM/mean": 0.6537426114082336, "rewards/VisualizationJSONCombinedORM/std": 0.07291463017463684, "step": 3138, "train_speed(iter/s)": 0.113188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 311.5625, "completions/min_length": 227.0, "epoch": 2.596360628618693, "grad_norm": 0.2423567771911621, "kl": 0.07391357421875, "learning_rate": 5.414769445183432e-07, "loss": 0.0007393693085759878, "memory(GiB)": 38.13, "reward": 0.4133782982826233, "reward_std": 0.08327580988407135, "rewards/VisualizationJSONCombinedORM/mean": 0.4133782982826233, "rewards/VisualizationJSONCombinedORM/std": 0.15042906999588013, "step": 3139, "train_speed(iter/s)": 0.113119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 299.5, "completions/min_length": 249.0, "epoch": 2.597187758478081, "grad_norm": 0.20869934558868408, "kl": 0.06414794921875, "learning_rate": 5.39300791905088e-07, "loss": 0.0006394516676664352, "memory(GiB)": 38.13, "reward": 0.5987627506256104, "reward_std": 0.07225383818149567, "rewards/VisualizationJSONCombinedORM/mean": 0.5987627506256104, "rewards/VisualizationJSONCombinedORM/std": 0.20358966290950775, "step": 3140, "train_speed(iter/s)": 0.113049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 294.5625, "completions/min_length": 227.0, "epoch": 2.598014888337469, "grad_norm": 0.15382687747478485, "kl": 0.07977294921875, "learning_rate": 5.371287716918128e-07, "loss": 0.000797836109995842, "memory(GiB)": 38.13, "reward": 0.47488832473754883, "reward_std": 0.04436129331588745, "rewards/VisualizationJSONCombinedORM/mean": 0.47488832473754883, "rewards/VisualizationJSONCombinedORM/std": 0.29206588864326477, "step": 3141, "train_speed(iter/s)": 0.112972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 281.1875, "completions/min_length": 231.0, "epoch": 2.598842018196857, "grad_norm": 0.18298840522766113, "kl": 0.0548095703125, "learning_rate": 5.349608858906812e-07, "loss": 0.0005482695996761322, "memory(GiB)": 38.13, "reward": 0.47281476855278015, "reward_std": 0.025039302185177803, "rewards/VisualizationJSONCombinedORM/mean": 0.47281476855278015, "rewards/VisualizationJSONCombinedORM/std": 0.16727325320243835, "step": 3142, "train_speed(iter/s)": 0.112898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 303.4375, "completions/min_length": 220.0, "epoch": 2.599669148056245, "grad_norm": 0.19003459811210632, "kl": 0.0618896484375, "learning_rate": 5.327971365100276e-07, "loss": 0.0006192214787006378, "memory(GiB)": 38.13, "reward": 0.5793459415435791, "reward_std": 0.04574192315340042, "rewards/VisualizationJSONCombinedORM/mean": 0.5793459415435791, "rewards/VisualizationJSONCombinedORM/std": 0.15064877271652222, "step": 3143, "train_speed(iter/s)": 0.112828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 297.875, "completions/min_length": 239.0, "epoch": 2.600496277915633, "grad_norm": 0.2052978128194809, "kl": 0.082275390625, "learning_rate": 5.306375255543511e-07, "loss": 0.0008222758769989014, "memory(GiB)": 38.13, "reward": 0.6746736764907837, "reward_std": 0.07432470470666885, "rewards/VisualizationJSONCombinedORM/mean": 0.6746736764907837, "rewards/VisualizationJSONCombinedORM/std": 0.08711450546979904, "step": 3144, "train_speed(iter/s)": 0.112753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 316.625, "completions/min_length": 221.0, "epoch": 2.6013234077750207, "grad_norm": 0.19207854568958282, "kl": 0.06201171875, "learning_rate": 5.284820550243219e-07, "loss": 0.000618278980255127, "memory(GiB)": 38.13, "reward": 0.4536711573600769, "reward_std": 0.06405720859766006, "rewards/VisualizationJSONCombinedORM/mean": 0.4536711573600769, "rewards/VisualizationJSONCombinedORM/std": 0.16595736145973206, "step": 3145, "train_speed(iter/s)": 0.112673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 295.875, "completions/min_length": 193.0, "epoch": 2.6021505376344085, "grad_norm": 0.18347227573394775, "kl": 0.057861328125, "learning_rate": 5.263307269167683e-07, "loss": 0.0005783960223197937, "memory(GiB)": 38.13, "reward": 0.5181752443313599, "reward_std": 0.08529488742351532, "rewards/VisualizationJSONCombinedORM/mean": 0.5181752443313599, "rewards/VisualizationJSONCombinedORM/std": 0.11852850764989853, "step": 3146, "train_speed(iter/s)": 0.112606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 298.75, "completions/min_length": 241.0, "epoch": 2.6029776674937963, "grad_norm": 0.18319456279277802, "kl": 0.159423828125, "learning_rate": 5.241835432246888e-07, "loss": 0.0015950314700603485, "memory(GiB)": 38.13, "reward": 0.48314982652664185, "reward_std": 0.04617635905742645, "rewards/VisualizationJSONCombinedORM/mean": 0.48314982652664185, "rewards/VisualizationJSONCombinedORM/std": 0.23614637553691864, "step": 3147, "train_speed(iter/s)": 0.112539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 283.125, "completions/min_length": 241.0, "epoch": 2.6038047973531846, "grad_norm": 0.2171870470046997, "kl": 0.05828857421875, "learning_rate": 5.220405059372352e-07, "loss": 0.0005817785859107971, "memory(GiB)": 38.13, "reward": 0.47732609510421753, "reward_std": 0.044287730008363724, "rewards/VisualizationJSONCombinedORM/mean": 0.47732609510421753, "rewards/VisualizationJSONCombinedORM/std": 0.07934361696243286, "step": 3148, "train_speed(iter/s)": 0.112469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 292.5625, "completions/min_length": 239.0, "epoch": 2.6046319272125724, "grad_norm": 0.21095287799835205, "kl": 0.02392578125, "learning_rate": 5.199016170397237e-07, "loss": 0.00023965537548065186, "memory(GiB)": 38.13, "reward": 0.5619919300079346, "reward_std": 0.08480392396450043, "rewards/VisualizationJSONCombinedORM/mean": 0.5619919300079346, "rewards/VisualizationJSONCombinedORM/std": 0.08672071993350983, "step": 3149, "train_speed(iter/s)": 0.112408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 283.9375, "completions/min_length": 222.0, "epoch": 2.6054590570719602, "grad_norm": 0.1681428998708725, "kl": 0.09326171875, "learning_rate": 5.177668785136225e-07, "loss": 0.0009338632225990295, "memory(GiB)": 38.13, "reward": 0.6105669140815735, "reward_std": 0.08697008341550827, "rewards/VisualizationJSONCombinedORM/mean": 0.6105669140815735, "rewards/VisualizationJSONCombinedORM/std": 0.11910943686962128, "step": 3150, "train_speed(iter/s)": 0.112337 }, { "epoch": 2.6054590570719602, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 359.4166666666667, "eval_completions/mean_length": 299.3697916666667, "eval_completions/min_length": 250.33333333333334, "eval_kl": 0.075225830078125, "eval_loss": 0.0007498959894292057, "eval_reward": 0.4634177125990391, "eval_reward_std": 0.05831238223860661, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4634177125990391, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05831238239382704, "eval_runtime": 307.9626, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 298.875, "completions/min_length": 233.0, "epoch": 2.606286186931348, "grad_norm": 0.22984178364276886, "kl": 0.032745361328125, "learning_rate": 5.156362923365587e-07, "loss": 0.00032793357968330383, "memory(GiB)": 38.13, "reward": 0.6377209424972534, "reward_std": 0.06516726315021515, "rewards/VisualizationJSONCombinedORM/mean": 0.6377209424972534, "rewards/VisualizationJSONCombinedORM/std": 0.09980335086584091, "step": 3151, "train_speed(iter/s)": 0.111056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 281.5625, "completions/min_length": 217.0, "epoch": 2.607113316790736, "grad_norm": 0.17500314116477966, "kl": 0.05609130859375, "learning_rate": 5.135098604823107e-07, "loss": 0.0005606189370155334, "memory(GiB)": 38.13, "reward": 0.4968716502189636, "reward_std": 0.063763327896595, "rewards/VisualizationJSONCombinedORM/mean": 0.4968716502189636, "rewards/VisualizationJSONCombinedORM/std": 0.23758827149868011, "step": 3152, "train_speed(iter/s)": 0.111006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 308.3125, "completions/min_length": 245.0, "epoch": 2.607940446650124, "grad_norm": 0.1938556581735611, "kl": 0.03497314453125, "learning_rate": 5.1138758492081e-07, "loss": 0.000350169837474823, "memory(GiB)": 38.13, "reward": 0.6930018663406372, "reward_std": 0.05657432973384857, "rewards/VisualizationJSONCombinedORM/mean": 0.6930018663406372, "rewards/VisualizationJSONCombinedORM/std": 0.0686739906668663, "step": 3153, "train_speed(iter/s)": 0.11093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 289.4375, "completions/min_length": 211.0, "epoch": 2.608767576509512, "grad_norm": 0.27963873744010925, "kl": 0.07958984375, "learning_rate": 5.092694676181353e-07, "loss": 0.0007954072207212448, "memory(GiB)": 38.13, "reward": 0.503553032875061, "reward_std": 0.06968054920434952, "rewards/VisualizationJSONCombinedORM/mean": 0.503553032875061, "rewards/VisualizationJSONCombinedORM/std": 0.20679879188537598, "step": 3154, "train_speed(iter/s)": 0.11087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 274.75, "completions/min_length": 222.0, "epoch": 2.6095947063689, "grad_norm": 0.18819159269332886, "kl": 0.07025146484375, "learning_rate": 5.071555105365156e-07, "loss": 0.0007024072110652924, "memory(GiB)": 38.13, "reward": 0.33298999071121216, "reward_std": 0.039594873785972595, "rewards/VisualizationJSONCombinedORM/mean": 0.33298999071121216, "rewards/VisualizationJSONCombinedORM/std": 0.14386579394340515, "step": 3155, "train_speed(iter/s)": 0.110804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 340.375, "completions/min_length": 274.0, "epoch": 2.610421836228288, "grad_norm": 0.16062025725841522, "kl": 0.05523681640625, "learning_rate": 5.050457156343225e-07, "loss": 0.0005532540380954742, "memory(GiB)": 38.13, "reward": 0.48218458890914917, "reward_std": 0.05897516757249832, "rewards/VisualizationJSONCombinedORM/mean": 0.48218458890914917, "rewards/VisualizationJSONCombinedORM/std": 0.24960434436798096, "step": 3156, "train_speed(iter/s)": 0.11071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 304.0625, "completions/min_length": 247.0, "epoch": 2.611248966087676, "grad_norm": 0.2698265016078949, "kl": 0.10760498046875, "learning_rate": 5.02940084866076e-07, "loss": 0.0010740160942077637, "memory(GiB)": 38.13, "reward": 0.7116695642471313, "reward_std": 0.11503466963768005, "rewards/VisualizationJSONCombinedORM/mean": 0.7116695642471313, "rewards/VisualizationJSONCombinedORM/std": 0.11754225194454193, "step": 3157, "train_speed(iter/s)": 0.110653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 292.1875, "completions/min_length": 235.0, "epoch": 2.6120760959470637, "grad_norm": 0.18555234372615814, "kl": 0.062255859375, "learning_rate": 5.00838620182435e-07, "loss": 0.0006241239607334137, "memory(GiB)": 38.13, "reward": 0.470488578081131, "reward_std": 0.06902500241994858, "rewards/VisualizationJSONCombinedORM/mean": 0.470488578081131, "rewards/VisualizationJSONCombinedORM/std": 0.14677514135837555, "step": 3158, "train_speed(iter/s)": 0.110584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 319.5625, "completions/min_length": 250.0, "epoch": 2.6129032258064515, "grad_norm": 0.1550668179988861, "kl": 0.07537841796875, "learning_rate": 4.987413235302025e-07, "loss": 0.0007541216909885406, "memory(GiB)": 38.13, "reward": 0.6069591045379639, "reward_std": 0.04527679458260536, "rewards/VisualizationJSONCombinedORM/mean": 0.6069591045379639, "rewards/VisualizationJSONCombinedORM/std": 0.056991226971149445, "step": 3159, "train_speed(iter/s)": 0.110494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 284.375, "completions/min_length": 209.0, "epoch": 2.6137303556658393, "grad_norm": 0.20033690333366394, "kl": 0.0721435546875, "learning_rate": 4.966481968523146e-07, "loss": 0.0007214471697807312, "memory(GiB)": 38.13, "reward": 0.5609099864959717, "reward_std": 0.07394867390394211, "rewards/VisualizationJSONCombinedORM/mean": 0.5609099864959717, "rewards/VisualizationJSONCombinedORM/std": 0.1984703093767166, "step": 3160, "train_speed(iter/s)": 0.110411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 312.8125, "completions/min_length": 238.0, "epoch": 2.6145574855252276, "grad_norm": 0.20050886273384094, "kl": 0.0592041015625, "learning_rate": 4.945592420878509e-07, "loss": 0.0005916357040405273, "memory(GiB)": 38.13, "reward": 0.3511807918548584, "reward_std": 0.037768278270959854, "rewards/VisualizationJSONCombinedORM/mean": 0.3511807918548584, "rewards/VisualizationJSONCombinedORM/std": 0.11528709530830383, "step": 3161, "train_speed(iter/s)": 0.11034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 307.875, "completions/min_length": 233.0, "epoch": 2.6153846153846154, "grad_norm": 0.2454194277524948, "kl": 0.07958984375, "learning_rate": 4.924744611720201e-07, "loss": 0.0007968172430992126, "memory(GiB)": 38.13, "reward": 0.5251984596252441, "reward_std": 0.06700340658426285, "rewards/VisualizationJSONCombinedORM/mean": 0.5251984596252441, "rewards/VisualizationJSONCombinedORM/std": 0.17025107145309448, "step": 3162, "train_speed(iter/s)": 0.110257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 290.6875, "completions/min_length": 221.0, "epoch": 2.6162117452440032, "grad_norm": 0.18718166649341583, "kl": 0.06512451171875, "learning_rate": 4.903938560361698e-07, "loss": 0.0006519034504890442, "memory(GiB)": 38.13, "reward": 0.43051594495773315, "reward_std": 0.059699416160583496, "rewards/VisualizationJSONCombinedORM/mean": 0.43051594495773315, "rewards/VisualizationJSONCombinedORM/std": 0.19870318472385406, "step": 3163, "train_speed(iter/s)": 0.110182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 293.875, "completions/min_length": 246.0, "epoch": 2.6170388751033915, "grad_norm": 0.24826648831367493, "kl": 0.08001708984375, "learning_rate": 4.88317428607774e-07, "loss": 0.0008005201816558838, "memory(GiB)": 38.13, "reward": 0.43520063161849976, "reward_std": 0.060828797519207, "rewards/VisualizationJSONCombinedORM/mean": 0.43520063161849976, "rewards/VisualizationJSONCombinedORM/std": 0.19004341959953308, "step": 3164, "train_speed(iter/s)": 0.110122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 303.4375, "completions/min_length": 244.0, "epoch": 2.617866004962779, "grad_norm": 0.18458153307437897, "kl": 0.0872802734375, "learning_rate": 4.862451808104419e-07, "loss": 0.0008733086287975311, "memory(GiB)": 38.13, "reward": 0.5309579372406006, "reward_std": 0.04595474526286125, "rewards/VisualizationJSONCombinedORM/mean": 0.5309579372406006, "rewards/VisualizationJSONCombinedORM/std": 0.19526277482509613, "step": 3165, "train_speed(iter/s)": 0.11008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 281.1875, "completions/min_length": 217.0, "epoch": 2.618693134822167, "grad_norm": 0.18810009956359863, "kl": 0.1102294921875, "learning_rate": 4.841771145639041e-07, "loss": 0.0011021047830581665, "memory(GiB)": 38.13, "reward": 0.3732120394706726, "reward_std": 0.05401059240102768, "rewards/VisualizationJSONCombinedORM/mean": 0.3732120394706726, "rewards/VisualizationJSONCombinedORM/std": 0.136557936668396, "step": 3166, "train_speed(iter/s)": 0.110017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 320.375, "completions/min_length": 242.0, "epoch": 2.619520264681555, "grad_norm": 0.18416476249694824, "kl": 0.06781005859375, "learning_rate": 4.821132317840233e-07, "loss": 0.0006778063252568245, "memory(GiB)": 38.13, "reward": 0.710582971572876, "reward_std": 0.07938388735055923, "rewards/VisualizationJSONCombinedORM/mean": 0.710582971572876, "rewards/VisualizationJSONCombinedORM/std": 0.08374734967947006, "step": 3167, "train_speed(iter/s)": 0.10995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/mean_length": 228.3125, "completions/min_length": 192.0, "epoch": 2.620347394540943, "grad_norm": 0.19556809961795807, "kl": 0.05072021484375, "learning_rate": 4.800535343827834e-07, "loss": 0.0005077086389064789, "memory(GiB)": 38.13, "reward": 0.3305509686470032, "reward_std": 0.050766170024871826, "rewards/VisualizationJSONCombinedORM/mean": 0.3305509686470032, "rewards/VisualizationJSONCombinedORM/std": 0.049165498465299606, "step": 3168, "train_speed(iter/s)": 0.109895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/mean_length": 337.375, "completions/min_length": 235.0, "epoch": 2.621174524400331, "grad_norm": 0.171487495303154, "kl": 0.04730224609375, "learning_rate": 4.779980242682924e-07, "loss": 0.00047339126467704773, "memory(GiB)": 38.13, "reward": 0.30354243516921997, "reward_std": 0.037561021745204926, "rewards/VisualizationJSONCombinedORM/mean": 0.30354243516921997, "rewards/VisualizationJSONCombinedORM/std": 0.14532622694969177, "step": 3169, "train_speed(iter/s)": 0.109814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 282.5, "completions/min_length": 222.0, "epoch": 2.622001654259719, "grad_norm": 0.19570578634738922, "kl": 0.05322265625, "learning_rate": 4.75946703344779e-07, "loss": 0.0005322396755218506, "memory(GiB)": 38.13, "reward": 0.5092027187347412, "reward_std": 0.03368908539414406, "rewards/VisualizationJSONCombinedORM/mean": 0.5092027187347412, "rewards/VisualizationJSONCombinedORM/std": 0.27064868807792664, "step": 3170, "train_speed(iter/s)": 0.109761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 320.5, "completions/min_length": 262.0, "epoch": 2.6228287841191067, "grad_norm": 0.16648134589195251, "kl": 0.0814208984375, "learning_rate": 4.738995735125895e-07, "loss": 0.0008147452026605606, "memory(GiB)": 38.13, "reward": 0.5000936985015869, "reward_std": 0.0622996911406517, "rewards/VisualizationJSONCombinedORM/mean": 0.5000936985015869, "rewards/VisualizationJSONCombinedORM/std": 0.25437331199645996, "step": 3171, "train_speed(iter/s)": 0.1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 283.125, "completions/min_length": 226.0, "epoch": 2.6236559139784945, "grad_norm": 0.1874229907989502, "kl": 0.03741455078125, "learning_rate": 4.718566366681909e-07, "loss": 0.0003734305500984192, "memory(GiB)": 38.13, "reward": 0.5650964975357056, "reward_std": 0.04956423491239548, "rewards/VisualizationJSONCombinedORM/mean": 0.5650964975357056, "rewards/VisualizationJSONCombinedORM/std": 0.16136975586414337, "step": 3172, "train_speed(iter/s)": 0.109639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 307.5625, "completions/min_length": 236.0, "epoch": 2.6244830438378823, "grad_norm": 0.1966417133808136, "kl": 0.075927734375, "learning_rate": 4.6981789470416093e-07, "loss": 0.000760987401008606, "memory(GiB)": 38.13, "reward": 0.5839055776596069, "reward_std": 0.10444007813930511, "rewards/VisualizationJSONCombinedORM/mean": 0.5839055776596069, "rewards/VisualizationJSONCombinedORM/std": 0.11553798615932465, "step": 3173, "train_speed(iter/s)": 0.109582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 273.3125, "completions/min_length": 224.0, "epoch": 2.6253101736972706, "grad_norm": 0.15883751213550568, "kl": 0.053466796875, "learning_rate": 4.677833495091949e-07, "loss": 0.0005347691476345062, "memory(GiB)": 38.13, "reward": 0.5134568214416504, "reward_std": 0.04906415194272995, "rewards/VisualizationJSONCombinedORM/mean": 0.5134568214416504, "rewards/VisualizationJSONCombinedORM/std": 0.08343064039945602, "step": 3174, "train_speed(iter/s)": 0.109528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 300.25, "completions/min_length": 228.0, "epoch": 2.6261373035566584, "grad_norm": 0.20028424263000488, "kl": 0.07696533203125, "learning_rate": 4.6575300296809956e-07, "loss": 0.00077037513256073, "memory(GiB)": 38.13, "reward": 0.5980852842330933, "reward_std": 0.0649915263056755, "rewards/VisualizationJSONCombinedORM/mean": 0.5980852842330933, "rewards/VisualizationJSONCombinedORM/std": 0.1250184178352356, "step": 3175, "train_speed(iter/s)": 0.109449 }, { "epoch": 2.6261373035566584, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 345.6666666666667, "eval_completions/mean_length": 296.9739583333333, "eval_completions/min_length": 246.91666666666666, "eval_kl": 0.07761637369791667, "eval_loss": 0.000787847675383091, "eval_reward": 0.4470172847310702, "eval_reward_std": 0.056021642948811255, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4470172847310702, "eval_rewards/VisualizationJSONCombinedORM/std": 0.056021644617430866, "eval_runtime": 299.2258, "eval_samples_per_second": 0.08, "eval_steps_per_second": 0.01, "step": 3175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 285.25, "completions/min_length": 236.0, "epoch": 2.6269644334160462, "grad_norm": 0.19519184529781342, "kl": 0.05084228515625, "learning_rate": 4.637268569617931e-07, "loss": 0.0005087126046419144, "memory(GiB)": 38.13, "reward": 0.5306670665740967, "reward_std": 0.065898098051548, "rewards/VisualizationJSONCombinedORM/mean": 0.5306670665740967, "rewards/VisualizationJSONCombinedORM/std": 0.1393120437860489, "step": 3176, "train_speed(iter/s)": 0.108257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 290.0625, "completions/min_length": 224.0, "epoch": 2.6277915632754345, "grad_norm": 0.16862156987190247, "kl": 0.03106689453125, "learning_rate": 4.6170491336729794e-07, "loss": 0.0003108978271484375, "memory(GiB)": 38.13, "reward": 0.6306520104408264, "reward_std": 0.07412895560264587, "rewards/VisualizationJSONCombinedORM/mean": 0.6306520104408264, "rewards/VisualizationJSONCombinedORM/std": 0.1947011798620224, "step": 3177, "train_speed(iter/s)": 0.108178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 286.625, "completions/min_length": 229.0, "epoch": 2.6286186931348223, "grad_norm": 0.17738491296768188, "kl": 0.04034423828125, "learning_rate": 4.596871740577491e-07, "loss": 0.0004041045904159546, "memory(GiB)": 38.13, "reward": 0.7037845849990845, "reward_std": 0.05206640437245369, "rewards/VisualizationJSONCombinedORM/mean": 0.7037845849990845, "rewards/VisualizationJSONCombinedORM/std": 0.15815821290016174, "step": 3178, "train_speed(iter/s)": 0.108102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 335.5625, "completions/min_length": 267.0, "epoch": 2.62944582299421, "grad_norm": 0.1642855405807495, "kl": 0.035675048828125, "learning_rate": 4.576736409023813e-07, "loss": 0.0003568846732378006, "memory(GiB)": 38.13, "reward": 0.7018966674804688, "reward_std": 0.05877673625946045, "rewards/VisualizationJSONCombinedORM/mean": 0.7018966674804688, "rewards/VisualizationJSONCombinedORM/std": 0.13050518929958344, "step": 3179, "train_speed(iter/s)": 0.108026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 290.9375, "completions/min_length": 205.0, "epoch": 2.630272952853598, "grad_norm": 0.21079325675964355, "kl": 0.11865234375, "learning_rate": 4.55664315766538e-07, "loss": 0.0011858418583869934, "memory(GiB)": 38.13, "reward": 0.39457231760025024, "reward_std": 0.06376171112060547, "rewards/VisualizationJSONCombinedORM/mean": 0.39457231760025024, "rewards/VisualizationJSONCombinedORM/std": 0.12083660811185837, "step": 3180, "train_speed(iter/s)": 0.107947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 275.375, "completions/min_length": 216.0, "epoch": 2.631100082712986, "grad_norm": 0.21564848721027374, "kl": 0.0716552734375, "learning_rate": 4.536592005116602e-07, "loss": 0.0007174331694841385, "memory(GiB)": 38.13, "reward": 0.4905630052089691, "reward_std": 0.0313110426068306, "rewards/VisualizationJSONCombinedORM/mean": 0.4905630052089691, "rewards/VisualizationJSONCombinedORM/std": 0.2474047988653183, "step": 3181, "train_speed(iter/s)": 0.107899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 323.9375, "completions/min_length": 245.0, "epoch": 2.631927212572374, "grad_norm": 0.19014419615268707, "kl": 0.1024169921875, "learning_rate": 4.5165829699529153e-07, "loss": 0.0010235123336315155, "memory(GiB)": 38.13, "reward": 0.517219603061676, "reward_std": 0.048051752150058746, "rewards/VisualizationJSONCombinedORM/mean": 0.517219603061676, "rewards/VisualizationJSONCombinedORM/std": 0.18789559602737427, "step": 3182, "train_speed(iter/s)": 0.107828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 316.875, "completions/min_length": 245.0, "epoch": 2.632754342431762, "grad_norm": 0.18517328798770905, "kl": 0.04412841796875, "learning_rate": 4.4966160707107075e-07, "loss": 0.0004418864846229553, "memory(GiB)": 38.13, "reward": 0.5035070776939392, "reward_std": 0.0471094585955143, "rewards/VisualizationJSONCombinedORM/mean": 0.5035070776939392, "rewards/VisualizationJSONCombinedORM/std": 0.20911046862602234, "step": 3183, "train_speed(iter/s)": 0.107763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 316.0625, "completions/min_length": 256.0, "epoch": 2.6335814722911497, "grad_norm": 0.1916736513376236, "kl": 0.0654296875, "learning_rate": 4.4766913258873667e-07, "loss": 0.0006533502601087093, "memory(GiB)": 38.13, "reward": 0.3425102233886719, "reward_std": 0.0361187718808651, "rewards/VisualizationJSONCombinedORM/mean": 0.3425102233886719, "rewards/VisualizationJSONCombinedORM/std": 0.17143654823303223, "step": 3184, "train_speed(iter/s)": 0.107708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 312.6875, "completions/min_length": 253.0, "epoch": 2.6344086021505375, "grad_norm": 0.21346378326416016, "kl": 0.0745849609375, "learning_rate": 4.456808753941205e-07, "loss": 0.0007458552718162537, "memory(GiB)": 38.13, "reward": 0.4769170582294464, "reward_std": 0.04305477440357208, "rewards/VisualizationJSONCombinedORM/mean": 0.4769170582294464, "rewards/VisualizationJSONCombinedORM/std": 0.276140421628952, "step": 3185, "train_speed(iter/s)": 0.10765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 287.625, "completions/min_length": 247.0, "epoch": 2.6352357320099253, "grad_norm": 0.1934676617383957, "kl": 0.07550048828125, "learning_rate": 4.436968373291489e-07, "loss": 0.0007547754794359207, "memory(GiB)": 38.13, "reward": 0.38335615396499634, "reward_std": 0.07754579186439514, "rewards/VisualizationJSONCombinedORM/mean": 0.38335615396499634, "rewards/VisualizationJSONCombinedORM/std": 0.09016095101833344, "step": 3186, "train_speed(iter/s)": 0.107593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 297.1875, "completions/min_length": 235.0, "epoch": 2.6360628618693136, "grad_norm": 0.23323428630828857, "kl": 0.07958984375, "learning_rate": 4.4171702023183663e-07, "loss": 0.0007945038378238678, "memory(GiB)": 38.13, "reward": 0.6744086742401123, "reward_std": 0.10167151689529419, "rewards/VisualizationJSONCombinedORM/mean": 0.6744086742401123, "rewards/VisualizationJSONCombinedORM/std": 0.11886653304100037, "step": 3187, "train_speed(iter/s)": 0.107548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 285.25, "completions/min_length": 214.0, "epoch": 2.6368899917287014, "grad_norm": 0.1895146667957306, "kl": 0.04620361328125, "learning_rate": 4.3974142593629145e-07, "loss": 0.0004620179533958435, "memory(GiB)": 38.13, "reward": 0.28135037422180176, "reward_std": 0.02653953805565834, "rewards/VisualizationJSONCombinedORM/mean": 0.28135037422180176, "rewards/VisualizationJSONCombinedORM/std": 0.09060519933700562, "step": 3188, "train_speed(iter/s)": 0.10749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 300.75, "completions/min_length": 207.0, "epoch": 2.6377171215880892, "grad_norm": 0.17228315770626068, "kl": 0.1248779296875, "learning_rate": 4.377700562727055e-07, "loss": 0.0012496933341026306, "memory(GiB)": 38.13, "reward": 0.2704901695251465, "reward_std": 0.02674027904868126, "rewards/VisualizationJSONCombinedORM/mean": 0.2704901695251465, "rewards/VisualizationJSONCombinedORM/std": 0.07713700085878372, "step": 3189, "train_speed(iter/s)": 0.107419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 300.5625, "completions/min_length": 238.0, "epoch": 2.6385442514474775, "grad_norm": 0.16510042548179626, "kl": 0.027740478515625, "learning_rate": 4.3580291306736025e-07, "loss": 0.0002772025763988495, "memory(GiB)": 38.13, "reward": 0.7568711638450623, "reward_std": 0.06551362574100494, "rewards/VisualizationJSONCombinedORM/mean": 0.7568711638450623, "rewards/VisualizationJSONCombinedORM/std": 0.06408412009477615, "step": 3190, "train_speed(iter/s)": 0.107357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 303.875, "completions/min_length": 242.0, "epoch": 2.6393713813068653, "grad_norm": 0.2815721929073334, "kl": 0.05511474609375, "learning_rate": 4.338399981426211e-07, "loss": 0.000549856573343277, "memory(GiB)": 38.13, "reward": 0.5695202350616455, "reward_std": 0.07953245937824249, "rewards/VisualizationJSONCombinedORM/mean": 0.5695202350616455, "rewards/VisualizationJSONCombinedORM/std": 0.10346034914255142, "step": 3191, "train_speed(iter/s)": 0.107293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 291.3125, "completions/min_length": 236.0, "epoch": 2.640198511166253, "grad_norm": 0.22131195664405823, "kl": 0.1409912109375, "learning_rate": 4.318813133169375e-07, "loss": 0.001411445438861847, "memory(GiB)": 38.13, "reward": 0.348634272813797, "reward_std": 0.041803427040576935, "rewards/VisualizationJSONCombinedORM/mean": 0.348634272813797, "rewards/VisualizationJSONCombinedORM/std": 0.046556275337934494, "step": 3192, "train_speed(iter/s)": 0.107229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 276.625, "completions/min_length": 236.0, "epoch": 2.641025641025641, "grad_norm": 0.2526817321777344, "kl": 0.04022216796875, "learning_rate": 4.2992686040483567e-07, "loss": 0.00040195509791374207, "memory(GiB)": 38.13, "reward": 0.5666803121566772, "reward_std": 0.10455678403377533, "rewards/VisualizationJSONCombinedORM/mean": 0.5666803121566772, "rewards/VisualizationJSONCombinedORM/std": 0.11132820695638657, "step": 3193, "train_speed(iter/s)": 0.107183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 271.875, "completions/min_length": 239.0, "epoch": 2.641852770885029, "grad_norm": 0.20954693853855133, "kl": 0.0517578125, "learning_rate": 4.279766412169267e-07, "loss": 0.000517234206199646, "memory(GiB)": 38.13, "reward": 0.46827930212020874, "reward_std": 0.03973639756441116, "rewards/VisualizationJSONCombinedORM/mean": 0.46827930212020874, "rewards/VisualizationJSONCombinedORM/std": 0.1570650190114975, "step": 3194, "train_speed(iter/s)": 0.107111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 308.125, "completions/min_length": 248.0, "epoch": 2.642679900744417, "grad_norm": 0.21356460452079773, "kl": 0.1195068359375, "learning_rate": 4.2603065755989493e-07, "loss": 0.0011959187686443329, "memory(GiB)": 38.13, "reward": 0.5105799436569214, "reward_std": 0.07859914004802704, "rewards/VisualizationJSONCombinedORM/mean": 0.5105799436569214, "rewards/VisualizationJSONCombinedORM/std": 0.12439204007387161, "step": 3195, "train_speed(iter/s)": 0.107055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 295.6875, "completions/min_length": 240.0, "epoch": 2.643507030603805, "grad_norm": 0.23638515174388885, "kl": 0.063232421875, "learning_rate": 4.24088911236506e-07, "loss": 0.0006325505673885345, "memory(GiB)": 38.13, "reward": 0.5259696841239929, "reward_std": 0.06544092297554016, "rewards/VisualizationJSONCombinedORM/mean": 0.5259696841239929, "rewards/VisualizationJSONCombinedORM/std": 0.1590089052915573, "step": 3196, "train_speed(iter/s)": 0.106999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 303.1875, "completions/min_length": 242.0, "epoch": 2.6443341604631927, "grad_norm": 0.1706983596086502, "kl": 0.02716064453125, "learning_rate": 4.2215140404559596e-07, "loss": 0.00027181580662727356, "memory(GiB)": 38.13, "reward": 0.7647116184234619, "reward_std": 0.07063842564821243, "rewards/VisualizationJSONCombinedORM/mean": 0.7647116184234619, "rewards/VisualizationJSONCombinedORM/std": 0.07254943996667862, "step": 3197, "train_speed(iter/s)": 0.106943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 309.9375, "completions/min_length": 236.0, "epoch": 2.6451612903225805, "grad_norm": 0.2653502821922302, "kl": 0.04901123046875, "learning_rate": 4.202181377820752e-07, "loss": 0.0004913415759801865, "memory(GiB)": 38.13, "reward": 0.5796564817428589, "reward_std": 0.06607158482074738, "rewards/VisualizationJSONCombinedORM/mean": 0.5796564817428589, "rewards/VisualizationJSONCombinedORM/std": 0.1926746666431427, "step": 3198, "train_speed(iter/s)": 0.106903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 268.375, "completions/min_length": 196.0, "epoch": 2.6459884201819683, "grad_norm": 0.1983775943517685, "kl": 0.04559326171875, "learning_rate": 4.182891142369244e-07, "loss": 0.0004558991640806198, "memory(GiB)": 38.13, "reward": 0.5529875755310059, "reward_std": 0.08619581162929535, "rewards/VisualizationJSONCombinedORM/mean": 0.5529875755310059, "rewards/VisualizationJSONCombinedORM/std": 0.09402967244386673, "step": 3199, "train_speed(iter/s)": 0.106849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 276.125, "completions/min_length": 226.0, "epoch": 2.6468155500413566, "grad_norm": 0.1606212556362152, "kl": 0.05889892578125, "learning_rate": 4.163643351971952e-07, "loss": 0.0005899369716644287, "memory(GiB)": 38.13, "reward": 0.6200489401817322, "reward_std": 0.0730161964893341, "rewards/VisualizationJSONCombinedORM/mean": 0.6200489401817322, "rewards/VisualizationJSONCombinedORM/std": 0.10602954030036926, "step": 3200, "train_speed(iter/s)": 0.106783 }, { "epoch": 2.6468155500413566, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 359.5, "eval_completions/mean_length": 298.3020833333333, "eval_completions/min_length": 246.79166666666666, "eval_kl": 0.07316080729166667, "eval_loss": 0.0007350158994086087, "eval_reward": 0.4459740513314803, "eval_reward_std": 0.0579989372442166, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4459740513314803, "eval_rewards/VisualizationJSONCombinedORM/std": 0.057998937632267676, "eval_runtime": 307.1099, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 321.0, "completions/min_length": 246.0, "epoch": 2.6476426799007444, "grad_norm": 0.1598777025938034, "kl": 0.07281494140625, "learning_rate": 4.1444380244600623e-07, "loss": 0.0007294639945030212, "memory(GiB)": 38.13, "reward": 0.5869008302688599, "reward_std": 0.040126025676727295, "rewards/VisualizationJSONCombinedORM/mean": 0.5869008302688599, "rewards/VisualizationJSONCombinedORM/std": 0.23423849046230316, "step": 3201, "train_speed(iter/s)": 0.105641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 323.75, "completions/min_length": 241.0, "epoch": 2.6484698097601322, "grad_norm": 0.2432577759027481, "kl": 0.0513916015625, "learning_rate": 4.1252751776254373e-07, "loss": 0.0005145221948623657, "memory(GiB)": 38.13, "reward": 0.5920838713645935, "reward_std": 0.09823088347911835, "rewards/VisualizationJSONCombinedORM/mean": 0.5920838713645935, "rewards/VisualizationJSONCombinedORM/std": 0.15217582881450653, "step": 3202, "train_speed(iter/s)": 0.10556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 299.5625, "completions/min_length": 242.0, "epoch": 2.6492969396195205, "grad_norm": 0.1804470270872116, "kl": 0.05224609375, "learning_rate": 4.10615482922056e-07, "loss": 0.000521782785654068, "memory(GiB)": 38.13, "reward": 0.4955179989337921, "reward_std": 0.08472633361816406, "rewards/VisualizationJSONCombinedORM/mean": 0.4955179989337921, "rewards/VisualizationJSONCombinedORM/std": 0.23205427825450897, "step": 3203, "train_speed(iter/s)": 0.105492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 308.125, "completions/min_length": 239.0, "epoch": 2.6501240694789083, "grad_norm": 0.1817573606967926, "kl": 0.04669189453125, "learning_rate": 4.087076996958561e-07, "loss": 0.0004673302173614502, "memory(GiB)": 38.13, "reward": 0.448592871427536, "reward_std": 0.03669683262705803, "rewards/VisualizationJSONCombinedORM/mean": 0.448592871427536, "rewards/VisualizationJSONCombinedORM/std": 0.2338217943906784, "step": 3204, "train_speed(iter/s)": 0.105452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 298.1875, "completions/min_length": 219.0, "epoch": 2.650951199338296, "grad_norm": 0.2170349657535553, "kl": 0.0810546875, "learning_rate": 4.0680416985131756e-07, "loss": 0.0008084475994110107, "memory(GiB)": 38.13, "reward": 0.5215505361557007, "reward_std": 0.0755368322134018, "rewards/VisualizationJSONCombinedORM/mean": 0.5215505361557007, "rewards/VisualizationJSONCombinedORM/std": 0.1789918839931488, "step": 3205, "train_speed(iter/s)": 0.105401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 285.625, "completions/min_length": 232.0, "epoch": 2.651778329197684, "grad_norm": 0.18155629932880402, "kl": 0.0665283203125, "learning_rate": 4.049048951518741e-07, "loss": 0.0006638076156377792, "memory(GiB)": 38.13, "reward": 0.5907188057899475, "reward_std": 0.05907343700528145, "rewards/VisualizationJSONCombinedORM/mean": 0.5907188057899475, "rewards/VisualizationJSONCombinedORM/std": 0.07497375458478928, "step": 3206, "train_speed(iter/s)": 0.105351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 327.25, "completions/min_length": 256.0, "epoch": 2.652605459057072, "grad_norm": 0.17500297725200653, "kl": 0.1494140625, "learning_rate": 4.0300987735701733e-07, "loss": 0.001490965485572815, "memory(GiB)": 38.13, "reward": 0.5329685211181641, "reward_std": 0.07184850424528122, "rewards/VisualizationJSONCombinedORM/mean": 0.5329685211181641, "rewards/VisualizationJSONCombinedORM/std": 0.23417790234088898, "step": 3207, "train_speed(iter/s)": 0.105273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 299.625, "completions/min_length": 248.0, "epoch": 2.65343258891646, "grad_norm": 0.19798113405704498, "kl": 0.0850830078125, "learning_rate": 4.0111911822229677e-07, "loss": 0.0008497163653373718, "memory(GiB)": 38.13, "reward": 0.5761144161224365, "reward_std": 0.05122756212949753, "rewards/VisualizationJSONCombinedORM/mean": 0.5761144161224365, "rewards/VisualizationJSONCombinedORM/std": 0.16254302859306335, "step": 3208, "train_speed(iter/s)": 0.105198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 283.8125, "completions/min_length": 211.0, "epoch": 2.654259718775848, "grad_norm": 0.19866685569286346, "kl": 0.0665283203125, "learning_rate": 3.9923261949931235e-07, "loss": 0.0006641820073127747, "memory(GiB)": 38.13, "reward": 0.5922959446907043, "reward_std": 0.10929621756076813, "rewards/VisualizationJSONCombinedORM/mean": 0.5922959446907043, "rewards/VisualizationJSONCombinedORM/std": 0.12020836770534515, "step": 3209, "train_speed(iter/s)": 0.105149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 317.6875, "completions/min_length": 248.0, "epoch": 2.6550868486352357, "grad_norm": 0.19273023307323456, "kl": 0.04168701171875, "learning_rate": 3.973503829357223e-07, "loss": 0.00041716545820236206, "memory(GiB)": 38.13, "reward": 0.5761399865150452, "reward_std": 0.09435036778450012, "rewards/VisualizationJSONCombinedORM/mean": 0.5761399865150452, "rewards/VisualizationJSONCombinedORM/std": 0.1254338026046753, "step": 3210, "train_speed(iter/s)": 0.105068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 307.75, "completions/min_length": 260.0, "epoch": 2.6559139784946235, "grad_norm": 0.16038940846920013, "kl": 0.04132080078125, "learning_rate": 3.9547241027523164e-07, "loss": 0.0004126802086830139, "memory(GiB)": 38.13, "reward": 0.7454861402511597, "reward_std": 0.06561983376741409, "rewards/VisualizationJSONCombinedORM/mean": 0.7454861402511597, "rewards/VisualizationJSONCombinedORM/std": 0.1137072965502739, "step": 3211, "train_speed(iter/s)": 0.105014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 284.25, "completions/min_length": 226.0, "epoch": 2.6567411083540113, "grad_norm": 0.20516622066497803, "kl": 0.07501220703125, "learning_rate": 3.935987032576011e-07, "loss": 0.0007502995431423187, "memory(GiB)": 38.13, "reward": 0.34742408990859985, "reward_std": 0.03103030100464821, "rewards/VisualizationJSONCombinedORM/mean": 0.34742408990859985, "rewards/VisualizationJSONCombinedORM/std": 0.05607995390892029, "step": 3212, "train_speed(iter/s)": 0.104971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 289.0625, "completions/min_length": 237.0, "epoch": 2.6575682382133996, "grad_norm": 0.19186004996299744, "kl": 0.0272216796875, "learning_rate": 3.9172926361863316e-07, "loss": 0.00027308985590934753, "memory(GiB)": 38.13, "reward": 0.7070358991622925, "reward_std": 0.05098046734929085, "rewards/VisualizationJSONCombinedORM/mean": 0.7070358991622925, "rewards/VisualizationJSONCombinedORM/std": 0.15912450850009918, "step": 3213, "train_speed(iter/s)": 0.104899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 293.0625, "completions/min_length": 234.0, "epoch": 2.6583953680727874, "grad_norm": 0.19106286764144897, "kl": 0.10137939453125, "learning_rate": 3.898640930901826e-07, "loss": 0.0010146456770598888, "memory(GiB)": 38.13, "reward": 0.7090173959732056, "reward_std": 0.06741738319396973, "rewards/VisualizationJSONCombinedORM/mean": 0.7090173959732056, "rewards/VisualizationJSONCombinedORM/std": 0.06894169747829437, "step": 3214, "train_speed(iter/s)": 0.104813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 318.1875, "completions/min_length": 269.0, "epoch": 2.6592224979321752, "grad_norm": 0.1626916229724884, "kl": 0.1358642578125, "learning_rate": 3.8800319340014437e-07, "loss": 0.001357339322566986, "memory(GiB)": 38.13, "reward": 0.4707379937171936, "reward_std": 0.07788926362991333, "rewards/VisualizationJSONCombinedORM/mean": 0.4707379937171936, "rewards/VisualizationJSONCombinedORM/std": 0.13855679333209991, "step": 3215, "train_speed(iter/s)": 0.104757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 325.375, "completions/min_length": 230.0, "epoch": 2.6600496277915635, "grad_norm": 0.18915316462516785, "kl": 0.028839111328125, "learning_rate": 3.8614656627246115e-07, "loss": 0.00028884410858154297, "memory(GiB)": 38.13, "reward": 0.43731361627578735, "reward_std": 0.04002955183386803, "rewards/VisualizationJSONCombinedORM/mean": 0.43731361627578735, "rewards/VisualizationJSONCombinedORM/std": 0.1656109243631363, "step": 3216, "train_speed(iter/s)": 0.104687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 309.8125, "completions/min_length": 245.0, "epoch": 2.6608767576509513, "grad_norm": 0.16691748797893524, "kl": 0.08294677734375, "learning_rate": 3.842942134271149e-07, "loss": 0.0008289497345685959, "memory(GiB)": 38.13, "reward": 0.6412924528121948, "reward_std": 0.08760522305965424, "rewards/VisualizationJSONCombinedORM/mean": 0.6412924528121948, "rewards/VisualizationJSONCombinedORM/std": 0.08661772310733795, "step": 3217, "train_speed(iter/s)": 0.104629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 307.75, "completions/min_length": 236.0, "epoch": 2.661703887510339, "grad_norm": 1.0667002201080322, "kl": 0.06060791015625, "learning_rate": 3.8244613658013016e-07, "loss": 0.0006064027547836304, "memory(GiB)": 38.13, "reward": 0.3203190267086029, "reward_std": 0.039404354989528656, "rewards/VisualizationJSONCombinedORM/mean": 0.3203190267086029, "rewards/VisualizationJSONCombinedORM/std": 0.09847547113895416, "step": 3218, "train_speed(iter/s)": 0.104577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 290.25, "completions/min_length": 229.0, "epoch": 2.662531017369727, "grad_norm": 0.19976738095283508, "kl": 0.07611083984375, "learning_rate": 3.8060233744356634e-07, "loss": 0.0007619336247444153, "memory(GiB)": 38.13, "reward": 0.403680682182312, "reward_std": 0.03639810532331467, "rewards/VisualizationJSONCombinedORM/mean": 0.403680682182312, "rewards/VisualizationJSONCombinedORM/std": 0.05858146399259567, "step": 3219, "train_speed(iter/s)": 0.104526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 287.4375, "completions/min_length": 221.0, "epoch": 2.663358147229115, "grad_norm": 0.1788976937532425, "kl": 0.1136474609375, "learning_rate": 3.7876281772552426e-07, "loss": 0.0011352002620697021, "memory(GiB)": 38.13, "reward": 0.7223761677742004, "reward_std": 0.07912571728229523, "rewards/VisualizationJSONCombinedORM/mean": 0.7223761677742004, "rewards/VisualizationJSONCombinedORM/std": 0.07780106365680695, "step": 3220, "train_speed(iter/s)": 0.104471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 279.1875, "completions/min_length": 242.0, "epoch": 2.664185277088503, "grad_norm": 0.1853506714105606, "kl": 0.0552978515625, "learning_rate": 3.76927579130138e-07, "loss": 0.0005512647330760956, "memory(GiB)": 38.13, "reward": 0.580154299736023, "reward_std": 0.07032398134469986, "rewards/VisualizationJSONCombinedORM/mean": 0.580154299736023, "rewards/VisualizationJSONCombinedORM/std": 0.12147166579961777, "step": 3221, "train_speed(iter/s)": 0.104421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 311.9375, "completions/min_length": 244.0, "epoch": 2.665012406947891, "grad_norm": 0.2073521912097931, "kl": 0.024566650390625, "learning_rate": 3.750966233575753e-07, "loss": 0.0002458132803440094, "memory(GiB)": 38.13, "reward": 0.5276422500610352, "reward_std": 0.05459270626306534, "rewards/VisualizationJSONCombinedORM/mean": 0.5276422500610352, "rewards/VisualizationJSONCombinedORM/std": 0.10932637751102448, "step": 3222, "train_speed(iter/s)": 0.104369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 314.0, "completions/min_length": 247.0, "epoch": 2.6658395368072787, "grad_norm": 0.17716491222381592, "kl": 0.0511474609375, "learning_rate": 3.732699521040378e-07, "loss": 0.000510822981595993, "memory(GiB)": 38.13, "reward": 0.5047819018363953, "reward_std": 0.04963008314371109, "rewards/VisualizationJSONCombinedORM/mean": 0.5047819018363953, "rewards/VisualizationJSONCombinedORM/std": 0.1607513129711151, "step": 3223, "train_speed(iter/s)": 0.104318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 310.4375, "completions/min_length": 254.0, "epoch": 2.6666666666666665, "grad_norm": 0.20619240403175354, "kl": 0.0645751953125, "learning_rate": 3.7144756706175624e-07, "loss": 0.0006451969966292381, "memory(GiB)": 38.13, "reward": 0.5945611000061035, "reward_std": 0.09060537815093994, "rewards/VisualizationJSONCombinedORM/mean": 0.5945611000061035, "rewards/VisualizationJSONCombinedORM/std": 0.15275327861309052, "step": 3224, "train_speed(iter/s)": 0.10424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 321.1875, "completions/min_length": 242.0, "epoch": 2.6674937965260543, "grad_norm": 0.17104992270469666, "kl": 0.02398681640625, "learning_rate": 3.696294699189934e-07, "loss": 0.00023872032761573792, "memory(GiB)": 38.13, "reward": 0.5684930086135864, "reward_std": 0.02891339734196663, "rewards/VisualizationJSONCombinedORM/mean": 0.5684930086135864, "rewards/VisualizationJSONCombinedORM/std": 0.029420997947454453, "step": 3225, "train_speed(iter/s)": 0.104186 }, { "epoch": 2.6674937965260543, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 370.8333333333333, "eval_completions/mean_length": 300.8385416666667, "eval_completions/min_length": 249.83333333333334, "eval_kl": 0.072662353515625, "eval_loss": 0.0007236562669277191, "eval_reward": 0.44747393640379113, "eval_reward_std": 0.05811960335510472, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44747393640379113, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05811960451925794, "eval_runtime": 314.363, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 3225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 302.375, "completions/min_length": 237.0, "epoch": 2.6683209263854426, "grad_norm": 0.20494338870048523, "kl": 0.04388427734375, "learning_rate": 3.6781566236003486e-07, "loss": 0.000438883900642395, "memory(GiB)": 38.13, "reward": 0.4625847041606903, "reward_std": 0.04142110422253609, "rewards/VisualizationJSONCombinedORM/mean": 0.4625847041606903, "rewards/VisualizationJSONCombinedORM/std": 0.18422512710094452, "step": 3226, "train_speed(iter/s)": 0.103065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 296.125, "completions/min_length": 235.0, "epoch": 2.6691480562448304, "grad_norm": 0.2371036410331726, "kl": 0.1612548828125, "learning_rate": 3.660061460651981e-07, "loss": 0.0016136430203914642, "memory(GiB)": 38.13, "reward": 0.42559781670570374, "reward_std": 0.09569565951824188, "rewards/VisualizationJSONCombinedORM/mean": 0.42559781670570374, "rewards/VisualizationJSONCombinedORM/std": 0.22496038675308228, "step": 3227, "train_speed(iter/s)": 0.102997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 309.875, "completions/min_length": 252.0, "epoch": 2.6699751861042182, "grad_norm": 0.3975529372692108, "kl": 0.0390625, "learning_rate": 3.642009227108195e-07, "loss": 0.0003907345235347748, "memory(GiB)": 38.13, "reward": 0.4450415372848511, "reward_std": 0.05520034208893776, "rewards/VisualizationJSONCombinedORM/mean": 0.4450415372848511, "rewards/VisualizationJSONCombinedORM/std": 0.07168679684400558, "step": 3228, "train_speed(iter/s)": 0.102936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 272.875, "completions/min_length": 226.0, "epoch": 2.6708023159636065, "grad_norm": 0.18740814924240112, "kl": 0.078125, "learning_rate": 3.623999939692646e-07, "loss": 0.0007836446166038513, "memory(GiB)": 38.13, "reward": 0.40470021963119507, "reward_std": 0.056558310985565186, "rewards/VisualizationJSONCombinedORM/mean": 0.40470021963119507, "rewards/VisualizationJSONCombinedORM/std": 0.054795678704977036, "step": 3229, "train_speed(iter/s)": 0.102877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 280.375, "completions/min_length": 215.0, "epoch": 2.6716294458229943, "grad_norm": 0.18891845643520355, "kl": 0.0941162109375, "learning_rate": 3.606033615089144e-07, "loss": 0.0009407065808773041, "memory(GiB)": 38.13, "reward": 0.5254145264625549, "reward_std": 0.06217721477150917, "rewards/VisualizationJSONCombinedORM/mean": 0.5254145264625549, "rewards/VisualizationJSONCombinedORM/std": 0.1467423141002655, "step": 3230, "train_speed(iter/s)": 0.102819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 269.75, "completions/min_length": 223.0, "epoch": 2.672456575682382, "grad_norm": 0.2069283425807953, "kl": 0.0560302734375, "learning_rate": 3.588110269941747e-07, "loss": 0.0005604848265647888, "memory(GiB)": 38.13, "reward": 0.6442821025848389, "reward_std": 0.12869442999362946, "rewards/VisualizationJSONCombinedORM/mean": 0.6442821025848389, "rewards/VisualizationJSONCombinedORM/std": 0.12882933020591736, "step": 3231, "train_speed(iter/s)": 0.102774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 305.75, "completions/min_length": 241.0, "epoch": 2.67328370554177, "grad_norm": 0.1741241067647934, "kl": 0.052490234375, "learning_rate": 3.570229920854651e-07, "loss": 0.0005243392661213875, "memory(GiB)": 38.13, "reward": 0.5120941400527954, "reward_std": 0.05063605681061745, "rewards/VisualizationJSONCombinedORM/mean": 0.5120941400527954, "rewards/VisualizationJSONCombinedORM/std": 0.24361860752105713, "step": 3232, "train_speed(iter/s)": 0.102721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 286.0625, "completions/min_length": 231.0, "epoch": 2.674110835401158, "grad_norm": 0.26498255133628845, "kl": 0.05706787109375, "learning_rate": 3.5523925843922613e-07, "loss": 0.000570591539144516, "memory(GiB)": 38.13, "reward": 0.5376104116439819, "reward_std": 0.05374962091445923, "rewards/VisualizationJSONCombinedORM/mean": 0.5376104116439819, "rewards/VisualizationJSONCombinedORM/std": 0.218655064702034, "step": 3233, "train_speed(iter/s)": 0.102668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 281.8125, "completions/min_length": 208.0, "epoch": 2.674937965260546, "grad_norm": 0.26000353693962097, "kl": 0.0911865234375, "learning_rate": 3.5345982770791096e-07, "loss": 0.0009119771420955658, "memory(GiB)": 38.13, "reward": 0.6981097459793091, "reward_std": 0.06814134120941162, "rewards/VisualizationJSONCombinedORM/mean": 0.6981097459793091, "rewards/VisualizationJSONCombinedORM/std": 0.10572739690542221, "step": 3234, "train_speed(iter/s)": 0.102598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 317.5, "completions/min_length": 249.0, "epoch": 2.675765095119934, "grad_norm": 0.18263664841651917, "kl": 0.07244873046875, "learning_rate": 3.5168470153998937e-07, "loss": 0.0007248632609844208, "memory(GiB)": 38.13, "reward": 0.5047327876091003, "reward_std": 0.04868294298648834, "rewards/VisualizationJSONCombinedORM/mean": 0.5047327876091003, "rewards/VisualizationJSONCombinedORM/std": 0.09471097588539124, "step": 3235, "train_speed(iter/s)": 0.102523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 265.25, "completions/min_length": 218.0, "epoch": 2.6765922249793217, "grad_norm": 0.22293956577777863, "kl": 0.0592041015625, "learning_rate": 3.4991388157993967e-07, "loss": 0.0005928128957748413, "memory(GiB)": 38.13, "reward": 0.3993476331233978, "reward_std": 0.06772427260875702, "rewards/VisualizationJSONCombinedORM/mean": 0.3993476331233978, "rewards/VisualizationJSONCombinedORM/std": 0.07099957019090652, "step": 3236, "train_speed(iter/s)": 0.102461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 314.5, "completions/min_length": 266.0, "epoch": 2.6774193548387095, "grad_norm": 0.17900951206684113, "kl": 0.0938720703125, "learning_rate": 3.4814736946825357e-07, "loss": 0.0009372523054480553, "memory(GiB)": 38.13, "reward": 0.5575999021530151, "reward_std": 0.07965584099292755, "rewards/VisualizationJSONCombinedORM/mean": 0.5575999021530151, "rewards/VisualizationJSONCombinedORM/std": 0.1059415191411972, "step": 3237, "train_speed(iter/s)": 0.102412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 292.3125, "completions/min_length": 239.0, "epoch": 2.6782464846980973, "grad_norm": 0.33268290758132935, "kl": 0.05145263671875, "learning_rate": 3.463851668414303e-07, "loss": 0.0005146563053131104, "memory(GiB)": 38.13, "reward": 0.6437368392944336, "reward_std": 0.09138615429401398, "rewards/VisualizationJSONCombinedORM/mean": 0.6437368392944336, "rewards/VisualizationJSONCombinedORM/std": 0.10911205410957336, "step": 3238, "train_speed(iter/s)": 0.10235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 316.125, "completions/min_length": 265.0, "epoch": 2.6790736145574856, "grad_norm": 0.1934846192598343, "kl": 0.0404052734375, "learning_rate": 3.4462727533197794e-07, "loss": 0.0004035867750644684, "memory(GiB)": 38.13, "reward": 0.43409886956214905, "reward_std": 0.05905995890498161, "rewards/VisualizationJSONCombinedORM/mean": 0.43409886956214905, "rewards/VisualizationJSONCombinedORM/std": 0.10925620049238205, "step": 3239, "train_speed(iter/s)": 0.10229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 298.4375, "completions/min_length": 229.0, "epoch": 2.6799007444168734, "grad_norm": 0.2112431675195694, "kl": 0.101806640625, "learning_rate": 3.4287369656841095e-07, "loss": 0.0010195523500442505, "memory(GiB)": 38.13, "reward": 0.5998534560203552, "reward_std": 0.08482159674167633, "rewards/VisualizationJSONCombinedORM/mean": 0.5998534560203552, "rewards/VisualizationJSONCombinedORM/std": 0.13136084377765656, "step": 3240, "train_speed(iter/s)": 0.102237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 282.875, "completions/min_length": 233.0, "epoch": 2.6807278742762612, "grad_norm": 0.17984779179096222, "kl": 0.1033935546875, "learning_rate": 3.4112443217524827e-07, "loss": 0.0010299831628799438, "memory(GiB)": 38.13, "reward": 0.5880293250083923, "reward_std": 0.020037392154335976, "rewards/VisualizationJSONCombinedORM/mean": 0.5880293250083923, "rewards/VisualizationJSONCombinedORM/std": 0.20173723995685577, "step": 3241, "train_speed(iter/s)": 0.102181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 291.25, "completions/min_length": 237.0, "epoch": 2.6815550041356495, "grad_norm": 0.18577341735363007, "kl": 0.1005859375, "learning_rate": 3.393794837730102e-07, "loss": 0.0010070186108350754, "memory(GiB)": 38.13, "reward": 0.6956220865249634, "reward_std": 0.07573540508747101, "rewards/VisualizationJSONCombinedORM/mean": 0.6956220865249634, "rewards/VisualizationJSONCombinedORM/std": 0.08839841187000275, "step": 3242, "train_speed(iter/s)": 0.102135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 302.8125, "completions/min_length": 246.0, "epoch": 2.6823821339950373, "grad_norm": 0.2486816942691803, "kl": 0.0931396484375, "learning_rate": 3.3763885297822153e-07, "loss": 0.000929318368434906, "memory(GiB)": 38.13, "reward": 0.6148857474327087, "reward_std": 0.10573609173297882, "rewards/VisualizationJSONCombinedORM/mean": 0.6148857474327087, "rewards/VisualizationJSONCombinedORM/std": 0.10742396116256714, "step": 3243, "train_speed(iter/s)": 0.102082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 279.875, "completions/min_length": 217.0, "epoch": 2.683209263854425, "grad_norm": 0.18506334722042084, "kl": 0.0684814453125, "learning_rate": 3.359025414034045e-07, "loss": 0.0006849616765975952, "memory(GiB)": 38.13, "reward": 0.4093027412891388, "reward_std": 0.05631616339087486, "rewards/VisualizationJSONCombinedORM/mean": 0.4093027412891388, "rewards/VisualizationJSONCombinedORM/std": 0.12799154222011566, "step": 3244, "train_speed(iter/s)": 0.102033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 314.8125, "completions/min_length": 261.0, "epoch": 2.684036393713813, "grad_norm": 0.22294864058494568, "kl": 0.0743408203125, "learning_rate": 3.34170550657083e-07, "loss": 0.0007426328957080841, "memory(GiB)": 38.13, "reward": 0.5619889497756958, "reward_std": 0.05140165984630585, "rewards/VisualizationJSONCombinedORM/mean": 0.5619889497756958, "rewards/VisualizationJSONCombinedORM/std": 0.20901645720005035, "step": 3245, "train_speed(iter/s)": 0.101978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 314.4375, "completions/min_length": 248.0, "epoch": 2.684863523573201, "grad_norm": 0.22900810837745667, "kl": 0.0797119140625, "learning_rate": 3.324428823437753e-07, "loss": 0.0007957704365253448, "memory(GiB)": 38.13, "reward": 0.40546715259552, "reward_std": 0.04058331996202469, "rewards/VisualizationJSONCombinedORM/mean": 0.40546715259552, "rewards/VisualizationJSONCombinedORM/std": 0.16796843707561493, "step": 3246, "train_speed(iter/s)": 0.101911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 295.75, "completions/min_length": 242.0, "epoch": 2.685690653432589, "grad_norm": 0.18583153188228607, "kl": 0.0738525390625, "learning_rate": 3.307195380639977e-07, "loss": 0.0007389858365058899, "memory(GiB)": 38.13, "reward": 0.5990846753120422, "reward_std": 0.08840079605579376, "rewards/VisualizationJSONCombinedORM/mean": 0.5990846753120422, "rewards/VisualizationJSONCombinedORM/std": 0.09628400206565857, "step": 3247, "train_speed(iter/s)": 0.10186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 315.4375, "completions/min_length": 245.0, "epoch": 2.686517783291977, "grad_norm": 0.16538044810295105, "kl": 0.0396728515625, "learning_rate": 3.290005194142576e-07, "loss": 0.00039624422788619995, "memory(GiB)": 38.13, "reward": 0.6939539909362793, "reward_std": 0.08380154520273209, "rewards/VisualizationJSONCombinedORM/mean": 0.6939539909362793, "rewards/VisualizationJSONCombinedORM/std": 0.11098304390907288, "step": 3248, "train_speed(iter/s)": 0.101789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 300.8125, "completions/min_length": 226.0, "epoch": 2.6873449131513647, "grad_norm": 0.15124639868736267, "kl": 0.05010986328125, "learning_rate": 3.272858279870583e-07, "loss": 0.000501241534948349, "memory(GiB)": 38.13, "reward": 0.48525211215019226, "reward_std": 0.049473557621240616, "rewards/VisualizationJSONCombinedORM/mean": 0.48525211215019226, "rewards/VisualizationJSONCombinedORM/std": 0.26414164900779724, "step": 3249, "train_speed(iter/s)": 0.101737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 300.625, "completions/min_length": 237.0, "epoch": 2.688172043010753, "grad_norm": 0.18712377548217773, "kl": 0.15576171875, "learning_rate": 3.255754653708926e-07, "loss": 0.0015559010207653046, "memory(GiB)": 38.13, "reward": 0.5003311634063721, "reward_std": 0.053192608058452606, "rewards/VisualizationJSONCombinedORM/mean": 0.5003311634063721, "rewards/VisualizationJSONCombinedORM/std": 0.08135262131690979, "step": 3250, "train_speed(iter/s)": 0.101701 }, { "epoch": 2.688172043010753, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 364.9166666666667, "eval_completions/mean_length": 299.3645833333333, "eval_completions/min_length": 248.79166666666666, "eval_kl": 0.06787109375, "eval_loss": 0.0006830940837971866, "eval_reward": 0.4480487151692311, "eval_reward_std": 0.06168397896302243, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4480487151692311, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06168398156296462, "eval_runtime": 310.8246, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 310.25, "completions/min_length": 232.0, "epoch": 2.6889991728701403, "grad_norm": 0.18058857321739197, "kl": 0.040771484375, "learning_rate": 3.238694331502451e-07, "loss": 0.0004068128764629364, "memory(GiB)": 38.13, "reward": 0.5349429249763489, "reward_std": 0.053349703550338745, "rewards/VisualizationJSONCombinedORM/mean": 0.5349429249763489, "rewards/VisualizationJSONCombinedORM/std": 0.19015267491340637, "step": 3251, "train_speed(iter/s)": 0.100669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 274.8125, "completions/min_length": 215.0, "epoch": 2.6898263027295286, "grad_norm": 0.2491818070411682, "kl": 0.06817626953125, "learning_rate": 3.22167732905585e-07, "loss": 0.0006835684180259705, "memory(GiB)": 38.13, "reward": 0.6757509708404541, "reward_std": 0.10719242691993713, "rewards/VisualizationJSONCombinedORM/mean": 0.6757509708404541, "rewards/VisualizationJSONCombinedORM/std": 0.11279220134019852, "step": 3252, "train_speed(iter/s)": 0.100607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 316.375, "completions/min_length": 251.0, "epoch": 2.6906534325889164, "grad_norm": 0.17732632160186768, "kl": 0.0621337890625, "learning_rate": 3.204703662133724e-07, "loss": 0.0006205253303050995, "memory(GiB)": 38.13, "reward": 0.6895171403884888, "reward_std": 0.07675245404243469, "rewards/VisualizationJSONCombinedORM/mean": 0.6895171403884888, "rewards/VisualizationJSONCombinedORM/std": 0.1089368611574173, "step": 3253, "train_speed(iter/s)": 0.100568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 297.9375, "completions/min_length": 225.0, "epoch": 2.6914805624483042, "grad_norm": 0.18602913618087769, "kl": 0.0960693359375, "learning_rate": 3.187773346460488e-07, "loss": 0.0009599924087524414, "memory(GiB)": 38.13, "reward": 0.6260416507720947, "reward_std": 0.08167147636413574, "rewards/VisualizationJSONCombinedORM/mean": 0.6260416507720947, "rewards/VisualizationJSONCombinedORM/std": 0.08559151738882065, "step": 3254, "train_speed(iter/s)": 0.100504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 313.0625, "completions/min_length": 237.0, "epoch": 2.6923076923076925, "grad_norm": 0.19478458166122437, "kl": 0.06988525390625, "learning_rate": 3.170886397720435e-07, "loss": 0.0006971359252929688, "memory(GiB)": 38.13, "reward": 0.5458863973617554, "reward_std": 0.07706247270107269, "rewards/VisualizationJSONCombinedORM/mean": 0.5458863973617554, "rewards/VisualizationJSONCombinedORM/std": 0.08720389753580093, "step": 3255, "train_speed(iter/s)": 0.100463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 306.625, "completions/min_length": 209.0, "epoch": 2.6931348221670803, "grad_norm": 0.16302286088466644, "kl": 0.066162109375, "learning_rate": 3.1540428315576635e-07, "loss": 0.0006622076034545898, "memory(GiB)": 38.13, "reward": 0.4154830574989319, "reward_std": 0.038631223142147064, "rewards/VisualizationJSONCombinedORM/mean": 0.4154830574989319, "rewards/VisualizationJSONCombinedORM/std": 0.2279587835073471, "step": 3256, "train_speed(iter/s)": 0.100409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 296.4375, "completions/min_length": 214.0, "epoch": 2.693961952026468, "grad_norm": 0.19574420154094696, "kl": 0.0545654296875, "learning_rate": 3.13724266357609e-07, "loss": 0.000546010211110115, "memory(GiB)": 38.13, "reward": 0.40287578105926514, "reward_std": 0.04824579507112503, "rewards/VisualizationJSONCombinedORM/mean": 0.40287578105926514, "rewards/VisualizationJSONCombinedORM/std": 0.09399320185184479, "step": 3257, "train_speed(iter/s)": 0.100358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 280.8125, "completions/min_length": 218.0, "epoch": 2.694789081885856, "grad_norm": 0.17422455549240112, "kl": 0.041015625, "learning_rate": 3.120485909339399e-07, "loss": 0.0004092678427696228, "memory(GiB)": 38.13, "reward": 0.590228259563446, "reward_std": 0.055169180035591125, "rewards/VisualizationJSONCombinedORM/mean": 0.590228259563446, "rewards/VisualizationJSONCombinedORM/std": 0.1400030255317688, "step": 3258, "train_speed(iter/s)": 0.100301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 311.0625, "completions/min_length": 245.0, "epoch": 2.695616211745244, "grad_norm": 0.35573408007621765, "kl": 0.07318115234375, "learning_rate": 3.103772584371106e-07, "loss": 0.0007326584309339523, "memory(GiB)": 38.13, "reward": 0.33398106694221497, "reward_std": 0.20315784215927124, "rewards/VisualizationJSONCombinedORM/mean": 0.33398106694221497, "rewards/VisualizationJSONCombinedORM/std": 0.2542651891708374, "step": 3259, "train_speed(iter/s)": 0.10025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 284.5625, "completions/min_length": 240.0, "epoch": 2.696443341604632, "grad_norm": 0.15647819638252258, "kl": 0.06182861328125, "learning_rate": 3.0871027041544323e-07, "loss": 0.0006184354424476624, "memory(GiB)": 38.13, "reward": 0.5215403437614441, "reward_std": 0.05129124969244003, "rewards/VisualizationJSONCombinedORM/mean": 0.5215403437614441, "rewards/VisualizationJSONCombinedORM/std": 0.16131997108459473, "step": 3260, "train_speed(iter/s)": 0.100202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 300.8125, "completions/min_length": 251.0, "epoch": 2.69727047146402, "grad_norm": 0.18756358325481415, "kl": 0.04864501953125, "learning_rate": 3.070476284132429e-07, "loss": 0.00048576295375823975, "memory(GiB)": 38.13, "reward": 0.48657816648483276, "reward_std": 0.03350149840116501, "rewards/VisualizationJSONCombinedORM/mean": 0.48657816648483276, "rewards/VisualizationJSONCombinedORM/std": 0.2244441658258438, "step": 3261, "train_speed(iter/s)": 0.100155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 278.75, "completions/min_length": 223.0, "epoch": 2.6980976013234077, "grad_norm": 0.1724112629890442, "kl": 0.01904296875, "learning_rate": 3.053893339707803e-07, "loss": 0.00019120611250400543, "memory(GiB)": 38.13, "reward": 0.6646421551704407, "reward_std": 0.03838717192411423, "rewards/VisualizationJSONCombinedORM/mean": 0.6646421551704407, "rewards/VisualizationJSONCombinedORM/std": 0.04693116620182991, "step": 3262, "train_speed(iter/s)": 0.100103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 300.9375, "completions/min_length": 238.0, "epoch": 2.698924731182796, "grad_norm": 0.15587405860424042, "kl": 0.0521240234375, "learning_rate": 3.037353886243055e-07, "loss": 0.0005215723067522049, "memory(GiB)": 38.13, "reward": 0.7466585636138916, "reward_std": 0.0923130214214325, "rewards/VisualizationJSONCombinedORM/mean": 0.7466585636138916, "rewards/VisualizationJSONCombinedORM/std": 0.10285890102386475, "step": 3263, "train_speed(iter/s)": 0.100044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 321.4375, "completions/min_length": 265.0, "epoch": 2.699751861042184, "grad_norm": 0.2450440227985382, "kl": 0.05645751953125, "learning_rate": 3.02085793906034e-07, "loss": 0.0005646795034408569, "memory(GiB)": 38.13, "reward": 0.6098436713218689, "reward_std": 0.07266692817211151, "rewards/VisualizationJSONCombinedORM/mean": 0.6098436713218689, "rewards/VisualizationJSONCombinedORM/std": 0.14003439247608185, "step": 3264, "train_speed(iter/s)": 0.099999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 302.6875, "completions/min_length": 245.0, "epoch": 2.7005789909015716, "grad_norm": 0.18683844804763794, "kl": 0.059814453125, "learning_rate": 3.004405513441544e-07, "loss": 0.0005977936089038849, "memory(GiB)": 38.13, "reward": 0.7104935646057129, "reward_std": 0.06209982559084892, "rewards/VisualizationJSONCombinedORM/mean": 0.7104935646057129, "rewards/VisualizationJSONCombinedORM/std": 0.07402519881725311, "step": 3265, "train_speed(iter/s)": 0.099947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 324.0, "completions/min_length": 252.0, "epoch": 2.7014061207609594, "grad_norm": 0.18773892521858215, "kl": 0.094970703125, "learning_rate": 2.9879966246282255e-07, "loss": 0.0009523071348667145, "memory(GiB)": 38.13, "reward": 0.6399940252304077, "reward_std": 0.036491263657808304, "rewards/VisualizationJSONCombinedORM/mean": 0.6399940252304077, "rewards/VisualizationJSONCombinedORM/std": 0.23014314472675323, "step": 3266, "train_speed(iter/s)": 0.099897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 302.9375, "completions/min_length": 232.0, "epoch": 2.7022332506203472, "grad_norm": 0.21239668130874634, "kl": 0.04107666015625, "learning_rate": 2.9716312878216194e-07, "loss": 0.0004105567932128906, "memory(GiB)": 38.13, "reward": 0.6960539817810059, "reward_std": 0.07303895056247711, "rewards/VisualizationJSONCombinedORM/mean": 0.6960539817810059, "rewards/VisualizationJSONCombinedORM/std": 0.1262454092502594, "step": 3267, "train_speed(iter/s)": 0.099845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 285.6875, "completions/min_length": 232.0, "epoch": 2.7030603804797355, "grad_norm": 0.21475328505039215, "kl": 0.0263671875, "learning_rate": 2.955309518182586e-07, "loss": 0.0002632737159729004, "memory(GiB)": 38.13, "reward": 0.6760796308517456, "reward_std": 0.08389610797166824, "rewards/VisualizationJSONCombinedORM/mean": 0.6760796308517456, "rewards/VisualizationJSONCombinedORM/std": 0.13359089195728302, "step": 3268, "train_speed(iter/s)": 0.099799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 296.5, "completions/min_length": 212.0, "epoch": 2.7038875103391233, "grad_norm": 0.2463742047548294, "kl": 0.0877685546875, "learning_rate": 2.9390313308316597e-07, "loss": 0.0008762702345848083, "memory(GiB)": 38.13, "reward": 0.4278339147567749, "reward_std": 0.07234906405210495, "rewards/VisualizationJSONCombinedORM/mean": 0.4278339147567749, "rewards/VisualizationJSONCombinedORM/std": 0.0727422833442688, "step": 3269, "train_speed(iter/s)": 0.099728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 284.3125, "completions/min_length": 218.0, "epoch": 2.704714640198511, "grad_norm": 0.17494355142116547, "kl": 0.218994140625, "learning_rate": 2.9227967408489653e-07, "loss": 0.0021977685391902924, "memory(GiB)": 38.13, "reward": 0.6228395700454712, "reward_std": 0.08239811658859253, "rewards/VisualizationJSONCombinedORM/mean": 0.6228395700454712, "rewards/VisualizationJSONCombinedORM/std": 0.08516577631235123, "step": 3270, "train_speed(iter/s)": 0.099691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 306.1875, "completions/min_length": 223.0, "epoch": 2.705541770057899, "grad_norm": 0.1775372475385666, "kl": 0.09423828125, "learning_rate": 2.9066057632742674e-07, "loss": 0.0009443089365959167, "memory(GiB)": 38.13, "reward": 0.48876649141311646, "reward_std": 0.05321631208062172, "rewards/VisualizationJSONCombinedORM/mean": 0.48876649141311646, "rewards/VisualizationJSONCombinedORM/std": 0.0962105244398117, "step": 3271, "train_speed(iter/s)": 0.099648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 284.5625, "completions/min_length": 236.0, "epoch": 2.706368899917287, "grad_norm": 0.20257146656513214, "kl": 0.08447265625, "learning_rate": 2.890458413106911e-07, "loss": 0.0008441656827926636, "memory(GiB)": 38.13, "reward": 0.4882412850856781, "reward_std": 0.1053231731057167, "rewards/VisualizationJSONCombinedORM/mean": 0.4882412850856781, "rewards/VisualizationJSONCombinedORM/std": 0.13655716180801392, "step": 3272, "train_speed(iter/s)": 0.099606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 275.875, "completions/min_length": 222.0, "epoch": 2.707196029776675, "grad_norm": 0.16660583019256592, "kl": 0.056640625, "learning_rate": 2.874354705305843e-07, "loss": 0.0005662515759468079, "memory(GiB)": 38.13, "reward": 0.6508992314338684, "reward_std": 0.07932864129543304, "rewards/VisualizationJSONCombinedORM/mean": 0.6508992314338684, "rewards/VisualizationJSONCombinedORM/std": 0.12423892319202423, "step": 3273, "train_speed(iter/s)": 0.09956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 328.875, "completions/min_length": 237.0, "epoch": 2.708023159636063, "grad_norm": 0.1740492880344391, "kl": 0.0762939453125, "learning_rate": 2.858294654789567e-07, "loss": 0.000764649361371994, "memory(GiB)": 38.13, "reward": 0.36824357509613037, "reward_std": 0.03918972611427307, "rewards/VisualizationJSONCombinedORM/mean": 0.36824357509613037, "rewards/VisualizationJSONCombinedORM/std": 0.10330259799957275, "step": 3274, "train_speed(iter/s)": 0.099503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 317.0, "completions/min_length": 251.0, "epoch": 2.7088502894954507, "grad_norm": 0.18498654663562775, "kl": 0.06231689453125, "learning_rate": 2.842278276436128e-07, "loss": 0.0006229355931282043, "memory(GiB)": 38.13, "reward": 0.21538928151130676, "reward_std": 0.013602402061223984, "rewards/VisualizationJSONCombinedORM/mean": 0.21538928151130676, "rewards/VisualizationJSONCombinedORM/std": 0.060711219906806946, "step": 3275, "train_speed(iter/s)": 0.099452 }, { "epoch": 2.7088502894954507, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 362.4166666666667, "eval_completions/mean_length": 301.671875, "eval_completions/min_length": 251.83333333333334, "eval_kl": 0.061503092447916664, "eval_loss": 0.0006154899601824582, "eval_reward": 0.44451056234538555, "eval_reward_std": 0.052723932972488306, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44451056234538555, "eval_rewards/VisualizationJSONCombinedORM/std": 0.052723934253056846, "eval_runtime": 309.627, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 3275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 341.375, "completions/min_length": 263.0, "epoch": 2.709677419354839, "grad_norm": 0.18736952543258667, "kl": 0.04119873046875, "learning_rate": 2.826305585083144e-07, "loss": 0.00041288137435913086, "memory(GiB)": 38.13, "reward": 0.4547584056854248, "reward_std": 0.0788763016462326, "rewards/VisualizationJSONCombinedORM/mean": 0.4547584056854248, "rewards/VisualizationJSONCombinedORM/std": 0.10599852353334427, "step": 3276, "train_speed(iter/s)": 0.098471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 300.875, "completions/min_length": 250.0, "epoch": 2.710504549214227, "grad_norm": 0.1888294816017151, "kl": 0.0665283203125, "learning_rate": 2.810376595527731e-07, "loss": 0.0006650276482105255, "memory(GiB)": 38.13, "reward": 0.6178898811340332, "reward_std": 0.057536445558071136, "rewards/VisualizationJSONCombinedORM/mean": 0.6178898811340332, "rewards/VisualizationJSONCombinedORM/std": 0.11094502359628677, "step": 3277, "train_speed(iter/s)": 0.098437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 306.6875, "completions/min_length": 218.0, "epoch": 2.7113316790736146, "grad_norm": 0.19227300584316254, "kl": 0.10321044921875, "learning_rate": 2.794491322526555e-07, "loss": 0.0010348409414291382, "memory(GiB)": 38.13, "reward": 0.5800156593322754, "reward_std": 0.09460294246673584, "rewards/VisualizationJSONCombinedORM/mean": 0.5800156593322754, "rewards/VisualizationJSONCombinedORM/std": 0.09251099079847336, "step": 3278, "train_speed(iter/s)": 0.098373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 259.75, "completions/min_length": 205.0, "epoch": 2.7121588089330024, "grad_norm": 0.16117556393146515, "kl": 0.03759765625, "learning_rate": 2.778649780795739e-07, "loss": 0.00037645548582077026, "memory(GiB)": 38.13, "reward": 0.5469969511032104, "reward_std": 0.05088477581739426, "rewards/VisualizationJSONCombinedORM/mean": 0.5469969511032104, "rewards/VisualizationJSONCombinedORM/std": 0.13066360354423523, "step": 3279, "train_speed(iter/s)": 0.098334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 270.1875, "completions/min_length": 218.0, "epoch": 2.7129859387923903, "grad_norm": 0.2303583323955536, "kl": 0.1290283203125, "learning_rate": 2.7628519850109394e-07, "loss": 0.001285947859287262, "memory(GiB)": 38.13, "reward": 0.4624614715576172, "reward_std": 0.07558228820562363, "rewards/VisualizationJSONCombinedORM/mean": 0.4624614715576172, "rewards/VisualizationJSONCombinedORM/std": 0.23356826603412628, "step": 3280, "train_speed(iter/s)": 0.098291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 299.8125, "completions/min_length": 255.0, "epoch": 2.7138130686517785, "grad_norm": 0.23490118980407715, "kl": 0.0731201171875, "learning_rate": 2.747097949807248e-07, "loss": 0.0007303692400455475, "memory(GiB)": 38.13, "reward": 0.6664237380027771, "reward_std": 0.09428945183753967, "rewards/VisualizationJSONCombinedORM/mean": 0.6664237380027771, "rewards/VisualizationJSONCombinedORM/std": 0.10709315538406372, "step": 3281, "train_speed(iter/s)": 0.09825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 306.125, "completions/min_length": 239.0, "epoch": 2.7146401985111663, "grad_norm": 0.14876118302345276, "kl": 0.01690673828125, "learning_rate": 2.7313876897792304e-07, "loss": 0.00016963109374046326, "memory(GiB)": 38.13, "reward": 0.5272717475891113, "reward_std": 0.03230999410152435, "rewards/VisualizationJSONCombinedORM/mean": 0.5272717475891113, "rewards/VisualizationJSONCombinedORM/std": 0.12886396050453186, "step": 3282, "train_speed(iter/s)": 0.098195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 314.6875, "completions/min_length": 236.0, "epoch": 2.715467328370554, "grad_norm": 0.221944659948349, "kl": 0.05743408203125, "learning_rate": 2.71572121948091e-07, "loss": 0.0005740225315093994, "memory(GiB)": 38.13, "reward": 0.3396705389022827, "reward_std": 0.03956633061170578, "rewards/VisualizationJSONCombinedORM/mean": 0.3396705389022827, "rewards/VisualizationJSONCombinedORM/std": 0.18462346494197845, "step": 3283, "train_speed(iter/s)": 0.098148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 339.75, "completions/min_length": 262.0, "epoch": 2.716294458229942, "grad_norm": 0.19783835113048553, "kl": 0.06805419921875, "learning_rate": 2.700098553425734e-07, "loss": 0.0006808936595916748, "memory(GiB)": 38.13, "reward": 0.29748600721359253, "reward_std": 0.025286730378866196, "rewards/VisualizationJSONCombinedORM/mean": 0.29748600721359253, "rewards/VisualizationJSONCombinedORM/std": 0.07031714916229248, "step": 3284, "train_speed(iter/s)": 0.098093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 309.5, "completions/min_length": 264.0, "epoch": 2.71712158808933, "grad_norm": 0.18999244272708893, "kl": 0.0548095703125, "learning_rate": 2.684519706086558e-07, "loss": 0.0005483254790306091, "memory(GiB)": 38.13, "reward": 0.5750454664230347, "reward_std": 0.04600421339273453, "rewards/VisualizationJSONCombinedORM/mean": 0.5750454664230347, "rewards/VisualizationJSONCombinedORM/std": 0.1930190920829773, "step": 3285, "train_speed(iter/s)": 0.098052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 289.6875, "completions/min_length": 246.0, "epoch": 2.717948717948718, "grad_norm": 0.17843611538410187, "kl": 0.07000732421875, "learning_rate": 2.668984691895671e-07, "loss": 0.0007014870643615723, "memory(GiB)": 38.13, "reward": 0.5798457860946655, "reward_std": 0.06159553304314613, "rewards/VisualizationJSONCombinedORM/mean": 0.5798457860946655, "rewards/VisualizationJSONCombinedORM/std": 0.06004045903682709, "step": 3286, "train_speed(iter/s)": 0.098011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 297.9375, "completions/min_length": 245.0, "epoch": 2.718775847808106, "grad_norm": 0.22313864529132843, "kl": 0.0689697265625, "learning_rate": 2.653493525244721e-07, "loss": 0.0006887167692184448, "memory(GiB)": 38.13, "reward": 0.38611698150634766, "reward_std": 0.05938301235437393, "rewards/VisualizationJSONCombinedORM/mean": 0.38611698150634766, "rewards/VisualizationJSONCombinedORM/std": 0.059049174189567566, "step": 3287, "train_speed(iter/s)": 0.097966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 323.9375, "completions/min_length": 239.0, "epoch": 2.7196029776674937, "grad_norm": 0.2229945808649063, "kl": 0.0616455078125, "learning_rate": 2.6380462204847633e-07, "loss": 0.0006169434636831284, "memory(GiB)": 38.13, "reward": 0.45803505182266235, "reward_std": 0.0321570560336113, "rewards/VisualizationJSONCombinedORM/mean": 0.45803505182266235, "rewards/VisualizationJSONCombinedORM/std": 0.04862602800130844, "step": 3288, "train_speed(iter/s)": 0.097929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 315.0, "completions/min_length": 255.0, "epoch": 2.720430107526882, "grad_norm": 0.21944071352481842, "kl": 0.15869140625, "learning_rate": 2.6226427919262056e-07, "loss": 0.0015853866934776306, "memory(GiB)": 38.13, "reward": 0.5736026167869568, "reward_std": 0.10780779272317886, "rewards/VisualizationJSONCombinedORM/mean": 0.5736026167869568, "rewards/VisualizationJSONCombinedORM/std": 0.12595979869365692, "step": 3289, "train_speed(iter/s)": 0.097892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 315.6875, "completions/min_length": 280.0, "epoch": 2.72125723738627, "grad_norm": 0.17606928944587708, "kl": 0.08251953125, "learning_rate": 2.607283253838827e-07, "loss": 0.0008263923227787018, "memory(GiB)": 38.13, "reward": 0.5555660128593445, "reward_std": 0.07543997466564178, "rewards/VisualizationJSONCombinedORM/mean": 0.5555660128593445, "rewards/VisualizationJSONCombinedORM/std": 0.08132211118936539, "step": 3290, "train_speed(iter/s)": 0.097843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 274.1875, "completions/min_length": 238.0, "epoch": 2.7220843672456576, "grad_norm": 0.1931050419807434, "kl": 0.039642333984375, "learning_rate": 2.5919676204517073e-07, "loss": 0.00039625540375709534, "memory(GiB)": 38.13, "reward": 0.7412000298500061, "reward_std": 0.0852668359875679, "rewards/VisualizationJSONCombinedORM/mean": 0.7412000298500061, "rewards/VisualizationJSONCombinedORM/std": 0.09095963835716248, "step": 3291, "train_speed(iter/s)": 0.09781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 295.0625, "completions/min_length": 254.0, "epoch": 2.7229114971050454, "grad_norm": 0.1688903123140335, "kl": 0.0538330078125, "learning_rate": 2.576695905953303e-07, "loss": 0.0005380511283874512, "memory(GiB)": 38.13, "reward": 0.6469970345497131, "reward_std": 0.09313707053661346, "rewards/VisualizationJSONCombinedORM/mean": 0.6469970345497131, "rewards/VisualizationJSONCombinedORM/std": 0.16018058359622955, "step": 3292, "train_speed(iter/s)": 0.097764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 282.375, "completions/min_length": 206.0, "epoch": 2.7237386269644333, "grad_norm": 0.23784907162189484, "kl": 0.062255859375, "learning_rate": 2.5614681244913287e-07, "loss": 0.0006233286112546921, "memory(GiB)": 38.13, "reward": 0.33300715684890747, "reward_std": 0.05711222440004349, "rewards/VisualizationJSONCombinedORM/mean": 0.33300715684890747, "rewards/VisualizationJSONCombinedORM/std": 0.18418774008750916, "step": 3293, "train_speed(iter/s)": 0.097712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 296.25, "completions/min_length": 214.0, "epoch": 2.7245657568238215, "grad_norm": 0.2032589316368103, "kl": 0.032318115234375, "learning_rate": 2.546284290172862e-07, "loss": 0.0003234557807445526, "memory(GiB)": 38.13, "reward": 0.4490510821342468, "reward_std": 0.03906489908695221, "rewards/VisualizationJSONCombinedORM/mean": 0.4490510821342468, "rewards/VisualizationJSONCombinedORM/std": 0.06078719347715378, "step": 3294, "train_speed(iter/s)": 0.09767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 285.1875, "completions/min_length": 199.0, "epoch": 2.7253928866832093, "grad_norm": 0.21752770245075226, "kl": 0.052734375, "learning_rate": 2.531144417064213e-07, "loss": 0.0005273036658763885, "memory(GiB)": 38.13, "reward": 0.43743014335632324, "reward_std": 0.06474483013153076, "rewards/VisualizationJSONCombinedORM/mean": 0.43743014335632324, "rewards/VisualizationJSONCombinedORM/std": 0.1726827472448349, "step": 3295, "train_speed(iter/s)": 0.097627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 281.0, "completions/min_length": 227.0, "epoch": 2.726220016542597, "grad_norm": 0.16840116679668427, "kl": 0.1083984375, "learning_rate": 2.516048519191e-07, "loss": 0.00108279287815094, "memory(GiB)": 38.13, "reward": 0.6100339889526367, "reward_std": 0.03720260411500931, "rewards/VisualizationJSONCombinedORM/mean": 0.6100339889526367, "rewards/VisualizationJSONCombinedORM/std": 0.12650588154792786, "step": 3296, "train_speed(iter/s)": 0.097582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 315.25, "completions/min_length": 258.0, "epoch": 2.727047146401985, "grad_norm": 0.18828172981739044, "kl": 0.09033203125, "learning_rate": 2.500996610538081e-07, "loss": 0.0009035170078277588, "memory(GiB)": 38.13, "reward": 0.40394327044487, "reward_std": 0.03765380010008812, "rewards/VisualizationJSONCombinedORM/mean": 0.40394327044487, "rewards/VisualizationJSONCombinedORM/std": 0.0697849914431572, "step": 3297, "train_speed(iter/s)": 0.097536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 297.4375, "completions/min_length": 218.0, "epoch": 2.727874276261373, "grad_norm": 0.18674179911613464, "kl": 0.03564453125, "learning_rate": 2.4859887050495744e-07, "loss": 0.0003576129674911499, "memory(GiB)": 38.13, "reward": 0.6248700022697449, "reward_std": 0.05891593173146248, "rewards/VisualizationJSONCombinedORM/mean": 0.6248700022697449, "rewards/VisualizationJSONCombinedORM/std": 0.07274750620126724, "step": 3298, "train_speed(iter/s)": 0.097486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 283.8125, "completions/min_length": 229.0, "epoch": 2.728701406120761, "grad_norm": 0.14411389827728271, "kl": 0.048095703125, "learning_rate": 2.471024816628836e-07, "loss": 0.0004802828188985586, "memory(GiB)": 38.13, "reward": 0.621558666229248, "reward_std": 0.027199899777770042, "rewards/VisualizationJSONCombinedORM/mean": 0.621558666229248, "rewards/VisualizationJSONCombinedORM/std": 0.22258979082107544, "step": 3299, "train_speed(iter/s)": 0.097444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 289.125, "completions/min_length": 245.0, "epoch": 2.729528535980149, "grad_norm": 0.1642017960548401, "kl": 0.10113525390625, "learning_rate": 2.4561049591384387e-07, "loss": 0.0010120868682861328, "memory(GiB)": 38.13, "reward": 0.5994800329208374, "reward_std": 0.07560849189758301, "rewards/VisualizationJSONCombinedORM/mean": 0.5994800329208374, "rewards/VisualizationJSONCombinedORM/std": 0.190145343542099, "step": 3300, "train_speed(iter/s)": 0.097403 }, { "epoch": 2.729528535980149, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 367.75, "eval_completions/mean_length": 301.1614583333333, "eval_completions/min_length": 248.625, "eval_kl": 0.07698567708333333, "eval_loss": 0.0007790771196596324, "eval_reward": 0.446925454462568, "eval_reward_std": 0.058224566436062254, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.446925454462568, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05822456542712947, "eval_runtime": 312.4418, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 328.625, "completions/min_length": 246.0, "epoch": 2.7303556658395367, "grad_norm": 0.19121715426445007, "kl": 0.0450439453125, "learning_rate": 2.441229146400165e-07, "loss": 0.0004490464925765991, "memory(GiB)": 38.13, "reward": 0.570746123790741, "reward_std": 0.06395786255598068, "rewards/VisualizationJSONCombinedORM/mean": 0.570746123790741, "rewards/VisualizationJSONCombinedORM/std": 0.08437618613243103, "step": 3301, "train_speed(iter/s)": 0.096466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 337.4375, "completions/min_length": 234.0, "epoch": 2.731182795698925, "grad_norm": 0.2055419534444809, "kl": 0.0888671875, "learning_rate": 2.4263973921949955e-07, "loss": 0.000888538546860218, "memory(GiB)": 38.13, "reward": 0.6134414076805115, "reward_std": 0.13010317087173462, "rewards/VisualizationJSONCombinedORM/mean": 0.6134414076805115, "rewards/VisualizationJSONCombinedORM/std": 0.13450591266155243, "step": 3302, "train_speed(iter/s)": 0.09642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 294.9375, "completions/min_length": 249.0, "epoch": 2.732009925558313, "grad_norm": 0.16389018297195435, "kl": 0.13543701171875, "learning_rate": 2.411609710263091e-07, "loss": 0.0013528279960155487, "memory(GiB)": 38.13, "reward": 0.553177535533905, "reward_std": 0.07639618217945099, "rewards/VisualizationJSONCombinedORM/mean": 0.553177535533905, "rewards/VisualizationJSONCombinedORM/std": 0.18578647077083588, "step": 3303, "train_speed(iter/s)": 0.096379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 273.375, "completions/min_length": 244.0, "epoch": 2.7328370554177006, "grad_norm": 0.33247044682502747, "kl": 0.2412109375, "learning_rate": 2.3968661143037864e-07, "loss": 0.0024116113781929016, "memory(GiB)": 38.13, "reward": 0.6787890195846558, "reward_std": 0.11942148953676224, "rewards/VisualizationJSONCombinedORM/mean": 0.6787890195846558, "rewards/VisualizationJSONCombinedORM/std": 0.12740112841129303, "step": 3304, "train_speed(iter/s)": 0.096345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 301.25, "completions/min_length": 249.0, "epoch": 2.7336641852770884, "grad_norm": 0.19157910346984863, "kl": 0.101806640625, "learning_rate": 2.3821666179755842e-07, "loss": 0.0010215099900960922, "memory(GiB)": 38.13, "reward": 0.557777464389801, "reward_std": 0.0772411972284317, "rewards/VisualizationJSONCombinedORM/mean": 0.557777464389801, "rewards/VisualizationJSONCombinedORM/std": 0.09168898314237595, "step": 3305, "train_speed(iter/s)": 0.096303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 323.625, "completions/min_length": 217.0, "epoch": 2.7344913151364763, "grad_norm": 0.1861114501953125, "kl": 0.0657958984375, "learning_rate": 2.367511234896125e-07, "loss": 0.0006586611270904541, "memory(GiB)": 38.13, "reward": 0.33694344758987427, "reward_std": 0.043485820293426514, "rewards/VisualizationJSONCombinedORM/mean": 0.33694344758987427, "rewards/VisualizationJSONCombinedORM/std": 0.15200670063495636, "step": 3306, "train_speed(iter/s)": 0.096259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 334.9375, "completions/min_length": 255.0, "epoch": 2.7353184449958645, "grad_norm": 0.18842676281929016, "kl": 0.048828125, "learning_rate": 2.3528999786421758e-07, "loss": 0.0004888996481895447, "memory(GiB)": 38.13, "reward": 0.705848217010498, "reward_std": 0.06140484660863876, "rewards/VisualizationJSONCombinedORM/mean": 0.705848217010498, "rewards/VisualizationJSONCombinedORM/std": 0.06193310394883156, "step": 3307, "train_speed(iter/s)": 0.096208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 271.25, "completions/min_length": 213.0, "epoch": 2.7361455748552523, "grad_norm": 0.1612345427274704, "kl": 0.07855224609375, "learning_rate": 2.338332862749637e-07, "loss": 0.0007851123809814453, "memory(GiB)": 38.13, "reward": 0.3840147852897644, "reward_std": 0.03260280191898346, "rewards/VisualizationJSONCombinedORM/mean": 0.3840147852897644, "rewards/VisualizationJSONCombinedORM/std": 0.17745059728622437, "step": 3308, "train_speed(iter/s)": 0.09617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 313.25, "completions/min_length": 244.0, "epoch": 2.73697270471464, "grad_norm": 0.19297948479652405, "kl": 0.084228515625, "learning_rate": 2.3238099007134973e-07, "loss": 0.0008430331945419312, "memory(GiB)": 38.13, "reward": 0.23123657703399658, "reward_std": 0.055089063942432404, "rewards/VisualizationJSONCombinedORM/mean": 0.23123657703399658, "rewards/VisualizationJSONCombinedORM/std": 0.06458230316638947, "step": 3309, "train_speed(iter/s)": 0.096123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 252.25, "completions/min_length": 219.0, "epoch": 2.737799834574028, "grad_norm": 0.227304607629776, "kl": 0.06829833984375, "learning_rate": 2.3093311059878776e-07, "loss": 0.0006837248802185059, "memory(GiB)": 38.13, "reward": 0.4298776686191559, "reward_std": 0.0614679753780365, "rewards/VisualizationJSONCombinedORM/mean": 0.4298776686191559, "rewards/VisualizationJSONCombinedORM/std": 0.0794593021273613, "step": 3310, "train_speed(iter/s)": 0.096088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 289.9375, "completions/min_length": 220.0, "epoch": 2.738626964433416, "grad_norm": 0.23035499453544617, "kl": 0.0548095703125, "learning_rate": 2.2948964919859428e-07, "loss": 0.0005478914827108383, "memory(GiB)": 38.13, "reward": 0.6364108324050903, "reward_std": 0.08725062012672424, "rewards/VisualizationJSONCombinedORM/mean": 0.6364108324050903, "rewards/VisualizationJSONCombinedORM/std": 0.08746396005153656, "step": 3311, "train_speed(iter/s)": 0.096056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 329.0, "completions/min_length": 247.0, "epoch": 2.739454094292804, "grad_norm": 0.16501855850219727, "kl": 0.0311279296875, "learning_rate": 2.280506072079963e-07, "loss": 0.0003125816583633423, "memory(GiB)": 38.13, "reward": 0.3812333941459656, "reward_std": 0.031752683222293854, "rewards/VisualizationJSONCombinedORM/mean": 0.3812333941459656, "rewards/VisualizationJSONCombinedORM/std": 0.06549357622861862, "step": 3312, "train_speed(iter/s)": 0.096012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 288.375, "completions/min_length": 236.0, "epoch": 2.740281224152192, "grad_norm": 0.20479315519332886, "kl": 0.04168701171875, "learning_rate": 2.26615985960123e-07, "loss": 0.0004167407751083374, "memory(GiB)": 38.13, "reward": 0.49142464995384216, "reward_std": 0.05944029614329338, "rewards/VisualizationJSONCombinedORM/mean": 0.49142464995384216, "rewards/VisualizationJSONCombinedORM/std": 0.10998452454805374, "step": 3313, "train_speed(iter/s)": 0.095966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 294.5625, "completions/min_length": 214.0, "epoch": 2.7411083540115797, "grad_norm": 0.21046727895736694, "kl": 0.0562744140625, "learning_rate": 2.2518578678401127e-07, "loss": 0.000562373548746109, "memory(GiB)": 38.13, "reward": 0.4941924512386322, "reward_std": 0.05309073626995087, "rewards/VisualizationJSONCombinedORM/mean": 0.4941924512386322, "rewards/VisualizationJSONCombinedORM/std": 0.26041802763938904, "step": 3314, "train_speed(iter/s)": 0.095919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 280.1875, "completions/min_length": 231.0, "epoch": 2.741935483870968, "grad_norm": 0.15067976713180542, "kl": 0.066864013671875, "learning_rate": 2.237600110046001e-07, "loss": 0.000665660947561264, "memory(GiB)": 38.13, "reward": 0.6921858787536621, "reward_std": 0.06224878132343292, "rewards/VisualizationJSONCombinedORM/mean": 0.6921858787536621, "rewards/VisualizationJSONCombinedORM/std": 0.1017376109957695, "step": 3315, "train_speed(iter/s)": 0.09588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 293.5, "completions/min_length": 254.0, "epoch": 2.742762613730356, "grad_norm": 0.1882510781288147, "kl": 0.06231689453125, "learning_rate": 2.2233865994273128e-07, "loss": 0.0006241202354431152, "memory(GiB)": 38.13, "reward": 0.5177081227302551, "reward_std": 0.08125287294387817, "rewards/VisualizationJSONCombinedORM/mean": 0.5177081227302551, "rewards/VisualizationJSONCombinedORM/std": 0.13282592594623566, "step": 3316, "train_speed(iter/s)": 0.095853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 300.625, "completions/min_length": 229.0, "epoch": 2.7435897435897436, "grad_norm": 0.20202916860580444, "kl": 0.0799560546875, "learning_rate": 2.20921734915146e-07, "loss": 0.0008010156452655792, "memory(GiB)": 38.13, "reward": 0.6542522311210632, "reward_std": 0.08650445938110352, "rewards/VisualizationJSONCombinedORM/mean": 0.6542522311210632, "rewards/VisualizationJSONCombinedORM/std": 0.08788402378559113, "step": 3317, "train_speed(iter/s)": 0.095808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 283.8125, "completions/min_length": 231.0, "epoch": 2.7444168734491314, "grad_norm": 0.19780518114566803, "kl": 0.1075439453125, "learning_rate": 2.1950923723448704e-07, "loss": 0.0010760799050331116, "memory(GiB)": 38.13, "reward": 0.5615853667259216, "reward_std": 0.09077946841716766, "rewards/VisualizationJSONCombinedORM/mean": 0.5615853667259216, "rewards/VisualizationJSONCombinedORM/std": 0.09687866270542145, "step": 3318, "train_speed(iter/s)": 0.095769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 312.375, "completions/min_length": 260.0, "epoch": 2.7452440033085193, "grad_norm": 0.20208390057086945, "kl": 0.0423583984375, "learning_rate": 2.1810116820929427e-07, "loss": 0.0004235506057739258, "memory(GiB)": 38.13, "reward": 0.6293325424194336, "reward_std": 0.12455224990844727, "rewards/VisualizationJSONCombinedORM/mean": 0.6293325424194336, "rewards/VisualizationJSONCombinedORM/std": 0.17791178822517395, "step": 3319, "train_speed(iter/s)": 0.095705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 302.25, "completions/min_length": 240.0, "epoch": 2.7460711331679075, "grad_norm": 0.19592565298080444, "kl": 0.0869140625, "learning_rate": 2.16697529144006e-07, "loss": 0.0008687605150043964, "memory(GiB)": 38.13, "reward": 0.5285458564758301, "reward_std": 0.0730317160487175, "rewards/VisualizationJSONCombinedORM/mean": 0.5285458564758301, "rewards/VisualizationJSONCombinedORM/std": 0.18585990369319916, "step": 3320, "train_speed(iter/s)": 0.095648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 287.8125, "completions/min_length": 241.0, "epoch": 2.7468982630272953, "grad_norm": 0.21206314861774445, "kl": 0.06768798828125, "learning_rate": 2.152983213389559e-07, "loss": 0.0006773136556148529, "memory(GiB)": 38.13, "reward": 0.6443787813186646, "reward_std": 0.08197490125894547, "rewards/VisualizationJSONCombinedORM/mean": 0.6443787813186646, "rewards/VisualizationJSONCombinedORM/std": 0.17349347472190857, "step": 3321, "train_speed(iter/s)": 0.095619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 288.0, "completions/min_length": 233.0, "epoch": 2.747725392886683, "grad_norm": 0.21812279522418976, "kl": 0.049774169921875, "learning_rate": 2.1390354609037212e-07, "loss": 0.000497967004776001, "memory(GiB)": 38.13, "reward": 0.5081956386566162, "reward_std": 0.062168218195438385, "rewards/VisualizationJSONCombinedORM/mean": 0.5081956386566162, "rewards/VisualizationJSONCombinedORM/std": 0.0680236965417862, "step": 3322, "train_speed(iter/s)": 0.095582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 269.4375, "completions/min_length": 225.0, "epoch": 2.748552522746071, "grad_norm": 0.1806909441947937, "kl": 0.1322021484375, "learning_rate": 2.1251320469037827e-07, "loss": 0.0013204514980316162, "memory(GiB)": 38.13, "reward": 0.37049341201782227, "reward_std": 0.052145663648843765, "rewards/VisualizationJSONCombinedORM/mean": 0.37049341201782227, "rewards/VisualizationJSONCombinedORM/std": 0.0523763969540596, "step": 3323, "train_speed(iter/s)": 0.095552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 304.375, "completions/min_length": 247.0, "epoch": 2.749379652605459, "grad_norm": 0.19693922996520996, "kl": 0.173828125, "learning_rate": 2.11127298426988e-07, "loss": 0.0017390362918376923, "memory(GiB)": 38.13, "reward": 0.32648003101348877, "reward_std": 0.0515969879925251, "rewards/VisualizationJSONCombinedORM/mean": 0.32648003101348877, "rewards/VisualizationJSONCombinedORM/std": 0.15136025846004486, "step": 3324, "train_speed(iter/s)": 0.095503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 294.1875, "completions/min_length": 219.0, "epoch": 2.750206782464847, "grad_norm": 0.20000529289245605, "kl": 0.05218505859375, "learning_rate": 2.0974582858410809e-07, "loss": 0.000521710142493248, "memory(GiB)": 38.13, "reward": 0.47474032640457153, "reward_std": 0.053403325378894806, "rewards/VisualizationJSONCombinedORM/mean": 0.47474032640457153, "rewards/VisualizationJSONCombinedORM/std": 0.07661307603120804, "step": 3325, "train_speed(iter/s)": 0.095461 }, { "epoch": 2.750206782464847, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 352.4583333333333, "eval_completions/mean_length": 297.9010416666667, "eval_completions/min_length": 250.83333333333334, "eval_kl": 0.06401570638020833, "eval_loss": 0.0006445161998271942, "eval_reward": 0.4433383972694476, "eval_reward_std": 0.05013590100376556, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4433383972694476, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05013590317685157, "eval_runtime": 304.0946, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 3325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 276.0, "completions/min_length": 239.0, "epoch": 2.751033912324235, "grad_norm": 0.21734142303466797, "kl": 0.0999755859375, "learning_rate": 2.0836879644153374e-07, "loss": 0.000997886061668396, "memory(GiB)": 38.13, "reward": 0.4280523359775543, "reward_std": 0.06603989005088806, "rewards/VisualizationJSONCombinedORM/mean": 0.4280523359775543, "rewards/VisualizationJSONCombinedORM/std": 0.1018206849694252, "step": 3326, "train_speed(iter/s)": 0.094589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 298.6875, "completions/min_length": 231.0, "epoch": 2.7518610421836227, "grad_norm": 0.187435120344162, "kl": 0.06817626953125, "learning_rate": 2.0699620327495174e-07, "loss": 0.0006812326610088348, "memory(GiB)": 38.13, "reward": 0.5454742908477783, "reward_std": 0.0867307111620903, "rewards/VisualizationJSONCombinedORM/mean": 0.5454742908477783, "rewards/VisualizationJSONCombinedORM/std": 0.08627782016992569, "step": 3327, "train_speed(iter/s)": 0.094542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 308.5625, "completions/min_length": 234.0, "epoch": 2.752688172043011, "grad_norm": 0.22945533692836761, "kl": 0.0948486328125, "learning_rate": 2.0562805035593324e-07, "loss": 0.0009499061852693558, "memory(GiB)": 38.13, "reward": 0.37347888946533203, "reward_std": 0.04344407469034195, "rewards/VisualizationJSONCombinedORM/mean": 0.37347888946533203, "rewards/VisualizationJSONCombinedORM/std": 0.12081317603588104, "step": 3328, "train_speed(iter/s)": 0.094488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 311.25, "completions/min_length": 243.0, "epoch": 2.753515301902399, "grad_norm": 0.19023452699184418, "kl": 0.06585693359375, "learning_rate": 2.0426433895193942e-07, "loss": 0.0006608814001083374, "memory(GiB)": 38.13, "reward": 0.5686637163162231, "reward_std": 0.10765038430690765, "rewards/VisualizationJSONCombinedORM/mean": 0.5686637163162231, "rewards/VisualizationJSONCombinedORM/std": 0.16298441588878632, "step": 3329, "train_speed(iter/s)": 0.094433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 284.5, "completions/min_length": 231.0, "epoch": 2.7543424317617866, "grad_norm": 0.2163543552160263, "kl": 0.0665283203125, "learning_rate": 2.0290507032631356e-07, "loss": 0.0006650611758232117, "memory(GiB)": 38.13, "reward": 0.6412428617477417, "reward_std": 0.07987348735332489, "rewards/VisualizationJSONCombinedORM/mean": 0.6412428617477417, "rewards/VisualizationJSONCombinedORM/std": 0.10216008871793747, "step": 3330, "train_speed(iter/s)": 0.094394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 321.875, "completions/min_length": 261.0, "epoch": 2.7551695616211744, "grad_norm": 0.1714138686656952, "kl": 0.05548095703125, "learning_rate": 2.0155024573828452e-07, "loss": 0.0005554687231779099, "memory(GiB)": 38.13, "reward": 0.5990707874298096, "reward_std": 0.06414787471294403, "rewards/VisualizationJSONCombinedORM/mean": 0.5990707874298096, "rewards/VisualizationJSONCombinedORM/std": 0.07165737450122833, "step": 3331, "train_speed(iter/s)": 0.094351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 303.625, "completions/min_length": 246.0, "epoch": 2.7559966914805623, "grad_norm": 0.19497551023960114, "kl": 0.0443115234375, "learning_rate": 2.001998664429655e-07, "loss": 0.00044374167919158936, "memory(GiB)": 38.13, "reward": 0.677254319190979, "reward_std": 0.06736963987350464, "rewards/VisualizationJSONCombinedORM/mean": 0.677254319190979, "rewards/VisualizationJSONCombinedORM/std": 0.1127575933933258, "step": 3332, "train_speed(iter/s)": 0.094306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 324.5, "completions/min_length": 233.0, "epoch": 2.7568238213399505, "grad_norm": 0.16860774159431458, "kl": 0.077392578125, "learning_rate": 1.9885393369134976e-07, "loss": 0.0007739141583442688, "memory(GiB)": 38.13, "reward": 0.5613828301429749, "reward_std": 0.07425356656312943, "rewards/VisualizationJSONCombinedORM/mean": 0.5613828301429749, "rewards/VisualizationJSONCombinedORM/std": 0.17505618929862976, "step": 3333, "train_speed(iter/s)": 0.094265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 293.875, "completions/min_length": 249.0, "epoch": 2.7576509511993383, "grad_norm": 0.1970079094171524, "kl": 0.04559326171875, "learning_rate": 1.975124487303115e-07, "loss": 0.00045581161975860596, "memory(GiB)": 38.13, "reward": 0.46614593267440796, "reward_std": 0.07966753840446472, "rewards/VisualizationJSONCombinedORM/mean": 0.46614593267440796, "rewards/VisualizationJSONCombinedORM/std": 0.08648977428674698, "step": 3334, "train_speed(iter/s)": 0.094222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 296.25, "completions/min_length": 250.0, "epoch": 2.758478081058726, "grad_norm": 0.2523377537727356, "kl": 0.06024169921875, "learning_rate": 1.961754128026061e-07, "loss": 0.0006035789847373962, "memory(GiB)": 38.13, "reward": 0.3772536516189575, "reward_std": 0.05979897081851959, "rewards/VisualizationJSONCombinedORM/mean": 0.3772536516189575, "rewards/VisualizationJSONCombinedORM/std": 0.0694354698061943, "step": 3335, "train_speed(iter/s)": 0.094184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 297.0625, "completions/min_length": 251.0, "epoch": 2.759305210918114, "grad_norm": 0.1991387903690338, "kl": 0.219970703125, "learning_rate": 1.9484282714686442e-07, "loss": 0.0022065602242946625, "memory(GiB)": 38.13, "reward": 0.6168447136878967, "reward_std": 0.035520270466804504, "rewards/VisualizationJSONCombinedORM/mean": 0.6168447136878967, "rewards/VisualizationJSONCombinedORM/std": 0.2222948670387268, "step": 3336, "train_speed(iter/s)": 0.09414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 276.625, "completions/min_length": 223.0, "epoch": 2.760132340777502, "grad_norm": 0.27552956342697144, "kl": 0.130126953125, "learning_rate": 1.9351469299759728e-07, "loss": 0.0013000816106796265, "memory(GiB)": 38.13, "reward": 0.4146259129047394, "reward_std": 0.05228269100189209, "rewards/VisualizationJSONCombinedORM/mean": 0.4146259129047394, "rewards/VisualizationJSONCombinedORM/std": 0.09649377316236496, "step": 3337, "train_speed(iter/s)": 0.094101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 295.625, "completions/min_length": 251.0, "epoch": 2.76095947063689, "grad_norm": 0.2342071831226349, "kl": 0.0921630859375, "learning_rate": 1.9219101158518993e-07, "loss": 0.0009219422936439514, "memory(GiB)": 38.13, "reward": 0.5474302768707275, "reward_std": 0.08978062868118286, "rewards/VisualizationJSONCombinedORM/mean": 0.5474302768707275, "rewards/VisualizationJSONCombinedORM/std": 0.09321806579828262, "step": 3338, "train_speed(iter/s)": 0.094059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 304.625, "completions/min_length": 238.0, "epoch": 2.761786600496278, "grad_norm": 0.17745919525623322, "kl": 0.0567626953125, "learning_rate": 1.908717841359048e-07, "loss": 0.0005678385496139526, "memory(GiB)": 38.13, "reward": 0.5128094553947449, "reward_std": 0.05787855014204979, "rewards/VisualizationJSONCombinedORM/mean": 0.5128094553947449, "rewards/VisualizationJSONCombinedORM/std": 0.2439689338207245, "step": 3339, "train_speed(iter/s)": 0.094015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 293.6875, "completions/min_length": 236.0, "epoch": 2.7626137303556657, "grad_norm": 0.22508078813552856, "kl": 0.0693359375, "learning_rate": 1.8955701187187536e-07, "loss": 0.0006928928196430206, "memory(GiB)": 38.13, "reward": 0.5432559251785278, "reward_std": 0.09266316145658493, "rewards/VisualizationJSONCombinedORM/mean": 0.5432559251785278, "rewards/VisualizationJSONCombinedORM/std": 0.10041419416666031, "step": 3340, "train_speed(iter/s)": 0.093965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 316.6875, "completions/min_length": 238.0, "epoch": 2.763440860215054, "grad_norm": 0.21720431745052338, "kl": 0.1405029296875, "learning_rate": 1.88246696011109e-07, "loss": 0.0014046989381313324, "memory(GiB)": 38.13, "reward": 0.5007884502410889, "reward_std": 0.07064292579889297, "rewards/VisualizationJSONCombinedORM/mean": 0.5007884502410889, "rewards/VisualizationJSONCombinedORM/std": 0.207550048828125, "step": 3341, "train_speed(iter/s)": 0.09392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 300.9375, "completions/min_length": 220.0, "epoch": 2.764267990074442, "grad_norm": 0.23619258403778076, "kl": 0.0623779296875, "learning_rate": 1.8694083776748472e-07, "loss": 0.0006235316395759583, "memory(GiB)": 38.13, "reward": 0.5513908267021179, "reward_std": 0.0969364196062088, "rewards/VisualizationJSONCombinedORM/mean": 0.5513908267021179, "rewards/VisualizationJSONCombinedORM/std": 0.10950887203216553, "step": 3342, "train_speed(iter/s)": 0.093883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 268.3125, "completions/min_length": 212.0, "epoch": 2.7650951199338296, "grad_norm": 0.1625993400812149, "kl": 0.0284423828125, "learning_rate": 1.8563943835075315e-07, "loss": 0.0002837255597114563, "memory(GiB)": 38.13, "reward": 0.6656655073165894, "reward_std": 0.057604070752859116, "rewards/VisualizationJSONCombinedORM/mean": 0.6656655073165894, "rewards/VisualizationJSONCombinedORM/std": 0.17581523954868317, "step": 3343, "train_speed(iter/s)": 0.093848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 300.0625, "completions/min_length": 242.0, "epoch": 2.7659222497932174, "grad_norm": 0.22759242355823517, "kl": 0.026580810546875, "learning_rate": 1.8434249896653157e-07, "loss": 0.0002659820020198822, "memory(GiB)": 38.13, "reward": 0.5034358501434326, "reward_std": 0.038923099637031555, "rewards/VisualizationJSONCombinedORM/mean": 0.5034358501434326, "rewards/VisualizationJSONCombinedORM/std": 0.04176883026957512, "step": 3344, "train_speed(iter/s)": 0.093806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 263.5, "completions/min_length": 198.0, "epoch": 2.7667493796526053, "grad_norm": 0.16258424520492554, "kl": 0.04278564453125, "learning_rate": 1.8305002081630885e-07, "loss": 0.000427834689617157, "memory(GiB)": 38.13, "reward": 0.5784128308296204, "reward_std": 0.06285911053419113, "rewards/VisualizationJSONCombinedORM/mean": 0.5784128308296204, "rewards/VisualizationJSONCombinedORM/std": 0.19261552393436432, "step": 3345, "train_speed(iter/s)": 0.093773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 302.1875, "completions/min_length": 229.0, "epoch": 2.7675765095119935, "grad_norm": 0.25406113266944885, "kl": 0.079833984375, "learning_rate": 1.817620050974367e-07, "loss": 0.0007996037602424622, "memory(GiB)": 38.13, "reward": 0.46140584349632263, "reward_std": 0.060517020523548126, "rewards/VisualizationJSONCombinedORM/mean": 0.46140584349632263, "rewards/VisualizationJSONCombinedORM/std": 0.17912495136260986, "step": 3346, "train_speed(iter/s)": 0.093737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 312.125, "completions/min_length": 241.0, "epoch": 2.7684036393713813, "grad_norm": 0.17068609595298767, "kl": 0.0467529296875, "learning_rate": 1.8047845300313726e-07, "loss": 0.00046900659799575806, "memory(GiB)": 38.13, "reward": 0.260887086391449, "reward_std": 0.010219132527709007, "rewards/VisualizationJSONCombinedORM/mean": 0.260887086391449, "rewards/VisualizationJSONCombinedORM/std": 0.12959544360637665, "step": 3347, "train_speed(iter/s)": 0.093694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 283.375, "completions/min_length": 229.0, "epoch": 2.769230769230769, "grad_norm": 0.19031602144241333, "kl": 0.0728759765625, "learning_rate": 1.7919936572249442e-07, "loss": 0.0007295012474060059, "memory(GiB)": 38.13, "reward": 0.45905736088752747, "reward_std": 0.05327851325273514, "rewards/VisualizationJSONCombinedORM/mean": 0.45905736088752747, "rewards/VisualizationJSONCombinedORM/std": 0.08189789950847626, "step": 3348, "train_speed(iter/s)": 0.093653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 303.6875, "completions/min_length": 249.0, "epoch": 2.7700578990901574, "grad_norm": 0.17669808864593506, "kl": 0.0430908203125, "learning_rate": 1.7792474444045859e-07, "loss": 0.0004303678870201111, "memory(GiB)": 38.13, "reward": 0.44452014565467834, "reward_std": 0.03726988285779953, "rewards/VisualizationJSONCombinedORM/mean": 0.44452014565467834, "rewards/VisualizationJSONCombinedORM/std": 0.11385148018598557, "step": 3349, "train_speed(iter/s)": 0.093626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 274.6875, "completions/min_length": 217.0, "epoch": 2.770885028949545, "grad_norm": 0.2211630493402481, "kl": 0.0704345703125, "learning_rate": 1.7665459033783915e-07, "loss": 0.0007042288780212402, "memory(GiB)": 38.13, "reward": 0.6037403345108032, "reward_std": 0.09751714766025543, "rewards/VisualizationJSONCombinedORM/mean": 0.6037403345108032, "rewards/VisualizationJSONCombinedORM/std": 0.09688272327184677, "step": 3350, "train_speed(iter/s)": 0.093591 }, { "epoch": 2.770885028949545, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 372.1666666666667, "eval_completions/mean_length": 300.0885416666667, "eval_completions/min_length": 246.04166666666666, "eval_kl": 0.073822021484375, "eval_loss": 0.0007378607988357544, "eval_reward": 0.4507453056673209, "eval_reward_std": 0.06203322500611345, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4507453056673209, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06203322888662418, "eval_runtime": 315.8107, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 283.625, "completions/min_length": 229.0, "epoch": 2.771712158808933, "grad_norm": 0.1555243730545044, "kl": 0.03045654296875, "learning_rate": 1.7538890459131098e-07, "loss": 0.0003045797348022461, "memory(GiB)": 38.13, "reward": 0.7664694786071777, "reward_std": 0.07007370889186859, "rewards/VisualizationJSONCombinedORM/mean": 0.7664694786071777, "rewards/VisualizationJSONCombinedORM/std": 0.07860080897808075, "step": 3351, "train_speed(iter/s)": 0.092737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 292.9375, "completions/min_length": 241.0, "epoch": 2.772539288668321, "grad_norm": 0.2022896260023117, "kl": 0.06707763671875, "learning_rate": 1.7412768837340666e-07, "loss": 0.0006699115037918091, "memory(GiB)": 38.13, "reward": 0.6116234064102173, "reward_std": 0.0744769498705864, "rewards/VisualizationJSONCombinedORM/mean": 0.6116234064102173, "rewards/VisualizationJSONCombinedORM/std": 0.09337242692708969, "step": 3352, "train_speed(iter/s)": 0.092702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 285.4375, "completions/min_length": 235.0, "epoch": 2.7733664185277087, "grad_norm": 0.21096834540367126, "kl": 0.05810546875, "learning_rate": 1.7287094285251882e-07, "loss": 0.000581890344619751, "memory(GiB)": 38.13, "reward": 0.5982538461685181, "reward_std": 0.04572783410549164, "rewards/VisualizationJSONCombinedORM/mean": 0.5982538461685181, "rewards/VisualizationJSONCombinedORM/std": 0.19206438958644867, "step": 3353, "train_speed(iter/s)": 0.092678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 311.75, "completions/min_length": 245.0, "epoch": 2.774193548387097, "grad_norm": 0.1612786054611206, "kl": 0.0540771484375, "learning_rate": 1.7161866919290004e-07, "loss": 0.0005414485931396484, "memory(GiB)": 38.13, "reward": 0.3923913240432739, "reward_std": 0.022567879408597946, "rewards/VisualizationJSONCombinedORM/mean": 0.3923913240432739, "rewards/VisualizationJSONCombinedORM/std": 0.1917336881160736, "step": 3354, "train_speed(iter/s)": 0.092637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 320.5, "completions/min_length": 243.0, "epoch": 2.775020678246485, "grad_norm": 0.18587277829647064, "kl": 0.07940673828125, "learning_rate": 1.7037086855465902e-07, "loss": 0.0007942542433738708, "memory(GiB)": 38.13, "reward": 0.3762231171131134, "reward_std": 0.055132389068603516, "rewards/VisualizationJSONCombinedORM/mean": 0.3762231171131134, "rewards/VisualizationJSONCombinedORM/std": 0.11875323951244354, "step": 3355, "train_speed(iter/s)": 0.092599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 276.8125, "completions/min_length": 231.0, "epoch": 2.7758478081058726, "grad_norm": 0.21411815285682678, "kl": 0.085205078125, "learning_rate": 1.6912754209375936e-07, "loss": 0.0008534509688615799, "memory(GiB)": 38.13, "reward": 0.65291428565979, "reward_std": 0.11337126791477203, "rewards/VisualizationJSONCombinedORM/mean": 0.65291428565979, "rewards/VisualizationJSONCombinedORM/std": 0.14504408836364746, "step": 3356, "train_speed(iter/s)": 0.092562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 286.75, "completions/min_length": 240.0, "epoch": 2.7766749379652604, "grad_norm": 0.19061748683452606, "kl": 0.0906982421875, "learning_rate": 1.6788869096202197e-07, "loss": 0.0009050220251083374, "memory(GiB)": 38.13, "reward": 0.21872398257255554, "reward_std": 0.02694675326347351, "rewards/VisualizationJSONCombinedORM/mean": 0.21872398257255554, "rewards/VisualizationJSONCombinedORM/std": 0.03231943026185036, "step": 3357, "train_speed(iter/s)": 0.09254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 307.5, "completions/min_length": 243.0, "epoch": 2.7775020678246483, "grad_norm": 0.19595476984977722, "kl": 0.03619384765625, "learning_rate": 1.6665431630711936e-07, "loss": 0.0003622397780418396, "memory(GiB)": 38.13, "reward": 0.7644699811935425, "reward_std": 0.09560225903987885, "rewards/VisualizationJSONCombinedORM/mean": 0.7644699811935425, "rewards/VisualizationJSONCombinedORM/std": 0.11093585938215256, "step": 3358, "train_speed(iter/s)": 0.092502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 320.1875, "completions/min_length": 260.0, "epoch": 2.7783291976840365, "grad_norm": 0.189691960811615, "kl": 0.059326171875, "learning_rate": 1.6542441927258068e-07, "loss": 0.0005934387445449829, "memory(GiB)": 38.13, "reward": 0.5778337121009827, "reward_std": 0.08541786670684814, "rewards/VisualizationJSONCombinedORM/mean": 0.5778337121009827, "rewards/VisualizationJSONCombinedORM/std": 0.12412828207015991, "step": 3359, "train_speed(iter/s)": 0.092471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 259.5625, "completions/min_length": 224.0, "epoch": 2.7791563275434243, "grad_norm": 0.20125344395637512, "kl": 0.034881591796875, "learning_rate": 1.641990009977834e-07, "loss": 0.00034796446561813354, "memory(GiB)": 38.13, "reward": 0.5832645297050476, "reward_std": 0.0716206282377243, "rewards/VisualizationJSONCombinedORM/mean": 0.5832645297050476, "rewards/VisualizationJSONCombinedORM/std": 0.15382219851016998, "step": 3360, "train_speed(iter/s)": 0.092448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 275.75, "completions/min_length": 226.0, "epoch": 2.779983457402812, "grad_norm": 0.16591046750545502, "kl": 0.10260009765625, "learning_rate": 1.629780626179578e-07, "loss": 0.0010267198085784912, "memory(GiB)": 38.13, "reward": 0.6054632663726807, "reward_std": 0.07259298861026764, "rewards/VisualizationJSONCombinedORM/mean": 0.6054632663726807, "rewards/VisualizationJSONCombinedORM/std": 0.16289855539798737, "step": 3361, "train_speed(iter/s)": 0.09241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 284.375, "completions/min_length": 222.0, "epoch": 2.7808105872622004, "grad_norm": 0.19953611493110657, "kl": 0.1539306640625, "learning_rate": 1.6176160526418294e-07, "loss": 0.0015413537621498108, "memory(GiB)": 38.13, "reward": 0.5465992093086243, "reward_std": 0.06294435262680054, "rewards/VisualizationJSONCombinedORM/mean": 0.5465992093086243, "rewards/VisualizationJSONCombinedORM/std": 0.06484349817037582, "step": 3362, "train_speed(iter/s)": 0.092381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 305.0, "completions/min_length": 252.0, "epoch": 2.7816377171215882, "grad_norm": 0.17226603627204895, "kl": 0.190673828125, "learning_rate": 1.6054963006338742e-07, "loss": 0.001905187964439392, "memory(GiB)": 38.13, "reward": 0.42114952206611633, "reward_std": 0.07543936371803284, "rewards/VisualizationJSONCombinedORM/mean": 0.42114952206611633, "rewards/VisualizationJSONCombinedORM/std": 0.1928788274526596, "step": 3363, "train_speed(iter/s)": 0.092341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 312.0625, "completions/min_length": 248.0, "epoch": 2.782464846980976, "grad_norm": 0.16973119974136353, "kl": 0.0567626953125, "learning_rate": 1.5934213813834697e-07, "loss": 0.0005668178200721741, "memory(GiB)": 38.13, "reward": 0.5272451639175415, "reward_std": 0.058666326105594635, "rewards/VisualizationJSONCombinedORM/mean": 0.5272451639175415, "rewards/VisualizationJSONCombinedORM/std": 0.1419849395751953, "step": 3364, "train_speed(iter/s)": 0.092303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 313.75, "completions/min_length": 262.0, "epoch": 2.783291976840364, "grad_norm": 0.2062017321586609, "kl": 0.08343505859375, "learning_rate": 1.5813913060768571e-07, "loss": 0.0008352175354957581, "memory(GiB)": 38.13, "reward": 0.4454966187477112, "reward_std": 0.05136759206652641, "rewards/VisualizationJSONCombinedORM/mean": 0.4454966187477112, "rewards/VisualizationJSONCombinedORM/std": 0.19757875800132751, "step": 3365, "train_speed(iter/s)": 0.092262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 287.9375, "completions/min_length": 217.0, "epoch": 2.7841191066997517, "grad_norm": 0.26441532373428345, "kl": 0.04156494140625, "learning_rate": 1.5694060858587046e-07, "loss": 0.0004157647490501404, "memory(GiB)": 38.13, "reward": 0.7714199423789978, "reward_std": 0.10283468663692474, "rewards/VisualizationJSONCombinedORM/mean": 0.7714199423789978, "rewards/VisualizationJSONCombinedORM/std": 0.11096799373626709, "step": 3366, "train_speed(iter/s)": 0.092231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 305.625, "completions/min_length": 242.0, "epoch": 2.78494623655914, "grad_norm": 0.15330453217029572, "kl": 0.032867431640625, "learning_rate": 1.5574657318321528e-07, "loss": 0.000328943133354187, "memory(GiB)": 38.13, "reward": 0.43590566515922546, "reward_std": 0.03776422142982483, "rewards/VisualizationJSONCombinedORM/mean": 0.43590566515922546, "rewards/VisualizationJSONCombinedORM/std": 0.17343367636203766, "step": 3367, "train_speed(iter/s)": 0.092184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 312.5625, "completions/min_length": 258.0, "epoch": 2.785773366418528, "grad_norm": 0.19467590749263763, "kl": 0.06756591796875, "learning_rate": 1.5455702550587538e-07, "loss": 0.0006767287850379944, "memory(GiB)": 38.13, "reward": 0.6307797431945801, "reward_std": 0.06433790922164917, "rewards/VisualizationJSONCombinedORM/mean": 0.6307797431945801, "rewards/VisualizationJSONCombinedORM/std": 0.12128772586584091, "step": 3368, "train_speed(iter/s)": 0.092145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 316.0, "completions/min_length": 262.0, "epoch": 2.7866004962779156, "grad_norm": 0.13531160354614258, "kl": 0.042236328125, "learning_rate": 1.533719666558514e-07, "loss": 0.0004223657597322017, "memory(GiB)": 38.13, "reward": 0.6759325265884399, "reward_std": 0.035031113773584366, "rewards/VisualizationJSONCombinedORM/mean": 0.6759325265884399, "rewards/VisualizationJSONCombinedORM/std": 0.0646669790148735, "step": 3369, "train_speed(iter/s)": 0.092105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 298.8125, "completions/min_length": 222.0, "epoch": 2.7874276261373034, "grad_norm": 0.18601135909557343, "kl": 0.0374755859375, "learning_rate": 1.5219139773098356e-07, "loss": 0.00037483684718608856, "memory(GiB)": 38.13, "reward": 0.6549404859542847, "reward_std": 0.1006021723151207, "rewards/VisualizationJSONCombinedORM/mean": 0.6549404859542847, "rewards/VisualizationJSONCombinedORM/std": 0.11673483997583389, "step": 3370, "train_speed(iter/s)": 0.092076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 288.1875, "completions/min_length": 227.0, "epoch": 2.7882547559966913, "grad_norm": 0.2142079919576645, "kl": 0.0855712890625, "learning_rate": 1.510153198249531e-07, "loss": 0.000856306403875351, "memory(GiB)": 38.13, "reward": 0.6314905881881714, "reward_std": 0.07288631051778793, "rewards/VisualizationJSONCombinedORM/mean": 0.6314905881881714, "rewards/VisualizationJSONCombinedORM/std": 0.08894506841897964, "step": 3371, "train_speed(iter/s)": 0.092026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 296.9375, "completions/min_length": 223.0, "epoch": 2.7890818858560795, "grad_norm": 0.1642036736011505, "kl": 0.082275390625, "learning_rate": 1.4984373402728014e-07, "loss": 0.0008217990398406982, "memory(GiB)": 38.13, "reward": 0.40410029888153076, "reward_std": 0.02588557079434395, "rewards/VisualizationJSONCombinedORM/mean": 0.40410029888153076, "rewards/VisualizationJSONCombinedORM/std": 0.25031182169914246, "step": 3372, "train_speed(iter/s)": 0.09199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 295.5, "completions/min_length": 255.0, "epoch": 2.7899090157154673, "grad_norm": 0.2180163711309433, "kl": 0.06304931640625, "learning_rate": 1.4867664142332483e-07, "loss": 0.0006302669644355774, "memory(GiB)": 38.13, "reward": 0.5403580069541931, "reward_std": 0.06779901683330536, "rewards/VisualizationJSONCombinedORM/mean": 0.5403580069541931, "rewards/VisualizationJSONCombinedORM/std": 0.2767871618270874, "step": 3373, "train_speed(iter/s)": 0.091955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 308.3125, "completions/min_length": 222.0, "epoch": 2.790736145574855, "grad_norm": 0.17018355429172516, "kl": 0.093505859375, "learning_rate": 1.4751404309428396e-07, "loss": 0.0009347721934318542, "memory(GiB)": 38.13, "reward": 0.5125720500946045, "reward_std": 0.06736836582422256, "rewards/VisualizationJSONCombinedORM/mean": 0.5125720500946045, "rewards/VisualizationJSONCombinedORM/std": 0.1960783302783966, "step": 3374, "train_speed(iter/s)": 0.09191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 302.0625, "completions/min_length": 245.0, "epoch": 2.7915632754342434, "grad_norm": 0.18941736221313477, "kl": 0.07568359375, "learning_rate": 1.4635594011718935e-07, "loss": 0.0007583796977996826, "memory(GiB)": 38.13, "reward": 0.44387248158454895, "reward_std": 0.04740707576274872, "rewards/VisualizationJSONCombinedORM/mean": 0.44387248158454895, "rewards/VisualizationJSONCombinedORM/std": 0.1984667181968689, "step": 3375, "train_speed(iter/s)": 0.091869 }, { "epoch": 2.7915632754342434, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 352.7916666666667, "eval_completions/mean_length": 298.9583333333333, "eval_completions/min_length": 253.45833333333334, "eval_kl": 0.07090250651041667, "eval_loss": 0.0007113988394849002, "eval_reward": 0.45588954786459607, "eval_reward_std": 0.06327430638096605, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45588954786459607, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0632743076227295, "eval_runtime": 304.2561, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 3375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 291.5, "completions/min_length": 242.0, "epoch": 2.7923904052936313, "grad_norm": 0.17766818404197693, "kl": 0.15283203125, "learning_rate": 1.4520233356491165e-07, "loss": 0.001527983695268631, "memory(GiB)": 38.13, "reward": 0.46055901050567627, "reward_std": 0.06186173856258392, "rewards/VisualizationJSONCombinedORM/mean": 0.46055901050567627, "rewards/VisualizationJSONCombinedORM/std": 0.25325262546539307, "step": 3376, "train_speed(iter/s)": 0.09108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 305.0625, "completions/min_length": 214.0, "epoch": 2.793217535153019, "grad_norm": 0.20016221702098846, "kl": 0.04510498046875, "learning_rate": 1.4405322450615266e-07, "loss": 0.0004508979618549347, "memory(GiB)": 38.13, "reward": 0.5041622519493103, "reward_std": 0.03845662996172905, "rewards/VisualizationJSONCombinedORM/mean": 0.5041622519493103, "rewards/VisualizationJSONCombinedORM/std": 0.23879925906658173, "step": 3377, "train_speed(iter/s)": 0.091046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 305.6875, "completions/min_length": 254.0, "epoch": 2.794044665012407, "grad_norm": 0.17285551130771637, "kl": 0.0968017578125, "learning_rate": 1.4290861400545031e-07, "loss": 0.0009693615138530731, "memory(GiB)": 38.13, "reward": 0.38247546553611755, "reward_std": 0.04138578101992607, "rewards/VisualizationJSONCombinedORM/mean": 0.38247546553611755, "rewards/VisualizationJSONCombinedORM/std": 0.1635863482952118, "step": 3378, "train_speed(iter/s)": 0.091013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 323.75, "completions/min_length": 283.0, "epoch": 2.7948717948717947, "grad_norm": 0.16028591990470886, "kl": 0.09893798828125, "learning_rate": 1.4176850312317246e-07, "loss": 0.00099090114235878, "memory(GiB)": 38.13, "reward": 0.4450625479221344, "reward_std": 0.030636727809906006, "rewards/VisualizationJSONCombinedORM/mean": 0.4450625479221344, "rewards/VisualizationJSONCombinedORM/std": 0.23369024693965912, "step": 3379, "train_speed(iter/s)": 0.090974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 273.5, "completions/min_length": 232.0, "epoch": 2.795698924731183, "grad_norm": 0.1590522974729538, "kl": 0.041900634765625, "learning_rate": 1.4063289291552095e-07, "loss": 0.00041878968477249146, "memory(GiB)": 38.13, "reward": 0.4477824866771698, "reward_std": 0.03941821679472923, "rewards/VisualizationJSONCombinedORM/mean": 0.4477824866771698, "rewards/VisualizationJSONCombinedORM/std": 0.17565175890922546, "step": 3380, "train_speed(iter/s)": 0.090956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 319.125, "completions/min_length": 265.0, "epoch": 2.796526054590571, "grad_norm": 0.18035580217838287, "kl": 0.03167724609375, "learning_rate": 1.39501784434527e-07, "loss": 0.0003166906535625458, "memory(GiB)": 38.13, "reward": 0.37742817401885986, "reward_std": 0.040736690163612366, "rewards/VisualizationJSONCombinedORM/mean": 0.37742817401885986, "rewards/VisualizationJSONCombinedORM/std": 0.04440953582525253, "step": 3381, "train_speed(iter/s)": 0.090918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 276.75, "completions/min_length": 221.0, "epoch": 2.7973531844499586, "grad_norm": 0.19101189076900482, "kl": 0.05035400390625, "learning_rate": 1.3837517872805185e-07, "loss": 0.0005027912557125092, "memory(GiB)": 38.13, "reward": 0.36378899216651917, "reward_std": 0.042502738535404205, "rewards/VisualizationJSONCombinedORM/mean": 0.36378899216651917, "rewards/VisualizationJSONCombinedORM/std": 0.06140514463186264, "step": 3382, "train_speed(iter/s)": 0.090883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 303.25, "completions/min_length": 235.0, "epoch": 2.7981803143093464, "grad_norm": 0.20654766261577606, "kl": 0.05059814453125, "learning_rate": 1.372530768397845e-07, "loss": 0.0005070716142654419, "memory(GiB)": 38.13, "reward": 0.6192724704742432, "reward_std": 0.06701690703630447, "rewards/VisualizationJSONCombinedORM/mean": 0.6192724704742432, "rewards/VisualizationJSONCombinedORM/std": 0.09063572436571121, "step": 3383, "train_speed(iter/s)": 0.090852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 333.375, "completions/min_length": 264.0, "epoch": 2.7990074441687343, "grad_norm": 0.16795676946640015, "kl": 0.07684326171875, "learning_rate": 1.361354798092429e-07, "loss": 0.0007687155157327652, "memory(GiB)": 38.13, "reward": 0.6355079412460327, "reward_std": 0.0539199635386467, "rewards/VisualizationJSONCombinedORM/mean": 0.6355079412460327, "rewards/VisualizationJSONCombinedORM/std": 0.06265196204185486, "step": 3384, "train_speed(iter/s)": 0.090815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 269.125, "completions/min_length": 190.0, "epoch": 2.7998345740281225, "grad_norm": 0.17461544275283813, "kl": 0.0792236328125, "learning_rate": 1.3502238867176998e-07, "loss": 0.0007912144064903259, "memory(GiB)": 38.13, "reward": 0.7562530040740967, "reward_std": 0.03445979952812195, "rewards/VisualizationJSONCombinedORM/mean": 0.7562530040740967, "rewards/VisualizationJSONCombinedORM/std": 0.13555099070072174, "step": 3385, "train_speed(iter/s)": 0.090779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 326.8125, "completions/min_length": 277.0, "epoch": 2.8006617038875103, "grad_norm": 0.5673636198043823, "kl": 0.0419921875, "learning_rate": 1.339138044585364e-07, "loss": 0.0004198448732495308, "memory(GiB)": 38.13, "reward": 0.6208759546279907, "reward_std": 0.047426965087652206, "rewards/VisualizationJSONCombinedORM/mean": 0.6208759546279907, "rewards/VisualizationJSONCombinedORM/std": 0.23437495529651642, "step": 3386, "train_speed(iter/s)": 0.090743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 289.25, "completions/min_length": 231.0, "epoch": 2.801488833746898, "grad_norm": 0.20749425888061523, "kl": 0.0550537109375, "learning_rate": 1.328097281965357e-07, "loss": 0.0005512386560440063, "memory(GiB)": 38.13, "reward": 0.45009562373161316, "reward_std": 0.0551665797829628, "rewards/VisualizationJSONCombinedORM/mean": 0.45009562373161316, "rewards/VisualizationJSONCombinedORM/std": 0.06312645971775055, "step": 3387, "train_speed(iter/s)": 0.09071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 308.1875, "completions/min_length": 223.0, "epoch": 2.8023159636062864, "grad_norm": 0.17601162195205688, "kl": 0.0760498046875, "learning_rate": 1.3171016090858747e-07, "loss": 0.0007599089294672012, "memory(GiB)": 38.13, "reward": 0.45931917428970337, "reward_std": 0.05145617574453354, "rewards/VisualizationJSONCombinedORM/mean": 0.45931917428970337, "rewards/VisualizationJSONCombinedORM/std": 0.10482292622327805, "step": 3388, "train_speed(iter/s)": 0.090678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 270.6875, "completions/min_length": 222.0, "epoch": 2.8031430934656743, "grad_norm": 0.1972939819097519, "kl": 0.032958984375, "learning_rate": 1.3061510361333186e-07, "loss": 0.0003290921449661255, "memory(GiB)": 38.13, "reward": 0.49389007687568665, "reward_std": 0.04973848536610603, "rewards/VisualizationJSONCombinedORM/mean": 0.49389007687568665, "rewards/VisualizationJSONCombinedORM/std": 0.2858728766441345, "step": 3389, "train_speed(iter/s)": 0.090639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 297.0, "completions/min_length": 249.0, "epoch": 2.803970223325062, "grad_norm": 0.19743205606937408, "kl": 0.04559326171875, "learning_rate": 1.2952455732523238e-07, "loss": 0.00045611336827278137, "memory(GiB)": 38.13, "reward": 0.7022705674171448, "reward_std": 0.07843433320522308, "rewards/VisualizationJSONCombinedORM/mean": 0.7022705674171448, "rewards/VisualizationJSONCombinedORM/std": 0.10457064211368561, "step": 3390, "train_speed(iter/s)": 0.090602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 281.6875, "completions/min_length": 252.0, "epoch": 2.80479735318445, "grad_norm": 0.20269812643527985, "kl": 0.04522705078125, "learning_rate": 1.284385230545726e-07, "loss": 0.00045169517397880554, "memory(GiB)": 38.13, "reward": 0.6273908615112305, "reward_std": 0.0546686016023159, "rewards/VisualizationJSONCombinedORM/mean": 0.6273908615112305, "rewards/VisualizationJSONCombinedORM/std": 0.07295002043247223, "step": 3391, "train_speed(iter/s)": 0.090567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 313.0, "completions/min_length": 242.0, "epoch": 2.8056244830438377, "grad_norm": 0.23335209488868713, "kl": 0.07000732421875, "learning_rate": 1.2735700180745769e-07, "loss": 0.0006996553856879473, "memory(GiB)": 38.13, "reward": 0.6799437403678894, "reward_std": 0.06618466973304749, "rewards/VisualizationJSONCombinedORM/mean": 0.6799437403678894, "rewards/VisualizationJSONCombinedORM/std": 0.10577289015054703, "step": 3392, "train_speed(iter/s)": 0.090528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 284.6875, "completions/min_length": 241.0, "epoch": 2.806451612903226, "grad_norm": 0.21514266729354858, "kl": 0.1502685546875, "learning_rate": 1.2627999458580952e-07, "loss": 0.0015040198341012, "memory(GiB)": 38.13, "reward": 0.40885117650032043, "reward_std": 0.06463784724473953, "rewards/VisualizationJSONCombinedORM/mean": 0.40885117650032043, "rewards/VisualizationJSONCombinedORM/std": 0.11950021982192993, "step": 3393, "train_speed(iter/s)": 0.090498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 270.625, "completions/min_length": 220.0, "epoch": 2.807278742762614, "grad_norm": 0.21343114972114563, "kl": 0.0401611328125, "learning_rate": 1.2520750238737113e-07, "loss": 0.00040101632475852966, "memory(GiB)": 38.13, "reward": 0.7029357552528381, "reward_std": 0.09721642732620239, "rewards/VisualizationJSONCombinedORM/mean": 0.7029357552528381, "rewards/VisualizationJSONCombinedORM/std": 0.0979028195142746, "step": 3394, "train_speed(iter/s)": 0.090466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 269.1875, "completions/min_length": 205.0, "epoch": 2.8081058726220016, "grad_norm": 0.2523532211780548, "kl": 0.0501708984375, "learning_rate": 1.241395262056999e-07, "loss": 0.0005016997456550598, "memory(GiB)": 38.13, "reward": 0.49927279353141785, "reward_std": 0.0621863417327404, "rewards/VisualizationJSONCombinedORM/mean": 0.49927279353141785, "rewards/VisualizationJSONCombinedORM/std": 0.18044832348823547, "step": 3395, "train_speed(iter/s)": 0.090432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 289.1875, "completions/min_length": 238.0, "epoch": 2.8089330024813894, "grad_norm": 0.19987793266773224, "kl": 0.0677490234375, "learning_rate": 1.2307606703017173e-07, "loss": 0.0006764084100723267, "memory(GiB)": 38.13, "reward": 0.7485054731369019, "reward_std": 0.07529832422733307, "rewards/VisualizationJSONCombinedORM/mean": 0.7485054731369019, "rewards/VisualizationJSONCombinedORM/std": 0.07472386956214905, "step": 3396, "train_speed(iter/s)": 0.090403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 323.375, "completions/min_length": 273.0, "epoch": 2.8097601323407773, "grad_norm": 0.17945456504821777, "kl": 0.0745849609375, "learning_rate": 1.220171258459768e-07, "loss": 0.0007457360625267029, "memory(GiB)": 38.13, "reward": 0.5998247861862183, "reward_std": 0.08852067589759827, "rewards/VisualizationJSONCombinedORM/mean": 0.5998247861862183, "rewards/VisualizationJSONCombinedORM/std": 0.1440821886062622, "step": 3397, "train_speed(iter/s)": 0.090362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 311.0625, "completions/min_length": 249.0, "epoch": 2.8105872622001655, "grad_norm": 0.22121523320674896, "kl": 0.0426025390625, "learning_rate": 1.20962703634121e-07, "loss": 0.0004263371229171753, "memory(GiB)": 38.13, "reward": 0.5661736130714417, "reward_std": 0.06421729922294617, "rewards/VisualizationJSONCombinedORM/mean": 0.5661736130714417, "rewards/VisualizationJSONCombinedORM/std": 0.08386953175067902, "step": 3398, "train_speed(iter/s)": 0.090319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 284.125, "completions/min_length": 218.0, "epoch": 2.8114143920595533, "grad_norm": 0.16584216058254242, "kl": 0.100341796875, "learning_rate": 1.199128013714218e-07, "loss": 0.0010047927498817444, "memory(GiB)": 38.13, "reward": 0.2987968921661377, "reward_std": 0.03261803835630417, "rewards/VisualizationJSONCombinedORM/mean": 0.2987968921661377, "rewards/VisualizationJSONCombinedORM/std": 0.04835950955748558, "step": 3399, "train_speed(iter/s)": 0.090268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 290.9375, "completions/min_length": 234.0, "epoch": 2.812241521918941, "grad_norm": 0.17412325739860535, "kl": 0.05255126953125, "learning_rate": 1.1886742003051177e-07, "loss": 0.0005262158811092377, "memory(GiB)": 38.13, "reward": 0.49829110503196716, "reward_std": 0.055248748511075974, "rewards/VisualizationJSONCombinedORM/mean": 0.49829110503196716, "rewards/VisualizationJSONCombinedORM/std": 0.12320509552955627, "step": 3400, "train_speed(iter/s)": 0.090234 }, { "epoch": 2.812241521918941, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 363.25, "eval_completions/mean_length": 301.4166666666667, "eval_completions/min_length": 250.45833333333334, "eval_kl": 0.09074910481770833, "eval_loss": 0.0008961123530752957, "eval_reward": 0.4542475286871195, "eval_reward_std": 0.056932021553317703, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4542475286871195, "eval_rewards/VisualizationJSONCombinedORM/std": 0.056932024036844574, "eval_runtime": 310.2321, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 294.9375, "completions/min_length": 229.0, "epoch": 2.8130686517783294, "grad_norm": 0.30469855666160583, "kl": 0.1065673828125, "learning_rate": 1.1782656057983233e-07, "loss": 0.0010652132332324982, "memory(GiB)": 38.13, "reward": 0.5643455386161804, "reward_std": 0.11276036500930786, "rewards/VisualizationJSONCombinedORM/mean": 0.5643455386161804, "rewards/VisualizationJSONCombinedORM/std": 0.1300400346517563, "step": 3401, "train_speed(iter/s)": 0.089458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 302.875, "completions/min_length": 242.0, "epoch": 2.8138957816377173, "grad_norm": 0.17806529998779297, "kl": 0.0731201171875, "learning_rate": 1.1679022398363937e-07, "loss": 0.0007314532995223999, "memory(GiB)": 38.13, "reward": 0.4759002923965454, "reward_std": 0.07269901782274246, "rewards/VisualizationJSONCombinedORM/mean": 0.4759002923965454, "rewards/VisualizationJSONCombinedORM/std": 0.1818377673625946, "step": 3402, "train_speed(iter/s)": 0.089409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 298.875, "completions/min_length": 238.0, "epoch": 2.814722911497105, "grad_norm": 0.1947663575410843, "kl": 0.0667724609375, "learning_rate": 1.157584112019966e-07, "loss": 0.0006671901792287827, "memory(GiB)": 38.13, "reward": 0.41830456256866455, "reward_std": 0.05743654817342758, "rewards/VisualizationJSONCombinedORM/mean": 0.41830456256866455, "rewards/VisualizationJSONCombinedORM/std": 0.10585544258356094, "step": 3403, "train_speed(iter/s)": 0.089385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 332.75, "completions/min_length": 251.0, "epoch": 2.815550041356493, "grad_norm": 0.19108985364437103, "kl": 0.196533203125, "learning_rate": 1.1473112319077717e-07, "loss": 0.001962520182132721, "memory(GiB)": 38.13, "reward": 0.5542884469032288, "reward_std": 0.06834287941455841, "rewards/VisualizationJSONCombinedORM/mean": 0.5542884469032288, "rewards/VisualizationJSONCombinedORM/std": 0.11468561738729477, "step": 3404, "train_speed(iter/s)": 0.089348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 312.625, "completions/min_length": 255.0, "epoch": 2.8163771712158807, "grad_norm": 0.1939316838979721, "kl": 0.0869140625, "learning_rate": 1.1370836090166204e-07, "loss": 0.0008712746202945709, "memory(GiB)": 38.13, "reward": 0.39232373237609863, "reward_std": 0.03586399555206299, "rewards/VisualizationJSONCombinedORM/mean": 0.39232373237609863, "rewards/VisualizationJSONCombinedORM/std": 0.07238684594631195, "step": 3405, "train_speed(iter/s)": 0.089313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 304.25, "completions/min_length": 255.0, "epoch": 2.817204301075269, "grad_norm": 0.23242345452308655, "kl": 0.1732177734375, "learning_rate": 1.1269012528214108e-07, "loss": 0.0017340891063213348, "memory(GiB)": 38.13, "reward": 0.5245641469955444, "reward_std": 0.06947211176156998, "rewards/VisualizationJSONCombinedORM/mean": 0.5245641469955444, "rewards/VisualizationJSONCombinedORM/std": 0.1357726901769638, "step": 3406, "train_speed(iter/s)": 0.089288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 324.5, "completions/min_length": 277.0, "epoch": 2.818031430934657, "grad_norm": 0.21524269878864288, "kl": 0.086181640625, "learning_rate": 1.1167641727550804e-07, "loss": 0.0008636582642793655, "memory(GiB)": 38.13, "reward": 0.6366310119628906, "reward_std": 0.07893364131450653, "rewards/VisualizationJSONCombinedORM/mean": 0.6366310119628906, "rewards/VisualizationJSONCombinedORM/std": 0.10343889892101288, "step": 3407, "train_speed(iter/s)": 0.089256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 319.6875, "completions/min_length": 245.0, "epoch": 2.8188585607940446, "grad_norm": 0.1746826469898224, "kl": 0.100830078125, "learning_rate": 1.1066723782086619e-07, "loss": 0.0010093152523040771, "memory(GiB)": 38.13, "reward": 0.45643770694732666, "reward_std": 0.06936828047037125, "rewards/VisualizationJSONCombinedORM/mean": 0.45643770694732666, "rewards/VisualizationJSONCombinedORM/std": 0.22948405146598816, "step": 3408, "train_speed(iter/s)": 0.089217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 278.8125, "completions/min_length": 220.0, "epoch": 2.8196856906534324, "grad_norm": 0.1435011774301529, "kl": 0.1649169921875, "learning_rate": 1.0966258785311989e-07, "loss": 0.001653328537940979, "memory(GiB)": 38.13, "reward": 0.5445834398269653, "reward_std": 0.08517417311668396, "rewards/VisualizationJSONCombinedORM/mean": 0.5445834398269653, "rewards/VisualizationJSONCombinedORM/std": 0.244135782122612, "step": 3409, "train_speed(iter/s)": 0.089184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 275.6875, "completions/min_length": 228.0, "epoch": 2.8205128205128203, "grad_norm": 0.19953423738479614, "kl": 0.0648193359375, "learning_rate": 1.0866246830297855e-07, "loss": 0.0006496310234069824, "memory(GiB)": 38.13, "reward": 0.41776537895202637, "reward_std": 0.04225388914346695, "rewards/VisualizationJSONCombinedORM/mean": 0.41776537895202637, "rewards/VisualizationJSONCombinedORM/std": 0.26665595173835754, "step": 3410, "train_speed(iter/s)": 0.089146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 283.8125, "completions/min_length": 255.0, "epoch": 2.8213399503722085, "grad_norm": 0.19223885238170624, "kl": 0.0755615234375, "learning_rate": 1.0766688009695548e-07, "loss": 0.0007549189031124115, "memory(GiB)": 38.13, "reward": 0.7487648725509644, "reward_std": 0.09257996082305908, "rewards/VisualizationJSONCombinedORM/mean": 0.7487648725509644, "rewards/VisualizationJSONCombinedORM/std": 0.10386276245117188, "step": 3411, "train_speed(iter/s)": 0.089113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 293.6875, "completions/min_length": 242.0, "epoch": 2.8221670802315963, "grad_norm": 0.19822746515274048, "kl": 0.051513671875, "learning_rate": 1.0667582415736455e-07, "loss": 0.0005154963582754135, "memory(GiB)": 38.13, "reward": 0.36007118225097656, "reward_std": 0.04519754648208618, "rewards/VisualizationJSONCombinedORM/mean": 0.36007118225097656, "rewards/VisualizationJSONCombinedORM/std": 0.04592778533697128, "step": 3412, "train_speed(iter/s)": 0.089081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 285.125, "completions/min_length": 221.0, "epoch": 2.822994210090984, "grad_norm": 0.17113086581230164, "kl": 0.04595947265625, "learning_rate": 1.0568930140232192e-07, "loss": 0.00045883841812610626, "memory(GiB)": 38.13, "reward": 0.6908719539642334, "reward_std": 0.06841779500246048, "rewards/VisualizationJSONCombinedORM/mean": 0.6908719539642334, "rewards/VisualizationJSONCombinedORM/std": 0.10488442331552505, "step": 3413, "train_speed(iter/s)": 0.089047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 299.125, "completions/min_length": 238.0, "epoch": 2.8238213399503724, "grad_norm": 0.18847337365150452, "kl": 0.0416259765625, "learning_rate": 1.0470731274574542e-07, "loss": 0.0004162304103374481, "memory(GiB)": 38.13, "reward": 0.6696693897247314, "reward_std": 0.08567705005407333, "rewards/VisualizationJSONCombinedORM/mean": 0.6696693897247314, "rewards/VisualizationJSONCombinedORM/std": 0.08285222947597504, "step": 3414, "train_speed(iter/s)": 0.089011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 311.3125, "completions/min_length": 239.0, "epoch": 2.8246484698097603, "grad_norm": 0.17727090418338776, "kl": 0.0877685546875, "learning_rate": 1.0372985909734956e-07, "loss": 0.0008781477808952332, "memory(GiB)": 38.13, "reward": 0.6370179653167725, "reward_std": 0.08706873655319214, "rewards/VisualizationJSONCombinedORM/mean": 0.6370179653167725, "rewards/VisualizationJSONCombinedORM/std": 0.10010765492916107, "step": 3415, "train_speed(iter/s)": 0.088966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 314.5625, "completions/min_length": 253.0, "epoch": 2.825475599669148, "grad_norm": 0.20074397325515747, "kl": 0.1112060546875, "learning_rate": 1.0275694136265057e-07, "loss": 0.0011127851903438568, "memory(GiB)": 38.13, "reward": 0.5567591190338135, "reward_std": 0.06391732394695282, "rewards/VisualizationJSONCombinedORM/mean": 0.5567591190338135, "rewards/VisualizationJSONCombinedORM/std": 0.11083965003490448, "step": 3416, "train_speed(iter/s)": 0.088934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 288.0, "completions/min_length": 215.0, "epoch": 2.826302729528536, "grad_norm": 0.18436329066753387, "kl": 0.0615234375, "learning_rate": 1.0178856044295971e-07, "loss": 0.0006164610385894775, "memory(GiB)": 38.13, "reward": 0.27327388525009155, "reward_std": 0.03366696089506149, "rewards/VisualizationJSONCombinedORM/mean": 0.27327388525009155, "rewards/VisualizationJSONCombinedORM/std": 0.04572339728474617, "step": 3417, "train_speed(iter/s)": 0.088906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 303.4375, "completions/min_length": 230.0, "epoch": 2.8271298593879237, "grad_norm": 0.17927436530590057, "kl": 0.12353515625, "learning_rate": 1.0082471723538767e-07, "loss": 0.0012337788939476013, "memory(GiB)": 38.13, "reward": 0.5217716693878174, "reward_std": 0.08248675614595413, "rewards/VisualizationJSONCombinedORM/mean": 0.5217716693878174, "rewards/VisualizationJSONCombinedORM/std": 0.16721710562705994, "step": 3418, "train_speed(iter/s)": 0.088884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 331.0, "completions/min_length": 275.0, "epoch": 2.827956989247312, "grad_norm": 0.17020830512046814, "kl": 0.0587158203125, "learning_rate": 9.986541263284077e-08, "loss": 0.0005869872402399778, "memory(GiB)": 38.13, "reward": 0.2545529007911682, "reward_std": 0.028448328375816345, "rewards/VisualizationJSONCombinedORM/mean": 0.2545529007911682, "rewards/VisualizationJSONCombinedORM/std": 0.05813896283507347, "step": 3419, "train_speed(iter/s)": 0.088852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 260.875, "completions/min_length": 219.0, "epoch": 2.8287841191067, "grad_norm": 0.16993474960327148, "kl": 0.0380859375, "learning_rate": 9.891064752402091e-08, "loss": 0.0003809891641139984, "memory(GiB)": 38.13, "reward": 0.6205493211746216, "reward_std": 0.04355774074792862, "rewards/VisualizationJSONCombinedORM/mean": 0.6205493211746216, "rewards/VisualizationJSONCombinedORM/std": 0.18288670480251312, "step": 3420, "train_speed(iter/s)": 0.088829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 309.625, "completions/min_length": 242.0, "epoch": 2.8296112489660876, "grad_norm": 0.18694880604743958, "kl": 0.0504150390625, "learning_rate": 9.796042279342277e-08, "loss": 0.0005025677382946014, "memory(GiB)": 38.13, "reward": 0.5193651914596558, "reward_std": 0.05587241053581238, "rewards/VisualizationJSONCombinedORM/mean": 0.5193651914596558, "rewards/VisualizationJSONCombinedORM/std": 0.06673648953437805, "step": 3421, "train_speed(iter/s)": 0.088792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 304.0625, "completions/min_length": 262.0, "epoch": 2.8304383788254754, "grad_norm": 0.1924775242805481, "kl": 0.0828857421875, "learning_rate": 9.701473932133776e-08, "loss": 0.0008274167776107788, "memory(GiB)": 38.13, "reward": 0.5303303003311157, "reward_std": 0.05783645436167717, "rewards/VisualizationJSONCombinedORM/mean": 0.5303303003311157, "rewards/VisualizationJSONCombinedORM/std": 0.08181243389844894, "step": 3422, "train_speed(iter/s)": 0.088762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 332.125, "completions/min_length": 274.0, "epoch": 2.8312655086848633, "grad_norm": 0.1966780424118042, "kl": 0.1185302734375, "learning_rate": 9.607359798384785e-08, "loss": 0.0011870861053466797, "memory(GiB)": 38.13, "reward": 0.3837568163871765, "reward_std": 0.047785449773073196, "rewards/VisualizationJSONCombinedORM/mean": 0.3837568163871765, "rewards/VisualizationJSONCombinedORM/std": 0.11622559279203415, "step": 3423, "train_speed(iter/s)": 0.088716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 319.9375, "completions/min_length": 230.0, "epoch": 2.8320926385442515, "grad_norm": 0.17826467752456665, "kl": 0.05419921875, "learning_rate": 9.51369996528284e-08, "loss": 0.0005421116948127747, "memory(GiB)": 38.13, "reward": 0.5131730437278748, "reward_std": 0.06289704889059067, "rewards/VisualizationJSONCombinedORM/mean": 0.5131730437278748, "rewards/VisualizationJSONCombinedORM/std": 0.07754658162593842, "step": 3424, "train_speed(iter/s)": 0.088681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 277.5, "completions/min_length": 224.0, "epoch": 2.8329197684036393, "grad_norm": 0.2004341036081314, "kl": 0.0970458984375, "learning_rate": 9.420494519594648e-08, "loss": 0.0009675808250904083, "memory(GiB)": 38.13, "reward": 0.5051788091659546, "reward_std": 0.052077945321798325, "rewards/VisualizationJSONCombinedORM/mean": 0.5051788091659546, "rewards/VisualizationJSONCombinedORM/std": 0.1618908792734146, "step": 3425, "train_speed(iter/s)": 0.088642 }, { "epoch": 2.8329197684036393, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 355.8333333333333, "eval_completions/mean_length": 302.28125, "eval_completions/min_length": 252.79166666666666, "eval_kl": 0.081817626953125, "eval_loss": 0.0008119518752209842, "eval_reward": 0.4542897151162227, "eval_reward_std": 0.05818422751811644, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4542897151162227, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05818422899271051, "eval_runtime": 306.0774, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 3425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 301.5, "completions/min_length": 241.0, "epoch": 2.833746898263027, "grad_norm": 0.15589278936386108, "kl": 0.0264892578125, "learning_rate": 9.327743547665858e-08, "loss": 0.00026459619402885437, "memory(GiB)": 38.13, "reward": 0.5758012533187866, "reward_std": 0.02295985445380211, "rewards/VisualizationJSONCombinedORM/mean": 0.5758012533187866, "rewards/VisualizationJSONCombinedORM/std": 0.17626230418682098, "step": 3426, "train_speed(iter/s)": 0.087915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 293.125, "completions/min_length": 222.0, "epoch": 2.8345740281224154, "grad_norm": 0.1777793914079666, "kl": 0.06292724609375, "learning_rate": 9.235447135421127e-08, "loss": 0.0006291307508945465, "memory(GiB)": 38.13, "reward": 0.3818826973438263, "reward_std": 0.040104422718286514, "rewards/VisualizationJSONCombinedORM/mean": 0.3818826973438263, "rewards/VisualizationJSONCombinedORM/std": 0.09599094092845917, "step": 3427, "train_speed(iter/s)": 0.087889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 277.25, "completions/min_length": 203.0, "epoch": 2.8354011579818033, "grad_norm": 0.2707309424877167, "kl": 0.0618896484375, "learning_rate": 9.143605368364006e-08, "loss": 0.0006180219352245331, "memory(GiB)": 38.13, "reward": 0.39743226766586304, "reward_std": 0.12039659917354584, "rewards/VisualizationJSONCombinedORM/mean": 0.39743226766586304, "rewards/VisualizationJSONCombinedORM/std": 0.19350433349609375, "step": 3428, "train_speed(iter/s)": 0.087858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 301.8125, "completions/min_length": 230.0, "epoch": 2.836228287841191, "grad_norm": 0.17920628190040588, "kl": 0.03668212890625, "learning_rate": 9.052218331576878e-08, "loss": 0.00036733224987983704, "memory(GiB)": 38.13, "reward": 0.7523834705352783, "reward_std": 0.06565891206264496, "rewards/VisualizationJSONCombinedORM/mean": 0.7523834705352783, "rewards/VisualizationJSONCombinedORM/std": 0.08517498522996902, "step": 3429, "train_speed(iter/s)": 0.087818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 350.0625, "completions/min_length": 250.0, "epoch": 2.837055417700579, "grad_norm": 0.23669394850730896, "kl": 0.0762939453125, "learning_rate": 8.961286109720912e-08, "loss": 0.0007629822939634323, "memory(GiB)": 38.13, "reward": 0.6604364514350891, "reward_std": 0.09567341208457947, "rewards/VisualizationJSONCombinedORM/mean": 0.6604364514350891, "rewards/VisualizationJSONCombinedORM/std": 0.09893646836280823, "step": 3430, "train_speed(iter/s)": 0.087787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 282.5, "completions/min_length": 237.0, "epoch": 2.8378825475599667, "grad_norm": 0.28620803356170654, "kl": 0.1058349609375, "learning_rate": 8.87080878703589e-08, "loss": 0.0010578632354736328, "memory(GiB)": 38.13, "reward": 0.5855082869529724, "reward_std": 0.062081120908260345, "rewards/VisualizationJSONCombinedORM/mean": 0.5855082869529724, "rewards/VisualizationJSONCombinedORM/std": 0.10432934015989304, "step": 3431, "train_speed(iter/s)": 0.087751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 314.25, "completions/min_length": 236.0, "epoch": 2.838709677419355, "grad_norm": 0.18894343078136444, "kl": 0.037872314453125, "learning_rate": 8.780786447340095e-08, "loss": 0.0003792792558670044, "memory(GiB)": 38.13, "reward": 0.3473443388938904, "reward_std": 0.03183494880795479, "rewards/VisualizationJSONCombinedORM/mean": 0.3473443388938904, "rewards/VisualizationJSONCombinedORM/std": 0.11788076162338257, "step": 3432, "train_speed(iter/s)": 0.087718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 288.5, "completions/min_length": 222.0, "epoch": 2.839536807278743, "grad_norm": 0.21335695683956146, "kl": 0.0570068359375, "learning_rate": 8.691219174030485e-08, "loss": 0.0005716234445571899, "memory(GiB)": 38.13, "reward": 0.4131130874156952, "reward_std": 0.03758593648672104, "rewards/VisualizationJSONCombinedORM/mean": 0.4131130874156952, "rewards/VisualizationJSONCombinedORM/std": 0.13313299417495728, "step": 3433, "train_speed(iter/s)": 0.087686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 304.4375, "completions/min_length": 241.0, "epoch": 2.8403639371381306, "grad_norm": 0.18547479808330536, "kl": 0.0638427734375, "learning_rate": 8.602107050082298e-08, "loss": 0.0006387345492839813, "memory(GiB)": 38.13, "reward": 0.36833494901657104, "reward_std": 0.04396633058786392, "rewards/VisualizationJSONCombinedORM/mean": 0.36833494901657104, "rewards/VisualizationJSONCombinedORM/std": 0.061124566942453384, "step": 3434, "train_speed(iter/s)": 0.087655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 310.0, "completions/min_length": 240.0, "epoch": 2.8411910669975184, "grad_norm": 0.2398199588060379, "kl": 0.0623779296875, "learning_rate": 8.513450158049109e-08, "loss": 0.0006246492266654968, "memory(GiB)": 38.13, "reward": 0.3773435354232788, "reward_std": 0.038350909948349, "rewards/VisualizationJSONCombinedORM/mean": 0.3773435354232788, "rewards/VisualizationJSONCombinedORM/std": 0.0782780572772026, "step": 3435, "train_speed(iter/s)": 0.087624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 331.8125, "completions/min_length": 263.0, "epoch": 2.8420181968569063, "grad_norm": 0.23664531111717224, "kl": 0.26800537109375, "learning_rate": 8.425248580062939e-08, "loss": 0.002673584967851639, "memory(GiB)": 38.13, "reward": 0.623347818851471, "reward_std": 0.07034161686897278, "rewards/VisualizationJSONCombinedORM/mean": 0.623347818851471, "rewards/VisualizationJSONCombinedORM/std": 0.07212850451469421, "step": 3436, "train_speed(iter/s)": 0.087582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 331.1875, "completions/min_length": 250.0, "epoch": 2.8428453267162945, "grad_norm": 0.214086651802063, "kl": 0.1217041015625, "learning_rate": 8.337502397833819e-08, "loss": 0.0012186765670776367, "memory(GiB)": 38.13, "reward": 0.6159915924072266, "reward_std": 0.07477735728025436, "rewards/VisualizationJSONCombinedORM/mean": 0.6159915924072266, "rewards/VisualizationJSONCombinedORM/std": 0.18867431581020355, "step": 3437, "train_speed(iter/s)": 0.087554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 283.125, "completions/min_length": 234.0, "epoch": 2.8436724565756824, "grad_norm": 0.2194991409778595, "kl": 0.0445556640625, "learning_rate": 8.250211692650001e-08, "loss": 0.00044561177492141724, "memory(GiB)": 38.13, "reward": 0.40264153480529785, "reward_std": 0.0633455440402031, "rewards/VisualizationJSONCombinedORM/mean": 0.40264153480529785, "rewards/VisualizationJSONCombinedORM/std": 0.11460793018341064, "step": 3438, "train_speed(iter/s)": 0.087533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 288.8125, "completions/min_length": 220.0, "epoch": 2.84449958643507, "grad_norm": 0.15607832372188568, "kl": 0.03167724609375, "learning_rate": 8.163376545377744e-08, "loss": 0.00031767040491104126, "memory(GiB)": 38.13, "reward": 0.7683535814285278, "reward_std": 0.04904098063707352, "rewards/VisualizationJSONCombinedORM/mean": 0.7683535814285278, "rewards/VisualizationJSONCombinedORM/std": 0.0577927827835083, "step": 3439, "train_speed(iter/s)": 0.087499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 296.375, "completions/min_length": 231.0, "epoch": 2.8453267162944584, "grad_norm": 0.19269245862960815, "kl": 0.093994140625, "learning_rate": 8.076997036461254e-08, "loss": 0.0009420439600944519, "memory(GiB)": 38.13, "reward": 0.3494403064250946, "reward_std": 0.033293016254901886, "rewards/VisualizationJSONCombinedORM/mean": 0.3494403064250946, "rewards/VisualizationJSONCombinedORM/std": 0.13443678617477417, "step": 3440, "train_speed(iter/s)": 0.087467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 310.5625, "completions/min_length": 258.0, "epoch": 2.8461538461538463, "grad_norm": 0.21255086362361908, "kl": 0.0877685546875, "learning_rate": 7.991073245922798e-08, "loss": 0.0008787252008914948, "memory(GiB)": 38.13, "reward": 0.5651458501815796, "reward_std": 0.06618661433458328, "rewards/VisualizationJSONCombinedORM/mean": 0.5651458501815796, "rewards/VisualizationJSONCombinedORM/std": 0.20682750642299652, "step": 3441, "train_speed(iter/s)": 0.087432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 292.4375, "completions/min_length": 219.0, "epoch": 2.846980976013234, "grad_norm": 0.1809167116880417, "kl": 0.053466796875, "learning_rate": 7.905605253362203e-08, "loss": 0.0005340985953807831, "memory(GiB)": 38.13, "reward": 0.6759840250015259, "reward_std": 0.07317742705345154, "rewards/VisualizationJSONCombinedORM/mean": 0.6759840250015259, "rewards/VisualizationJSONCombinedORM/std": 0.08286216855049133, "step": 3442, "train_speed(iter/s)": 0.087401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 319.25, "completions/min_length": 251.0, "epoch": 2.847808105872622, "grad_norm": 0.19054014980793, "kl": 0.0595703125, "learning_rate": 7.820593137957244e-08, "loss": 0.0005964934825897217, "memory(GiB)": 38.13, "reward": 0.561479926109314, "reward_std": 0.07870117574930191, "rewards/VisualizationJSONCombinedORM/mean": 0.561479926109314, "rewards/VisualizationJSONCombinedORM/std": 0.09478174895048141, "step": 3443, "train_speed(iter/s)": 0.087367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 305.625, "completions/min_length": 236.0, "epoch": 2.8486352357320097, "grad_norm": 0.19980013370513916, "kl": 0.05413818359375, "learning_rate": 7.736036978463202e-08, "loss": 0.0005428530275821686, "memory(GiB)": 38.13, "reward": 0.4568398892879486, "reward_std": 0.07487568259239197, "rewards/VisualizationJSONCombinedORM/mean": 0.4568398892879486, "rewards/VisualizationJSONCombinedORM/std": 0.1335972398519516, "step": 3444, "train_speed(iter/s)": 0.087336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 280.125, "completions/min_length": 248.0, "epoch": 2.849462365591398, "grad_norm": 0.14430388808250427, "kl": 0.05322265625, "learning_rate": 7.651936853213193e-08, "loss": 0.0005328953266143799, "memory(GiB)": 38.13, "reward": 0.5783815383911133, "reward_std": 0.055785875767469406, "rewards/VisualizationJSONCombinedORM/mean": 0.5783815383911133, "rewards/VisualizationJSONCombinedORM/std": 0.13952107727527618, "step": 3445, "train_speed(iter/s)": 0.087315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 297.875, "completions/min_length": 227.0, "epoch": 2.850289495450786, "grad_norm": 0.2047688066959381, "kl": 0.07025146484375, "learning_rate": 7.56829284011762e-08, "loss": 0.0007036179304122925, "memory(GiB)": 38.13, "reward": 0.6241722106933594, "reward_std": 0.08964310586452484, "rewards/VisualizationJSONCombinedORM/mean": 0.6241722106933594, "rewards/VisualizationJSONCombinedORM/std": 0.08777610957622528, "step": 3446, "train_speed(iter/s)": 0.08729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 322.0, "completions/min_length": 240.0, "epoch": 2.8511166253101736, "grad_norm": 0.2152300775051117, "kl": 0.072509765625, "learning_rate": 7.485105016664551e-08, "loss": 0.000722743570804596, "memory(GiB)": 38.13, "reward": 0.5554001927375793, "reward_std": 0.055782418698072433, "rewards/VisualizationJSONCombinedORM/mean": 0.5554001927375793, "rewards/VisualizationJSONCombinedORM/std": 0.12873688340187073, "step": 3447, "train_speed(iter/s)": 0.087257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 309.9375, "completions/min_length": 237.0, "epoch": 2.851943755169562, "grad_norm": 0.21506890654563904, "kl": 0.0533447265625, "learning_rate": 7.402373459919231e-08, "loss": 0.0005333423614501953, "memory(GiB)": 38.13, "reward": 0.5167628526687622, "reward_std": 0.1065884679555893, "rewards/VisualizationJSONCombinedORM/mean": 0.5167628526687622, "rewards/VisualizationJSONCombinedORM/std": 0.2016115039587021, "step": 3448, "train_speed(iter/s)": 0.087224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 326.8125, "completions/min_length": 234.0, "epoch": 2.8527708850289497, "grad_norm": 0.19612069427967072, "kl": 0.0947265625, "learning_rate": 7.320098246524465e-08, "loss": 0.0009459443390369415, "memory(GiB)": 38.13, "reward": 0.7344555258750916, "reward_std": 0.08755412697792053, "rewards/VisualizationJSONCombinedORM/mean": 0.7344555258750916, "rewards/VisualizationJSONCombinedORM/std": 0.086490698158741, "step": 3449, "train_speed(iter/s)": 0.08718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 322.4375, "completions/min_length": 227.0, "epoch": 2.8535980148883375, "grad_norm": 0.18137268722057343, "kl": 0.05755615234375, "learning_rate": 7.238279452700004e-08, "loss": 0.0005762092769145966, "memory(GiB)": 38.13, "reward": 0.3386910855770111, "reward_std": 0.02662644535303116, "rewards/VisualizationJSONCombinedORM/mean": 0.3386910855770111, "rewards/VisualizationJSONCombinedORM/std": 0.03285061940550804, "step": 3450, "train_speed(iter/s)": 0.087153 }, { "epoch": 2.8535980148883375, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 344.875, "eval_completions/mean_length": 294.0885416666667, "eval_completions/min_length": 243.83333333333334, "eval_kl": 0.07099405924479167, "eval_loss": 0.0007142734830267727, "eval_reward": 0.4587616690744956, "eval_reward_std": 0.06240627317068478, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4587616690744956, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0624062759646525, "eval_runtime": 299.247, "eval_samples_per_second": 0.08, "eval_steps_per_second": 0.01, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 306.6875, "completions/min_length": 254.0, "epoch": 2.8544251447477254, "grad_norm": 0.18297079205513, "kl": 0.0626220703125, "learning_rate": 7.156917154243048e-08, "loss": 0.0006260685622692108, "memory(GiB)": 38.13, "reward": 0.559524655342102, "reward_std": 0.057342417538166046, "rewards/VisualizationJSONCombinedORM/mean": 0.559524655342102, "rewards/VisualizationJSONCombinedORM/std": 0.055472567677497864, "step": 3451, "train_speed(iter/s)": 0.086474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 284.375, "completions/min_length": 224.0, "epoch": 2.855252274607113, "grad_norm": 0.1748414933681488, "kl": 0.03680419921875, "learning_rate": 7.076011426527696e-08, "loss": 0.0003678537905216217, "memory(GiB)": 38.13, "reward": 0.37179917097091675, "reward_std": 0.0379662923514843, "rewards/VisualizationJSONCombinedORM/mean": 0.37179917097091675, "rewards/VisualizationJSONCombinedORM/std": 0.10836013406515121, "step": 3452, "train_speed(iter/s)": 0.086447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 318.375, "completions/min_length": 244.0, "epoch": 2.8560794044665014, "grad_norm": 0.17349325120449066, "kl": 0.0399169921875, "learning_rate": 6.995562344505213e-08, "loss": 0.0003988891839981079, "memory(GiB)": 38.13, "reward": 0.40628302097320557, "reward_std": 0.03015226684510708, "rewards/VisualizationJSONCombinedORM/mean": 0.40628302097320557, "rewards/VisualizationJSONCombinedORM/std": 0.11982813477516174, "step": 3453, "train_speed(iter/s)": 0.086418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 298.3125, "completions/min_length": 241.0, "epoch": 2.8569065343258893, "grad_norm": 0.2001788318157196, "kl": 0.0836181640625, "learning_rate": 6.915569982703819e-08, "loss": 0.0008375011384487152, "memory(GiB)": 38.13, "reward": 0.4732441008090973, "reward_std": 0.06470675021409988, "rewards/VisualizationJSONCombinedORM/mean": 0.4732441008090973, "rewards/VisualizationJSONCombinedORM/std": 0.07932239770889282, "step": 3454, "train_speed(iter/s)": 0.086394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 280.5625, "completions/min_length": 233.0, "epoch": 2.857733664185277, "grad_norm": 0.2058744728565216, "kl": 0.1446533203125, "learning_rate": 6.836034415228509e-08, "loss": 0.001443319022655487, "memory(GiB)": 38.13, "reward": 0.5991730690002441, "reward_std": 0.07096852362155914, "rewards/VisualizationJSONCombinedORM/mean": 0.5991730690002441, "rewards/VisualizationJSONCombinedORM/std": 0.07962675392627716, "step": 3455, "train_speed(iter/s)": 0.086371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 287.75, "completions/min_length": 245.0, "epoch": 2.858560794044665, "grad_norm": 0.19047993421554565, "kl": 0.121337890625, "learning_rate": 6.756955715761127e-08, "loss": 0.0012146756052970886, "memory(GiB)": 38.13, "reward": 0.4393329620361328, "reward_std": 0.035888317972421646, "rewards/VisualizationJSONCombinedORM/mean": 0.4393329620361328, "rewards/VisualizationJSONCombinedORM/std": 0.10470631718635559, "step": 3456, "train_speed(iter/s)": 0.086338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 295.75, "completions/min_length": 234.0, "epoch": 2.8593879239040527, "grad_norm": 0.18185114860534668, "kl": 0.13140869140625, "learning_rate": 6.678333957560513e-08, "loss": 0.0013162419199943542, "memory(GiB)": 38.13, "reward": 0.7168760895729065, "reward_std": 0.07492700219154358, "rewards/VisualizationJSONCombinedORM/mean": 0.7168760895729065, "rewards/VisualizationJSONCombinedORM/std": 0.07353977113962173, "step": 3457, "train_speed(iter/s)": 0.086317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 276.1875, "completions/min_length": 236.0, "epoch": 2.860215053763441, "grad_norm": 0.19882149994373322, "kl": 0.0631103515625, "learning_rate": 6.600169213461849e-08, "loss": 0.000629521906375885, "memory(GiB)": 38.13, "reward": 0.48868656158447266, "reward_std": 0.07305851578712463, "rewards/VisualizationJSONCombinedORM/mean": 0.48868656158447266, "rewards/VisualizationJSONCombinedORM/std": 0.23357751965522766, "step": 3458, "train_speed(iter/s)": 0.086276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 300.5625, "completions/min_length": 243.0, "epoch": 2.861042183622829, "grad_norm": 0.18480148911476135, "kl": 0.0592041015625, "learning_rate": 6.522461555877213e-08, "loss": 0.0005917176604270935, "memory(GiB)": 38.13, "reward": 0.5694183707237244, "reward_std": 0.06164320185780525, "rewards/VisualizationJSONCombinedORM/mean": 0.5694183707237244, "rewards/VisualizationJSONCombinedORM/std": 0.187912255525589, "step": 3459, "train_speed(iter/s)": 0.086246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 274.3125, "completions/min_length": 251.0, "epoch": 2.8618693134822166, "grad_norm": 0.17614707350730896, "kl": 0.0802001953125, "learning_rate": 6.445211056794965e-08, "loss": 0.0008017346262931824, "memory(GiB)": 38.13, "reward": 0.36355412006378174, "reward_std": 0.030668608844280243, "rewards/VisualizationJSONCombinedORM/mean": 0.36355412006378174, "rewards/VisualizationJSONCombinedORM/std": 0.1160801500082016, "step": 3460, "train_speed(iter/s)": 0.086218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 326.5625, "completions/min_length": 273.0, "epoch": 2.862696443341605, "grad_norm": 0.2075710892677307, "kl": 0.04718017578125, "learning_rate": 6.368417787780246e-08, "loss": 0.0004722345620393753, "memory(GiB)": 38.13, "reward": 0.43765100836753845, "reward_std": 0.06851876527070999, "rewards/VisualizationJSONCombinedORM/mean": 0.43765100836753845, "rewards/VisualizationJSONCombinedORM/std": 0.07933993637561798, "step": 3461, "train_speed(iter/s)": 0.086188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 319.0, "completions/min_length": 237.0, "epoch": 2.8635235732009927, "grad_norm": 0.24407002329826355, "kl": 0.07135009765625, "learning_rate": 6.292081819974427e-08, "loss": 0.0007128007709980011, "memory(GiB)": 38.13, "reward": 0.5892595052719116, "reward_std": 0.06809136271476746, "rewards/VisualizationJSONCombinedORM/mean": 0.5892595052719116, "rewards/VisualizationJSONCombinedORM/std": 0.17047081887722015, "step": 3462, "train_speed(iter/s)": 0.086165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 304.3125, "completions/min_length": 236.0, "epoch": 2.8643507030603805, "grad_norm": 0.17673338949680328, "kl": 0.0692138671875, "learning_rate": 6.216203224095386e-08, "loss": 0.0006912276148796082, "memory(GiB)": 38.13, "reward": 0.7478584051132202, "reward_std": 0.031607769429683685, "rewards/VisualizationJSONCombinedORM/mean": 0.7478584051132202, "rewards/VisualizationJSONCombinedORM/std": 0.05439452826976776, "step": 3463, "train_speed(iter/s)": 0.086143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 291.1875, "completions/min_length": 235.0, "epoch": 2.8651778329197684, "grad_norm": 0.23262356221675873, "kl": 0.0614013671875, "learning_rate": 6.140782070437002e-08, "loss": 0.0006133466958999634, "memory(GiB)": 38.13, "reward": 0.49865493178367615, "reward_std": 0.06249484419822693, "rewards/VisualizationJSONCombinedORM/mean": 0.49865493178367615, "rewards/VisualizationJSONCombinedORM/std": 0.11647342145442963, "step": 3464, "train_speed(iter/s)": 0.086121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 277.25, "completions/min_length": 225.0, "epoch": 2.866004962779156, "grad_norm": 0.18242718279361725, "kl": 0.1204833984375, "learning_rate": 6.065818428869774e-08, "loss": 0.0012055188417434692, "memory(GiB)": 38.13, "reward": 0.6155290603637695, "reward_std": 0.08660553395748138, "rewards/VisualizationJSONCombinedORM/mean": 0.6155290603637695, "rewards/VisualizationJSONCombinedORM/std": 0.09480222314596176, "step": 3465, "train_speed(iter/s)": 0.086098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 280.3125, "completions/min_length": 238.0, "epoch": 2.8668320926385444, "grad_norm": 0.1748100221157074, "kl": 0.068603515625, "learning_rate": 5.991312368839986e-08, "loss": 0.0006851786747574806, "memory(GiB)": 38.13, "reward": 0.44112706184387207, "reward_std": 0.03856329619884491, "rewards/VisualizationJSONCombinedORM/mean": 0.44112706184387207, "rewards/VisualizationJSONCombinedORM/std": 0.2208859771490097, "step": 3466, "train_speed(iter/s)": 0.086072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 281.5, "completions/min_length": 228.0, "epoch": 2.8676592224979323, "grad_norm": 0.18035565316677094, "kl": 0.088134765625, "learning_rate": 5.917263959370312e-08, "loss": 0.0008802786469459534, "memory(GiB)": 38.13, "reward": 0.3775733709335327, "reward_std": 0.05132380872964859, "rewards/VisualizationJSONCombinedORM/mean": 0.3775733709335327, "rewards/VisualizationJSONCombinedORM/std": 0.22358675301074982, "step": 3467, "train_speed(iter/s)": 0.08604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 291.875, "completions/min_length": 205.0, "epoch": 2.86848635235732, "grad_norm": 0.18178656697273254, "kl": 0.05084228515625, "learning_rate": 5.843673269059269e-08, "loss": 0.0005080066621303558, "memory(GiB)": 38.13, "reward": 0.5370545387268066, "reward_std": 0.05308890342712402, "rewards/VisualizationJSONCombinedORM/mean": 0.5370545387268066, "rewards/VisualizationJSONCombinedORM/std": 0.052284806966781616, "step": 3468, "train_speed(iter/s)": 0.086012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 289.0, "completions/min_length": 246.0, "epoch": 2.869313482216708, "grad_norm": 0.16570068895816803, "kl": 0.064208984375, "learning_rate": 5.770540366081434e-08, "loss": 0.0006420612335205078, "memory(GiB)": 38.13, "reward": 0.32590922713279724, "reward_std": 0.02466975897550583, "rewards/VisualizationJSONCombinedORM/mean": 0.32590922713279724, "rewards/VisualizationJSONCombinedORM/std": 0.02729758992791176, "step": 3469, "train_speed(iter/s)": 0.085978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 258.25, "completions/min_length": 204.0, "epoch": 2.8701406120760957, "grad_norm": 0.22855737805366516, "kl": 0.0584716796875, "learning_rate": 5.697865318187279e-08, "loss": 0.0005845576524734497, "memory(GiB)": 38.13, "reward": 0.49003005027770996, "reward_std": 0.07166872173547745, "rewards/VisualizationJSONCombinedORM/mean": 0.49003005027770996, "rewards/VisualizationJSONCombinedORM/std": 0.1793537586927414, "step": 3470, "train_speed(iter/s)": 0.085962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 268.6875, "completions/min_length": 226.0, "epoch": 2.870967741935484, "grad_norm": 0.23574116826057434, "kl": 0.142822265625, "learning_rate": 5.625648192703115e-08, "loss": 0.0014280527830123901, "memory(GiB)": 38.13, "reward": 0.43365317583084106, "reward_std": 0.09296358376741409, "rewards/VisualizationJSONCombinedORM/mean": 0.43365317583084106, "rewards/VisualizationJSONCombinedORM/std": 0.09958775341510773, "step": 3471, "train_speed(iter/s)": 0.085932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 299.875, "completions/min_length": 232.0, "epoch": 2.871794871794872, "grad_norm": 0.22005286812782288, "kl": 0.06866455078125, "learning_rate": 5.55388905653087e-08, "loss": 0.0006872080266475677, "memory(GiB)": 38.13, "reward": 0.4585394859313965, "reward_std": 0.08399622142314911, "rewards/VisualizationJSONCombinedORM/mean": 0.4585394859313965, "rewards/VisualizationJSONCombinedORM/std": 0.1883333921432495, "step": 3472, "train_speed(iter/s)": 0.085904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 286.75, "completions/min_length": 228.0, "epoch": 2.8726220016542596, "grad_norm": 0.20604762434959412, "kl": 0.141357421875, "learning_rate": 5.4825879761485904e-08, "loss": 0.0014132633805274963, "memory(GiB)": 38.13, "reward": 0.5678507089614868, "reward_std": 0.08707758784294128, "rewards/VisualizationJSONCombinedORM/mean": 0.5678507089614868, "rewards/VisualizationJSONCombinedORM/std": 0.1469290405511856, "step": 3473, "train_speed(iter/s)": 0.085868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 328.0625, "completions/min_length": 234.0, "epoch": 2.873449131513648, "grad_norm": 0.19156713783740997, "kl": 0.04486083984375, "learning_rate": 5.411745017609493e-08, "loss": 0.0004484206438064575, "memory(GiB)": 38.13, "reward": 0.5279285311698914, "reward_std": 0.05464150756597519, "rewards/VisualizationJSONCombinedORM/mean": 0.5279285311698914, "rewards/VisualizationJSONCombinedORM/std": 0.05718393996357918, "step": 3474, "train_speed(iter/s)": 0.085836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 300.0625, "completions/min_length": 225.0, "epoch": 2.8742762613730357, "grad_norm": 0.1635889858007431, "kl": 0.0345458984375, "learning_rate": 5.341360246542804e-08, "loss": 0.00034554023295640945, "memory(GiB)": 38.13, "reward": 0.5415737628936768, "reward_std": 0.049288176000118256, "rewards/VisualizationJSONCombinedORM/mean": 0.5415737628936768, "rewards/VisualizationJSONCombinedORM/std": 0.10231886059045792, "step": 3475, "train_speed(iter/s)": 0.085803 }, { "epoch": 2.8742762613730357, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 361.7083333333333, "eval_completions/mean_length": 296.484375, "eval_completions/min_length": 247.83333333333334, "eval_kl": 0.07236735026041667, "eval_loss": 0.0007246856694109738, "eval_reward": 0.45553318535288173, "eval_reward_std": 0.054534605937078595, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45553318535288173, "eval_rewards/VisualizationJSONCombinedORM/std": 0.054534606325129666, "eval_runtime": 309.9556, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 3475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 316.9375, "completions/min_length": 247.0, "epoch": 2.8751033912324235, "grad_norm": 0.17494699358940125, "kl": 0.1048583984375, "learning_rate": 5.27143372815303e-08, "loss": 0.0010517984628677368, "memory(GiB)": 38.13, "reward": 0.5476438999176025, "reward_std": 0.06340140849351883, "rewards/VisualizationJSONCombinedORM/mean": 0.5476438999176025, "rewards/VisualizationJSONCombinedORM/std": 0.11537284404039383, "step": 3476, "train_speed(iter/s)": 0.085111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 294.0, "completions/min_length": 244.0, "epoch": 2.8759305210918114, "grad_norm": 0.1679370403289795, "kl": 0.0516357421875, "learning_rate": 5.201965527220188e-08, "loss": 0.0005147233605384827, "memory(GiB)": 38.13, "reward": 0.5747843980789185, "reward_std": 0.06861035525798798, "rewards/VisualizationJSONCombinedORM/mean": 0.5747843980789185, "rewards/VisualizationJSONCombinedORM/std": 0.19141753017902374, "step": 3477, "train_speed(iter/s)": 0.085089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 302.0, "completions/min_length": 259.0, "epoch": 2.876757650951199, "grad_norm": 0.26315245032310486, "kl": 0.03973388671875, "learning_rate": 5.132955708099796e-08, "loss": 0.00039754435420036316, "memory(GiB)": 38.13, "reward": 0.6919851303100586, "reward_std": 0.09261814504861832, "rewards/VisualizationJSONCombinedORM/mean": 0.6919851303100586, "rewards/VisualizationJSONCombinedORM/std": 0.10750333219766617, "step": 3478, "train_speed(iter/s)": 0.085063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 296.25, "completions/min_length": 249.0, "epoch": 2.8775847808105874, "grad_norm": 0.2136838436126709, "kl": 0.0633544921875, "learning_rate": 5.0644043347226615e-08, "loss": 0.0006332099437713623, "memory(GiB)": 38.13, "reward": 0.6803193688392639, "reward_std": 0.058230072259902954, "rewards/VisualizationJSONCombinedORM/mean": 0.6803193688392639, "rewards/VisualizationJSONCombinedORM/std": 0.0765281543135643, "step": 3479, "train_speed(iter/s)": 0.085036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 311.4375, "completions/min_length": 243.0, "epoch": 2.8784119106699753, "grad_norm": 0.19040003418922424, "kl": 0.04852294921875, "learning_rate": 4.996311470594928e-08, "loss": 0.0004862286150455475, "memory(GiB)": 38.13, "reward": 0.2614853084087372, "reward_std": 0.032937876880168915, "rewards/VisualizationJSONCombinedORM/mean": 0.2614853084087372, "rewards/VisualizationJSONCombinedORM/std": 0.11806020885705948, "step": 3480, "train_speed(iter/s)": 0.085007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 304.9375, "completions/min_length": 243.0, "epoch": 2.879239040529363, "grad_norm": 0.1574055254459381, "kl": 0.0291748046875, "learning_rate": 4.9286771787979696e-08, "loss": 0.000290602445602417, "memory(GiB)": 38.13, "reward": 0.5271216630935669, "reward_std": 0.053765565156936646, "rewards/VisualizationJSONCombinedORM/mean": 0.5271216630935669, "rewards/VisualizationJSONCombinedORM/std": 0.22373655438423157, "step": 3481, "train_speed(iter/s)": 0.08498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 354.0, "completions/min_length": 294.0, "epoch": 2.880066170388751, "grad_norm": 0.16833996772766113, "kl": 0.0845947265625, "learning_rate": 4.861501521988221e-08, "loss": 0.0008476302027702332, "memory(GiB)": 38.13, "reward": 0.27691930532455444, "reward_std": 0.03601172938942909, "rewards/VisualizationJSONCombinedORM/mean": 0.27691930532455444, "rewards/VisualizationJSONCombinedORM/std": 0.09031267464160919, "step": 3482, "train_speed(iter/s)": 0.084944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 313.5625, "completions/min_length": 253.0, "epoch": 2.8808933002481387, "grad_norm": 0.16233430802822113, "kl": 0.044219970703125, "learning_rate": 4.794784562397459e-08, "loss": 0.0004406943917274475, "memory(GiB)": 38.13, "reward": 0.641385018825531, "reward_std": 0.08511538803577423, "rewards/VisualizationJSONCombinedORM/mean": 0.641385018825531, "rewards/VisualizationJSONCombinedORM/std": 0.16006790101528168, "step": 3483, "train_speed(iter/s)": 0.084906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 294.5, "completions/min_length": 222.0, "epoch": 2.881720430107527, "grad_norm": 0.26179131865501404, "kl": 0.04864501953125, "learning_rate": 4.728526361832242e-08, "loss": 0.0004865899682044983, "memory(GiB)": 38.13, "reward": 0.27116167545318604, "reward_std": 0.04085332527756691, "rewards/VisualizationJSONCombinedORM/mean": 0.27116167545318604, "rewards/VisualizationJSONCombinedORM/std": 0.09619844704866409, "step": 3484, "train_speed(iter/s)": 0.084877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 318.875, "completions/min_length": 249.0, "epoch": 2.882547559966915, "grad_norm": 0.1881534904241562, "kl": 0.04534912109375, "learning_rate": 4.6627269816744704e-08, "loss": 0.00045218318700790405, "memory(GiB)": 38.13, "reward": 0.39368295669555664, "reward_std": 0.025357592850923538, "rewards/VisualizationJSONCombinedORM/mean": 0.39368295669555664, "rewards/VisualizationJSONCombinedORM/std": 0.1070983037352562, "step": 3485, "train_speed(iter/s)": 0.084848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 320.75, "completions/min_length": 249.0, "epoch": 2.8833746898263026, "grad_norm": 0.20873446762561798, "kl": 0.04156494140625, "learning_rate": 4.597386482880717e-08, "loss": 0.0004156380891799927, "memory(GiB)": 38.13, "reward": 0.7245148420333862, "reward_std": 0.06568378210067749, "rewards/VisualizationJSONCombinedORM/mean": 0.7245148420333862, "rewards/VisualizationJSONCombinedORM/std": 0.07663211971521378, "step": 3486, "train_speed(iter/s)": 0.084809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 308.875, "completions/min_length": 253.0, "epoch": 2.884201819685691, "grad_norm": 0.1717509627342224, "kl": 0.036590576171875, "learning_rate": 4.532504925982506e-08, "loss": 0.000364936888217926, "memory(GiB)": 38.13, "reward": 0.3224307894706726, "reward_std": 0.028406547382473946, "rewards/VisualizationJSONCombinedORM/mean": 0.3224307894706726, "rewards/VisualizationJSONCombinedORM/std": 0.1117459312081337, "step": 3487, "train_speed(iter/s)": 0.084781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 276.6875, "completions/min_length": 214.0, "epoch": 2.8850289495450787, "grad_norm": 0.23484806716442108, "kl": 0.0709228515625, "learning_rate": 4.468082371086313e-08, "loss": 0.0007100477814674377, "memory(GiB)": 38.13, "reward": 0.5514437556266785, "reward_std": 0.05555901676416397, "rewards/VisualizationJSONCombinedORM/mean": 0.5514437556266785, "rewards/VisualizationJSONCombinedORM/std": 0.11801554262638092, "step": 3488, "train_speed(iter/s)": 0.084754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 298.125, "completions/min_length": 227.0, "epoch": 2.8858560794044665, "grad_norm": 0.16678276658058167, "kl": 0.0867919921875, "learning_rate": 4.404118877873176e-08, "loss": 0.0008672326803207397, "memory(GiB)": 38.13, "reward": 0.5157493352890015, "reward_std": 0.046715013682842255, "rewards/VisualizationJSONCombinedORM/mean": 0.5157493352890015, "rewards/VisualizationJSONCombinedORM/std": 0.1451420783996582, "step": 3489, "train_speed(iter/s)": 0.084724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 314.25, "completions/min_length": 246.0, "epoch": 2.8866832092638544, "grad_norm": 0.21413840353488922, "kl": 0.0704345703125, "learning_rate": 4.340614505599194e-08, "loss": 0.0007037408649921417, "memory(GiB)": 38.13, "reward": 0.4901743531227112, "reward_std": 0.059721335768699646, "rewards/VisualizationJSONCombinedORM/mean": 0.4901743531227112, "rewards/VisualizationJSONCombinedORM/std": 0.22647041082382202, "step": 3490, "train_speed(iter/s)": 0.084695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 280.1875, "completions/min_length": 238.0, "epoch": 2.887510339123242, "grad_norm": 0.2726491689682007, "kl": 0.056640625, "learning_rate": 4.2775693130948094e-08, "loss": 0.0005659386515617371, "memory(GiB)": 38.13, "reward": 0.6108561754226685, "reward_std": 0.06747496128082275, "rewards/VisualizationJSONCombinedORM/mean": 0.6108561754226685, "rewards/VisualizationJSONCombinedORM/std": 0.12178442627191544, "step": 3491, "train_speed(iter/s)": 0.084668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 305.0625, "completions/min_length": 242.0, "epoch": 2.8883374689826304, "grad_norm": 0.16322387754917145, "kl": 0.1021728515625, "learning_rate": 4.21498335876519e-08, "loss": 0.001018136739730835, "memory(GiB)": 38.13, "reward": 0.607763946056366, "reward_std": 0.06598387658596039, "rewards/VisualizationJSONCombinedORM/mean": 0.607763946056366, "rewards/VisualizationJSONCombinedORM/std": 0.1076364815235138, "step": 3492, "train_speed(iter/s)": 0.084641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 314.875, "completions/min_length": 255.0, "epoch": 2.8891645988420183, "grad_norm": 0.39678481221199036, "kl": 0.325927734375, "learning_rate": 4.15285670059018e-08, "loss": 0.003264930099248886, "memory(GiB)": 38.13, "reward": 0.4257749319076538, "reward_std": 0.05978107452392578, "rewards/VisualizationJSONCombinedORM/mean": 0.4257749319076538, "rewards/VisualizationJSONCombinedORM/std": 0.2193576842546463, "step": 3493, "train_speed(iter/s)": 0.084608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 304.5625, "completions/min_length": 236.0, "epoch": 2.889991728701406, "grad_norm": 0.21285691857337952, "kl": 0.0980224609375, "learning_rate": 4.0911893961239066e-08, "loss": 0.0009813345968723297, "memory(GiB)": 38.13, "reward": 0.43370455503463745, "reward_std": 0.046120621263980865, "rewards/VisualizationJSONCombinedORM/mean": 0.43370455503463745, "rewards/VisualizationJSONCombinedORM/std": 0.10890980064868927, "step": 3494, "train_speed(iter/s)": 0.084585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 309.1875, "completions/min_length": 260.0, "epoch": 2.890818858560794, "grad_norm": 0.18587647378444672, "kl": 0.08209228515625, "learning_rate": 4.029981502495117e-08, "loss": 0.0008212029933929443, "memory(GiB)": 38.13, "reward": 0.47877198457717896, "reward_std": 0.06616969406604767, "rewards/VisualizationJSONCombinedORM/mean": 0.47877198457717896, "rewards/VisualizationJSONCombinedORM/std": 0.07191549986600876, "step": 3495, "train_speed(iter/s)": 0.084558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 313.125, "completions/min_length": 244.0, "epoch": 2.8916459884201817, "grad_norm": 0.1817069798707962, "kl": 0.111083984375, "learning_rate": 3.969233076407009e-08, "loss": 0.0011106133460998535, "memory(GiB)": 38.13, "reward": 0.3237909972667694, "reward_std": 0.03104604408144951, "rewards/VisualizationJSONCombinedORM/mean": 0.3237909972667694, "rewards/VisualizationJSONCombinedORM/std": 0.09133362025022507, "step": 3496, "train_speed(iter/s)": 0.084522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 295.4375, "completions/min_length": 256.0, "epoch": 2.89247311827957, "grad_norm": 0.17630448937416077, "kl": 0.129150390625, "learning_rate": 3.9089441741368974e-08, "loss": 0.0012920871376991272, "memory(GiB)": 38.13, "reward": 0.3527696430683136, "reward_std": 0.04564494639635086, "rewards/VisualizationJSONCombinedORM/mean": 0.3527696430683136, "rewards/VisualizationJSONCombinedORM/std": 0.1356014758348465, "step": 3497, "train_speed(iter/s)": 0.084499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 286.5, "completions/min_length": 239.0, "epoch": 2.893300248138958, "grad_norm": 0.2194114774465561, "kl": 0.056640625, "learning_rate": 3.8491148515366064e-08, "loss": 0.0005666166543960571, "memory(GiB)": 38.13, "reward": 0.567108154296875, "reward_std": 0.07098804414272308, "rewards/VisualizationJSONCombinedORM/mean": 0.567108154296875, "rewards/VisualizationJSONCombinedORM/std": 0.21965089440345764, "step": 3498, "train_speed(iter/s)": 0.084473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 328.1875, "completions/min_length": 239.0, "epoch": 2.8941273779983456, "grad_norm": 0.22299234569072723, "kl": 0.1011962890625, "learning_rate": 3.7897451640321326e-08, "loss": 0.001009918749332428, "memory(GiB)": 38.13, "reward": 0.40666133165359497, "reward_std": 0.10310429334640503, "rewards/VisualizationJSONCombinedORM/mean": 0.40666133165359497, "rewards/VisualizationJSONCombinedORM/std": 0.10432159155607224, "step": 3499, "train_speed(iter/s)": 0.084441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 282.875, "completions/min_length": 230.0, "epoch": 2.894954507857734, "grad_norm": 0.23188617825508118, "kl": 0.0865478515625, "learning_rate": 3.730835166623647e-08, "loss": 0.0008654221892356873, "memory(GiB)": 38.13, "reward": 0.4724269509315491, "reward_std": 0.06612525880336761, "rewards/VisualizationJSONCombinedORM/mean": 0.4724269509315491, "rewards/VisualizationJSONCombinedORM/std": 0.06474195420742035, "step": 3500, "train_speed(iter/s)": 0.084413 }, { "epoch": 2.894954507857734, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 359.375, "eval_completions/mean_length": 297.8697916666667, "eval_completions/min_length": 246.70833333333334, "eval_kl": 0.06805419921875, "eval_loss": 0.0006839695270173252, "eval_reward": 0.4464279618114233, "eval_reward_std": 0.058488939927580454, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4464279618114233, "eval_rewards/VisualizationJSONCombinedORM/std": 0.058488942217081785, "eval_runtime": 307.8952, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 305.3125, "completions/min_length": 237.0, "epoch": 2.8957816377171217, "grad_norm": 0.2720784544944763, "kl": 0.028045654296875, "learning_rate": 3.672384913885441e-08, "loss": 0.00028024613857269287, "memory(GiB)": 38.13, "reward": 0.6861123442649841, "reward_std": 0.06779676675796509, "rewards/VisualizationJSONCombinedORM/mean": 0.6861123442649841, "rewards/VisualizationJSONCombinedORM/std": 0.07855000346899033, "step": 3501, "train_speed(iter/s)": 0.083761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 299.1875, "completions/min_length": 217.0, "epoch": 2.8966087675765095, "grad_norm": 0.20321539044380188, "kl": 0.06591796875, "learning_rate": 3.6143944599660864e-08, "loss": 0.0006613265722990036, "memory(GiB)": 38.13, "reward": 0.47795137763023376, "reward_std": 0.07418227195739746, "rewards/VisualizationJSONCombinedORM/mean": 0.47795137763023376, "rewards/VisualizationJSONCombinedORM/std": 0.19262121617794037, "step": 3502, "train_speed(iter/s)": 0.083732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 297.0625, "completions/min_length": 211.0, "epoch": 2.8974358974358974, "grad_norm": 0.2026243805885315, "kl": 0.06964111328125, "learning_rate": 3.556863858587833e-08, "loss": 0.0006986334919929504, "memory(GiB)": 38.13, "reward": 0.6616848707199097, "reward_std": 0.05946918576955795, "rewards/VisualizationJSONCombinedORM/mean": 0.6616848707199097, "rewards/VisualizationJSONCombinedORM/std": 0.14980511367321014, "step": 3503, "train_speed(iter/s)": 0.083712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 330.3125, "completions/min_length": 262.0, "epoch": 2.898263027295285, "grad_norm": 0.16102482378482819, "kl": 0.0369873046875, "learning_rate": 3.499793163047327e-08, "loss": 0.0003704875707626343, "memory(GiB)": 38.13, "reward": 0.4950534701347351, "reward_std": 0.0366208478808403, "rewards/VisualizationJSONCombinedORM/mean": 0.4950534701347351, "rewards/VisualizationJSONCombinedORM/std": 0.11006724834442139, "step": 3504, "train_speed(iter/s)": 0.083682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 314.375, "completions/min_length": 277.0, "epoch": 2.8990901571546734, "grad_norm": 0.1749953180551529, "kl": 0.05133056640625, "learning_rate": 3.443182426214775e-08, "loss": 0.0005128234624862671, "memory(GiB)": 38.13, "reward": 0.6166704893112183, "reward_std": 0.05844647437334061, "rewards/VisualizationJSONCombinedORM/mean": 0.6166704893112183, "rewards/VisualizationJSONCombinedORM/std": 0.17365889251232147, "step": 3505, "train_speed(iter/s)": 0.083656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 301.875, "completions/min_length": 230.0, "epoch": 2.8999172870140613, "grad_norm": 0.18945132195949554, "kl": 0.1402587890625, "learning_rate": 3.387031700534671e-08, "loss": 0.0014029107987880707, "memory(GiB)": 38.13, "reward": 0.4966239333152771, "reward_std": 0.06080111861228943, "rewards/VisualizationJSONCombinedORM/mean": 0.4966239333152771, "rewards/VisualizationJSONCombinedORM/std": 0.10815121233463287, "step": 3506, "train_speed(iter/s)": 0.08363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 311.125, "completions/min_length": 247.0, "epoch": 2.900744416873449, "grad_norm": 0.18359285593032837, "kl": 0.064453125, "learning_rate": 3.3313410380250157e-08, "loss": 0.0006455779075622559, "memory(GiB)": 38.13, "reward": 0.33866560459136963, "reward_std": 0.030095163732767105, "rewards/VisualizationJSONCombinedORM/mean": 0.33866560459136963, "rewards/VisualizationJSONCombinedORM/std": 0.04086871072649956, "step": 3507, "train_speed(iter/s)": 0.083602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 316.375, "completions/min_length": 239.0, "epoch": 2.901571546732837, "grad_norm": 0.1975698620080948, "kl": 0.06219482421875, "learning_rate": 3.2761104902778175e-08, "loss": 0.0006232894957065582, "memory(GiB)": 38.13, "reward": 0.35748499631881714, "reward_std": 0.027735639363527298, "rewards/VisualizationJSONCombinedORM/mean": 0.35748499631881714, "rewards/VisualizationJSONCombinedORM/std": 0.0999729186296463, "step": 3508, "train_speed(iter/s)": 0.083562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 260.0, "completions/min_length": 193.0, "epoch": 2.9023986765922247, "grad_norm": 0.176540344953537, "kl": 0.0606689453125, "learning_rate": 3.221340108458704e-08, "loss": 0.0006064325571060181, "memory(GiB)": 38.13, "reward": 0.5947253704071045, "reward_std": 0.08272983133792877, "rewards/VisualizationJSONCombinedORM/mean": 0.5947253704071045, "rewards/VisualizationJSONCombinedORM/std": 0.11847075074911118, "step": 3509, "train_speed(iter/s)": 0.083543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 340.0625, "completions/min_length": 260.0, "epoch": 2.903225806451613, "grad_norm": 0.2615259289741516, "kl": 0.06243896484375, "learning_rate": 3.1670299433070315e-08, "loss": 0.0006245821714401245, "memory(GiB)": 38.13, "reward": 0.589760422706604, "reward_std": 0.10624656826257706, "rewards/VisualizationJSONCombinedORM/mean": 0.589760422706604, "rewards/VisualizationJSONCombinedORM/std": 0.12881003320217133, "step": 3510, "train_speed(iter/s)": 0.083513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 292.625, "completions/min_length": 245.0, "epoch": 2.904052936311001, "grad_norm": 0.17188656330108643, "kl": 0.08544921875, "learning_rate": 3.113180045135944e-08, "loss": 0.0008519068360328674, "memory(GiB)": 38.13, "reward": 0.6803858280181885, "reward_std": 0.05564854294061661, "rewards/VisualizationJSONCombinedORM/mean": 0.6803858280181885, "rewards/VisualizationJSONCombinedORM/std": 0.0757865384221077, "step": 3511, "train_speed(iter/s)": 0.083485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 309.0, "completions/min_length": 268.0, "epoch": 2.9048800661703886, "grad_norm": 0.21753264963626862, "kl": 0.19775390625, "learning_rate": 3.05979046383198e-08, "loss": 0.0019798362627625465, "memory(GiB)": 38.13, "reward": 0.4582580327987671, "reward_std": 0.07260261476039886, "rewards/VisualizationJSONCombinedORM/mean": 0.4582580327987671, "rewards/VisualizationJSONCombinedORM/std": 0.2165965437889099, "step": 3512, "train_speed(iter/s)": 0.083452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 310.75, "completions/min_length": 226.0, "epoch": 2.905707196029777, "grad_norm": 0.20243719220161438, "kl": 0.072021484375, "learning_rate": 3.0068612488554084e-08, "loss": 0.0007198192179203033, "memory(GiB)": 38.13, "reward": 0.6740230321884155, "reward_std": 0.0803258866071701, "rewards/VisualizationJSONCombinedORM/mean": 0.6740230321884155, "rewards/VisualizationJSONCombinedORM/std": 0.08229555189609528, "step": 3513, "train_speed(iter/s)": 0.083417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 313.375, "completions/min_length": 244.0, "epoch": 2.9065343258891647, "grad_norm": 0.1705990731716156, "kl": 0.087158203125, "learning_rate": 2.954392449239951e-08, "loss": 0.0008725002408027649, "memory(GiB)": 38.13, "reward": 0.604567289352417, "reward_std": 0.06312023103237152, "rewards/VisualizationJSONCombinedORM/mean": 0.604567289352417, "rewards/VisualizationJSONCombinedORM/std": 0.18228046596050262, "step": 3514, "train_speed(iter/s)": 0.08339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 308.1875, "completions/min_length": 264.0, "epoch": 2.9073614557485525, "grad_norm": 0.19478954374790192, "kl": 0.05206298828125, "learning_rate": 2.9023841135927822e-08, "loss": 0.0005206577479839325, "memory(GiB)": 38.13, "reward": 0.4040212631225586, "reward_std": 0.06322477012872696, "rewards/VisualizationJSONCombinedORM/mean": 0.4040212631225586, "rewards/VisualizationJSONCombinedORM/std": 0.14461953938007355, "step": 3515, "train_speed(iter/s)": 0.083361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 315.375, "completions/min_length": 259.0, "epoch": 2.9081885856079404, "grad_norm": 0.20880842208862305, "kl": 0.05181884765625, "learning_rate": 2.850836290094472e-08, "loss": 0.0005189254879951477, "memory(GiB)": 38.13, "reward": 0.41581353545188904, "reward_std": 0.046820469200611115, "rewards/VisualizationJSONCombinedORM/mean": 0.41581353545188904, "rewards/VisualizationJSONCombinedORM/std": 0.14332227408885956, "step": 3516, "train_speed(iter/s)": 0.08332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 315.875, "completions/min_length": 243.0, "epoch": 2.909015715467328, "grad_norm": 0.18120020627975464, "kl": 0.0638427734375, "learning_rate": 2.799749026499099e-08, "loss": 0.0006380975246429443, "memory(GiB)": 38.13, "reward": 0.579856812953949, "reward_std": 0.06667116284370422, "rewards/VisualizationJSONCombinedORM/mean": 0.579856812953949, "rewards/VisualizationJSONCombinedORM/std": 0.11982621997594833, "step": 3517, "train_speed(iter/s)": 0.08328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 286.5625, "completions/min_length": 220.0, "epoch": 2.9098428453267164, "grad_norm": 0.1677923947572708, "kl": 0.03717041015625, "learning_rate": 2.749122370133972e-08, "loss": 0.0003714263439178467, "memory(GiB)": 38.13, "reward": 0.49697616696357727, "reward_std": 0.05500202625989914, "rewards/VisualizationJSONCombinedORM/mean": 0.49697616696357727, "rewards/VisualizationJSONCombinedORM/std": 0.13566945493221283, "step": 3518, "train_speed(iter/s)": 0.083251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 290.6875, "completions/min_length": 236.0, "epoch": 2.9106699751861043, "grad_norm": 0.1719685196876526, "kl": 0.0394287109375, "learning_rate": 2.6989563678996856e-08, "loss": 0.0003955960273742676, "memory(GiB)": 38.13, "reward": 0.6047279834747314, "reward_std": 0.033815547823905945, "rewards/VisualizationJSONCombinedORM/mean": 0.6047279834747314, "rewards/VisualizationJSONCombinedORM/std": 0.1629665046930313, "step": 3519, "train_speed(iter/s)": 0.083225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 294.25, "completions/min_length": 224.0, "epoch": 2.911497105045492, "grad_norm": 0.1808471828699112, "kl": 0.2156982421875, "learning_rate": 2.649251066270231e-08, "loss": 0.0021627750247716904, "memory(GiB)": 38.13, "reward": 0.48873069882392883, "reward_std": 0.07009100168943405, "rewards/VisualizationJSONCombinedORM/mean": 0.48873069882392883, "rewards/VisualizationJSONCombinedORM/std": 0.18192151188850403, "step": 3520, "train_speed(iter/s)": 0.083198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 275.125, "completions/min_length": 207.0, "epoch": 2.91232423490488, "grad_norm": 0.20095250010490417, "kl": 0.0638427734375, "learning_rate": 2.6000065112924966e-08, "loss": 0.0006391368806362152, "memory(GiB)": 38.13, "reward": 0.5739461779594421, "reward_std": 0.05885353684425354, "rewards/VisualizationJSONCombinedORM/mean": 0.5739461779594421, "rewards/VisualizationJSONCombinedORM/std": 0.24427729845046997, "step": 3521, "train_speed(iter/s)": 0.083173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 329.5, "completions/min_length": 265.0, "epoch": 2.9131513647642677, "grad_norm": 0.1712198704481125, "kl": 0.04827880859375, "learning_rate": 2.551222748586879e-08, "loss": 0.00048193708062171936, "memory(GiB)": 38.13, "reward": 0.4986233115196228, "reward_std": 0.03954458609223366, "rewards/VisualizationJSONCombinedORM/mean": 0.4986233115196228, "rewards/VisualizationJSONCombinedORM/std": 0.2924414277076721, "step": 3522, "train_speed(iter/s)": 0.083144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 295.875, "completions/min_length": 237.0, "epoch": 2.913978494623656, "grad_norm": 0.16351847350597382, "kl": 0.04296875, "learning_rate": 2.5028998233467272e-08, "loss": 0.00042954832315444946, "memory(GiB)": 38.13, "reward": 0.5836477875709534, "reward_std": 0.05141662061214447, "rewards/VisualizationJSONCombinedORM/mean": 0.5836477875709534, "rewards/VisualizationJSONCombinedORM/std": 0.2716076374053955, "step": 3523, "train_speed(iter/s)": 0.083114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 291.9375, "completions/min_length": 238.0, "epoch": 2.914805624483044, "grad_norm": 0.1614549160003662, "kl": 0.027099609375, "learning_rate": 2.455037780338454e-08, "loss": 0.0002704467624425888, "memory(GiB)": 38.13, "reward": 0.5052123665809631, "reward_std": 0.06079317629337311, "rewards/VisualizationJSONCombinedORM/mean": 0.5052123665809631, "rewards/VisualizationJSONCombinedORM/std": 0.07698617875576019, "step": 3524, "train_speed(iter/s)": 0.083086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 326.1875, "completions/min_length": 244.0, "epoch": 2.9156327543424316, "grad_norm": 0.18047867715358734, "kl": 0.07171630859375, "learning_rate": 2.4076366639015914e-08, "loss": 0.0007163658738136292, "memory(GiB)": 38.13, "reward": 0.39852452278137207, "reward_std": 0.04789847880601883, "rewards/VisualizationJSONCombinedORM/mean": 0.39852452278137207, "rewards/VisualizationJSONCombinedORM/std": 0.20457158982753754, "step": 3525, "train_speed(iter/s)": 0.083059 }, { "epoch": 2.9156327543424316, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 369.75, "eval_completions/mean_length": 302.640625, "eval_completions/min_length": 250.41666666666666, "eval_kl": 0.07066853841145833, "eval_loss": 0.0007083590026013553, "eval_reward": 0.4582786802202463, "eval_reward_std": 0.06897627360497911, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4582786802202463, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06897627360497911, "eval_runtime": 314.0595, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 3525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 308.6875, "completions/min_length": 243.0, "epoch": 2.91645988420182, "grad_norm": 0.19135476648807526, "kl": 0.08270263671875, "learning_rate": 2.360696517948513e-08, "loss": 0.0008278116583824158, "memory(GiB)": 38.13, "reward": 0.50739586353302, "reward_std": 0.07011014968156815, "rewards/VisualizationJSONCombinedORM/mean": 0.50739586353302, "rewards/VisualizationJSONCombinedORM/std": 0.2138490080833435, "step": 3526, "train_speed(iter/s)": 0.082417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 271.375, "completions/min_length": 236.0, "epoch": 2.9172870140612077, "grad_norm": 0.27119892835617065, "kl": 0.0714111328125, "learning_rate": 2.3142173859647675e-08, "loss": 0.0007134079933166504, "memory(GiB)": 38.13, "reward": 0.5702829360961914, "reward_std": 0.06400463730096817, "rewards/VisualizationJSONCombinedORM/mean": 0.5702829360961914, "rewards/VisualizationJSONCombinedORM/std": 0.14650945365428925, "step": 3527, "train_speed(iter/s)": 0.082389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 303.0625, "completions/min_length": 253.0, "epoch": 2.9181141439205955, "grad_norm": 0.18172109127044678, "kl": 0.0604248046875, "learning_rate": 2.26819931100869e-08, "loss": 0.0006032288074493408, "memory(GiB)": 38.13, "reward": 0.5214705467224121, "reward_std": 0.053540535271167755, "rewards/VisualizationJSONCombinedORM/mean": 0.5214705467224121, "rewards/VisualizationJSONCombinedORM/std": 0.1251235008239746, "step": 3528, "train_speed(iter/s)": 0.082354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 285.25, "completions/min_length": 243.0, "epoch": 2.9189412737799834, "grad_norm": 0.11183535307645798, "kl": 0.05682373046875, "learning_rate": 2.2226423357114556e-08, "loss": 0.0005679516470991075, "memory(GiB)": 38.13, "reward": 0.7797876000404358, "reward_std": 0.0162399522960186, "rewards/VisualizationJSONCombinedORM/mean": 0.7797876000404358, "rewards/VisualizationJSONCombinedORM/std": 0.22851413488388062, "step": 3529, "train_speed(iter/s)": 0.082327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 325.375, "completions/min_length": 281.0, "epoch": 2.919768403639371, "grad_norm": 0.1975174993276596, "kl": 0.0721435546875, "learning_rate": 2.1775465022771946e-08, "loss": 0.0007204804569482803, "memory(GiB)": 38.13, "reward": 0.38440579175949097, "reward_std": 0.03694412112236023, "rewards/VisualizationJSONCombinedORM/mean": 0.38440579175949097, "rewards/VisualizationJSONCombinedORM/std": 0.10808973014354706, "step": 3530, "train_speed(iter/s)": 0.082291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 335.5625, "completions/min_length": 267.0, "epoch": 2.9205955334987594, "grad_norm": 0.21459196507930756, "kl": 0.0350341796875, "learning_rate": 2.1329118524827662e-08, "loss": 0.00035084038972854614, "memory(GiB)": 38.13, "reward": 0.6334497332572937, "reward_std": 0.08280906826257706, "rewards/VisualizationJSONCombinedORM/mean": 0.6334497332572937, "rewards/VisualizationJSONCombinedORM/std": 0.09688229858875275, "step": 3531, "train_speed(iter/s)": 0.082265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 275.875, "completions/min_length": 211.0, "epoch": 2.9214226633581473, "grad_norm": 0.19290080666542053, "kl": 0.07275390625, "learning_rate": 2.0887384276777055e-08, "loss": 0.0007269084453582764, "memory(GiB)": 38.13, "reward": 0.5955677032470703, "reward_std": 0.06690290570259094, "rewards/VisualizationJSONCombinedORM/mean": 0.5955677032470703, "rewards/VisualizationJSONCombinedORM/std": 0.13768509030342102, "step": 3532, "train_speed(iter/s)": 0.082243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 314.6875, "completions/min_length": 244.0, "epoch": 2.922249793217535, "grad_norm": 0.19202379882335663, "kl": 0.08984375, "learning_rate": 2.0450262687844446e-08, "loss": 0.000896647572517395, "memory(GiB)": 38.13, "reward": 0.5692170262336731, "reward_std": 0.05685849487781525, "rewards/VisualizationJSONCombinedORM/mean": 0.5692170262336731, "rewards/VisualizationJSONCombinedORM/std": 0.18342065811157227, "step": 3533, "train_speed(iter/s)": 0.082213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 306.4375, "completions/min_length": 234.0, "epoch": 2.9230769230769234, "grad_norm": 0.17509378492832184, "kl": 0.05364990234375, "learning_rate": 2.0017754162979795e-08, "loss": 0.0005369633436203003, "memory(GiB)": 38.13, "reward": 0.3897218108177185, "reward_std": 0.05084535852074623, "rewards/VisualizationJSONCombinedORM/mean": 0.3897218108177185, "rewards/VisualizationJSONCombinedORM/std": 0.06561999022960663, "step": 3534, "train_speed(iter/s)": 0.082187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 361.0625, "completions/min_length": 304.0, "epoch": 2.9239040529363107, "grad_norm": 0.25333717465400696, "kl": 0.2255859375, "learning_rate": 1.958985910285982e-08, "loss": 0.002260074019432068, "memory(GiB)": 38.13, "reward": 0.40229833126068115, "reward_std": 0.03826028108596802, "rewards/VisualizationJSONCombinedORM/mean": 0.40229833126068115, "rewards/VisualizationJSONCombinedORM/std": 0.14885981380939484, "step": 3535, "train_speed(iter/s)": 0.08216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 284.0625, "completions/min_length": 230.0, "epoch": 2.924731182795699, "grad_norm": 0.2508731186389923, "kl": 0.06134033203125, "learning_rate": 1.916657790388743e-08, "loss": 0.0006138626486063004, "memory(GiB)": 38.13, "reward": 0.6312741637229919, "reward_std": 0.059353865683078766, "rewards/VisualizationJSONCombinedORM/mean": 0.6312741637229919, "rewards/VisualizationJSONCombinedORM/std": 0.06800487637519836, "step": 3536, "train_speed(iter/s)": 0.082127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 312.5, "completions/min_length": 237.0, "epoch": 2.925558312655087, "grad_norm": 0.19682519137859344, "kl": 0.21240234375, "learning_rate": 1.8747910958191173e-08, "loss": 0.002118341624736786, "memory(GiB)": 38.13, "reward": 0.6646636724472046, "reward_std": 0.04848427698016167, "rewards/VisualizationJSONCombinedORM/mean": 0.6646636724472046, "rewards/VisualizationJSONCombinedORM/std": 0.08503501117229462, "step": 3537, "train_speed(iter/s)": 0.0821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 306.4375, "completions/min_length": 251.0, "epoch": 2.9263854425144746, "grad_norm": 0.26178544759750366, "kl": 0.06396484375, "learning_rate": 1.8333858653624136e-08, "loss": 0.0006388314068317413, "memory(GiB)": 38.13, "reward": 0.3041315972805023, "reward_std": 0.041674524545669556, "rewards/VisualizationJSONCombinedORM/mean": 0.3041315972805023, "rewards/VisualizationJSONCombinedORM/std": 0.05778764933347702, "step": 3538, "train_speed(iter/s)": 0.08207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 298.8125, "completions/min_length": 230.0, "epoch": 2.927212572373863, "grad_norm": 0.21528911590576172, "kl": 0.15740966796875, "learning_rate": 1.7924421373766153e-08, "loss": 0.0015747547149658203, "memory(GiB)": 38.13, "reward": 0.21867603063583374, "reward_std": 0.028763506561517715, "rewards/VisualizationJSONCombinedORM/mean": 0.21867603063583374, "rewards/VisualizationJSONCombinedORM/std": 0.03597385808825493, "step": 3539, "train_speed(iter/s)": 0.082046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 311.75, "completions/min_length": 247.0, "epoch": 2.9280397022332507, "grad_norm": 0.17158544063568115, "kl": 0.0814208984375, "learning_rate": 1.7519599497919926e-08, "loss": 0.0008143596351146698, "memory(GiB)": 38.13, "reward": 0.7157517075538635, "reward_std": 0.05269699543714523, "rewards/VisualizationJSONCombinedORM/mean": 0.7157517075538635, "rewards/VisualizationJSONCombinedORM/std": 0.053274039179086685, "step": 3540, "train_speed(iter/s)": 0.08202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 284.0, "completions/min_length": 244.0, "epoch": 2.9288668320926385, "grad_norm": 0.17952385544776917, "kl": 0.0361328125, "learning_rate": 1.711939340111324e-08, "loss": 0.00036090239882469177, "memory(GiB)": 38.13, "reward": 0.7301343083381653, "reward_std": 0.08262200653553009, "rewards/VisualizationJSONCombinedORM/mean": 0.7301343083381653, "rewards/VisualizationJSONCombinedORM/std": 0.08085083216428757, "step": 3541, "train_speed(iter/s)": 0.081995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 321.125, "completions/min_length": 285.0, "epoch": 2.9296939619520264, "grad_norm": 0.18460896611213684, "kl": 0.03668212890625, "learning_rate": 1.6723803454098408e-08, "loss": 0.0003663972020149231, "memory(GiB)": 38.13, "reward": 0.442977637052536, "reward_std": 0.04910287261009216, "rewards/VisualizationJSONCombinedORM/mean": 0.442977637052536, "rewards/VisualizationJSONCombinedORM/std": 0.2423664778470993, "step": 3542, "train_speed(iter/s)": 0.08197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 308.0, "completions/min_length": 233.0, "epoch": 2.930521091811414, "grad_norm": 0.17020933330059052, "kl": 0.0543212890625, "learning_rate": 1.6332830023350065e-08, "loss": 0.0005437098443508148, "memory(GiB)": 38.13, "reward": 0.46583235263824463, "reward_std": 0.06062822788953781, "rewards/VisualizationJSONCombinedORM/mean": 0.46583235263824463, "rewards/VisualizationJSONCombinedORM/std": 0.1007094755768776, "step": 3543, "train_speed(iter/s)": 0.081938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 287.25, "completions/min_length": 236.0, "epoch": 2.9313482216708024, "grad_norm": 0.20383502542972565, "kl": 0.147705078125, "learning_rate": 1.5946473471066813e-08, "loss": 0.001475416123867035, "memory(GiB)": 38.13, "reward": 0.49877381324768066, "reward_std": 0.05905171483755112, "rewards/VisualizationJSONCombinedORM/mean": 0.49877381324768066, "rewards/VisualizationJSONCombinedORM/std": 0.2112056314945221, "step": 3544, "train_speed(iter/s)": 0.081914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 300.5, "completions/min_length": 232.0, "epoch": 2.9321753515301903, "grad_norm": 0.1761062741279602, "kl": 0.03033447265625, "learning_rate": 1.556473415517068e-08, "loss": 0.00030301138758659363, "memory(GiB)": 38.13, "reward": 0.4922443926334381, "reward_std": 0.04279719665646553, "rewards/VisualizationJSONCombinedORM/mean": 0.4922443926334381, "rewards/VisualizationJSONCombinedORM/std": 0.06116253882646561, "step": 3545, "train_speed(iter/s)": 0.081886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 336.6875, "completions/min_length": 269.0, "epoch": 2.933002481389578, "grad_norm": 0.27134886384010315, "kl": 0.08392333984375, "learning_rate": 1.5187612429304887e-08, "loss": 0.0008395984768867493, "memory(GiB)": 38.13, "reward": 0.3575553297996521, "reward_std": 0.05862317234277725, "rewards/VisualizationJSONCombinedORM/mean": 0.3575553297996521, "rewards/VisualizationJSONCombinedORM/std": 0.138823002576828, "step": 3546, "train_speed(iter/s)": 0.08186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 285.6875, "completions/min_length": 244.0, "epoch": 2.9338296112489664, "grad_norm": 0.17742271721363068, "kl": 0.09698486328125, "learning_rate": 1.481510864283553e-08, "loss": 0.000970788300037384, "memory(GiB)": 38.13, "reward": 0.6844838857650757, "reward_std": 0.10122483968734741, "rewards/VisualizationJSONCombinedORM/mean": 0.6844838857650757, "rewards/VisualizationJSONCombinedORM/std": 0.09853114932775497, "step": 3547, "train_speed(iter/s)": 0.081827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 308.25, "completions/min_length": 261.0, "epoch": 2.934656741108354, "grad_norm": 0.18668076395988464, "kl": 0.037261962890625, "learning_rate": 1.4447223140851562e-08, "loss": 0.00037304311990737915, "memory(GiB)": 38.13, "reward": 0.5069185495376587, "reward_std": 0.06565359979867935, "rewards/VisualizationJSONCombinedORM/mean": 0.5069185495376587, "rewards/VisualizationJSONCombinedORM/std": 0.12857823073863983, "step": 3548, "train_speed(iter/s)": 0.081803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 264.6875, "completions/min_length": 237.0, "epoch": 2.935483870967742, "grad_norm": 0.22581638395786285, "kl": 0.047119140625, "learning_rate": 1.408395626416259e-08, "loss": 0.00047194212675094604, "memory(GiB)": 38.13, "reward": 0.4670451879501343, "reward_std": 0.09842805564403534, "rewards/VisualizationJSONCombinedORM/mean": 0.4670451879501343, "rewards/VisualizationJSONCombinedORM/std": 0.20394852757453918, "step": 3549, "train_speed(iter/s)": 0.081789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 253.0, "completions/min_length": 230.0, "epoch": 2.93631100082713, "grad_norm": 0.19138480722904205, "kl": 0.04205322265625, "learning_rate": 1.372530834929997e-08, "loss": 0.0004208311438560486, "memory(GiB)": 38.13, "reward": 0.5632032752037048, "reward_std": 0.06011917442083359, "rewards/VisualizationJSONCombinedORM/mean": 0.5632032752037048, "rewards/VisualizationJSONCombinedORM/std": 0.2091256082057953, "step": 3550, "train_speed(iter/s)": 0.081766 }, { "epoch": 2.93631100082713, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 357.0416666666667, "eval_completions/mean_length": 300.2864583333333, "eval_completions/min_length": 256.25, "eval_kl": 0.07281494140625, "eval_loss": 0.0007321468438021839, "eval_reward": 0.45052024349570274, "eval_reward_std": 0.056023279942261674, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45052024349570274, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05602328374516219, "eval_runtime": 306.4852, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 311.4375, "completions/min_length": 265.0, "epoch": 2.9371381306865176, "grad_norm": 0.142023503780365, "kl": 0.02239990234375, "learning_rate": 1.3371279728515152e-08, "loss": 0.00022463500499725342, "memory(GiB)": 38.13, "reward": 0.6045253276824951, "reward_std": 0.031306907534599304, "rewards/VisualizationJSONCombinedORM/mean": 0.6045253276824951, "rewards/VisualizationJSONCombinedORM/std": 0.20702482759952545, "step": 3551, "train_speed(iter/s)": 0.081163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 308.9375, "completions/min_length": 225.0, "epoch": 2.937965260545906, "grad_norm": 0.19999094307422638, "kl": 0.04364013671875, "learning_rate": 1.3021870729780783e-08, "loss": 0.00043702125549316406, "memory(GiB)": 38.13, "reward": 0.5370880961418152, "reward_std": 0.056486912071704865, "rewards/VisualizationJSONCombinedORM/mean": 0.5370880961418152, "rewards/VisualizationJSONCombinedORM/std": 0.26497936248779297, "step": 3552, "train_speed(iter/s)": 0.081125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 288.75, "completions/min_length": 242.0, "epoch": 2.9387923904052937, "grad_norm": 0.23016321659088135, "kl": 0.0623779296875, "learning_rate": 1.2677081676790714e-08, "loss": 0.0006234794855117798, "memory(GiB)": 38.13, "reward": 0.6507905721664429, "reward_std": 0.07880762219429016, "rewards/VisualizationJSONCombinedORM/mean": 0.6507905721664429, "rewards/VisualizationJSONCombinedORM/std": 0.08783599734306335, "step": 3553, "train_speed(iter/s)": 0.081095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 313.1875, "completions/min_length": 255.0, "epoch": 2.9396195202646815, "grad_norm": 0.21774055063724518, "kl": 0.056884765625, "learning_rate": 1.2336912888957774e-08, "loss": 0.0005690678954124451, "memory(GiB)": 38.13, "reward": 0.37015998363494873, "reward_std": 0.04488170146942139, "rewards/VisualizationJSONCombinedORM/mean": 0.37015998363494873, "rewards/VisualizationJSONCombinedORM/std": 0.19705967605113983, "step": 3554, "train_speed(iter/s)": 0.081068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 318.375, "completions/min_length": 239.0, "epoch": 2.9404466501240694, "grad_norm": 0.17745532095432281, "kl": 0.038330078125, "learning_rate": 1.200136468141544e-08, "loss": 0.0003833770751953125, "memory(GiB)": 38.13, "reward": 0.7401093244552612, "reward_std": 0.07803849130868912, "rewards/VisualizationJSONCombinedORM/mean": 0.7401093244552612, "rewards/VisualizationJSONCombinedORM/std": 0.09259934723377228, "step": 3555, "train_speed(iter/s)": 0.081044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 295.0, "completions/min_length": 236.0, "epoch": 2.941273779983457, "grad_norm": 0.26067161560058594, "kl": 0.06787109375, "learning_rate": 1.1670437365015053e-08, "loss": 0.0006774663925170898, "memory(GiB)": 38.13, "reward": 0.32822513580322266, "reward_std": 0.0598641000688076, "rewards/VisualizationJSONCombinedORM/mean": 0.32822513580322266, "rewards/VisualizationJSONCombinedORM/std": 0.13012543320655823, "step": 3556, "train_speed(iter/s)": 0.081011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 279.5625, "completions/min_length": 238.0, "epoch": 2.9421009098428454, "grad_norm": 0.1738041490316391, "kl": 0.1317138671875, "learning_rate": 1.1344131246329715e-08, "loss": 0.0013184994459152222, "memory(GiB)": 38.13, "reward": 0.4931415319442749, "reward_std": 0.06701395660638809, "rewards/VisualizationJSONCombinedORM/mean": 0.4931415319442749, "rewards/VisualizationJSONCombinedORM/std": 0.1047460213303566, "step": 3557, "train_speed(iter/s)": 0.080994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 325.5625, "completions/min_length": 237.0, "epoch": 2.9429280397022333, "grad_norm": 0.1880921870470047, "kl": 0.07012939453125, "learning_rate": 1.1022446627649286e-08, "loss": 0.0007002111524343491, "memory(GiB)": 38.13, "reward": 0.6034350395202637, "reward_std": 0.0637376606464386, "rewards/VisualizationJSONCombinedORM/mean": 0.6034350395202637, "rewards/VisualizationJSONCombinedORM/std": 0.09819938987493515, "step": 3558, "train_speed(iter/s)": 0.080966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 270.5, "completions/min_length": 226.0, "epoch": 2.943755169561621, "grad_norm": 0.23619119822978973, "kl": 0.12841796875, "learning_rate": 1.0705383806982606e-08, "loss": 0.0012838486582040787, "memory(GiB)": 38.13, "reward": 0.4593007564544678, "reward_std": 0.08962880074977875, "rewards/VisualizationJSONCombinedORM/mean": 0.4593007564544678, "rewards/VisualizationJSONCombinedORM/std": 0.09224211424589157, "step": 3559, "train_speed(iter/s)": 0.080936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 300.5, "completions/min_length": 217.0, "epoch": 2.9445822994210094, "grad_norm": 0.27790090441703796, "kl": 0.1077880859375, "learning_rate": 1.0392943078057493e-08, "loss": 0.0010789036750793457, "memory(GiB)": 38.13, "reward": 0.42483019828796387, "reward_std": 0.06929007172584534, "rewards/VisualizationJSONCombinedORM/mean": 0.42483019828796387, "rewards/VisualizationJSONCombinedORM/std": 0.14129610359668732, "step": 3560, "train_speed(iter/s)": 0.08091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 290.4375, "completions/min_length": 233.0, "epoch": 2.945409429280397, "grad_norm": 0.16267332434654236, "kl": 0.04632568359375, "learning_rate": 1.008512473032075e-08, "loss": 0.00046402961015701294, "memory(GiB)": 38.13, "reward": 0.5376592874526978, "reward_std": 0.07022961229085922, "rewards/VisualizationJSONCombinedORM/mean": 0.5376592874526978, "rewards/VisualizationJSONCombinedORM/std": 0.11691495776176453, "step": 3561, "train_speed(iter/s)": 0.080891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 344.9375, "completions/min_length": 267.0, "epoch": 2.946236559139785, "grad_norm": 0.18055593967437744, "kl": 0.06396484375, "learning_rate": 9.78192904893427e-09, "loss": 0.0006396062672138214, "memory(GiB)": 38.13, "reward": 0.5024917125701904, "reward_std": 0.04788237437605858, "rewards/VisualizationJSONCombinedORM/mean": 0.5024917125701904, "rewards/VisualizationJSONCombinedORM/std": 0.20117585361003876, "step": 3562, "train_speed(iter/s)": 0.080859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 291.0625, "completions/min_length": 217.0, "epoch": 2.947063688999173, "grad_norm": 0.19917649030685425, "kl": 0.0699462890625, "learning_rate": 9.48335631477948e-09, "loss": 0.0007010791450738907, "memory(GiB)": 38.13, "reward": 0.4673921763896942, "reward_std": 0.0423186719417572, "rewards/VisualizationJSONCombinedORM/mean": 0.4673921763896942, "rewards/VisualizationJSONCombinedORM/std": 0.1802043914794922, "step": 3563, "train_speed(iter/s)": 0.080838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 275.125, "completions/min_length": 213.0, "epoch": 2.9478908188585606, "grad_norm": 0.22110068798065186, "kl": 0.05517578125, "learning_rate": 9.18940680445568e-09, "loss": 0.000553034245967865, "memory(GiB)": 38.13, "reward": 0.4274129271507263, "reward_std": 0.06261210143566132, "rewards/VisualizationJSONCombinedORM/mean": 0.4274129271507263, "rewards/VisualizationJSONCombinedORM/std": 0.1533944010734558, "step": 3564, "train_speed(iter/s)": 0.080816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 292.0, "completions/min_length": 231.0, "epoch": 2.948717948717949, "grad_norm": 0.20850591361522675, "kl": 0.10162353515625, "learning_rate": 8.90008079027671e-09, "loss": 0.0010173991322517395, "memory(GiB)": 38.13, "reward": 0.7041975259780884, "reward_std": 0.0712098628282547, "rewards/VisualizationJSONCombinedORM/mean": 0.7041975259780884, "rewards/VisualizationJSONCombinedORM/std": 0.12497769296169281, "step": 3565, "train_speed(iter/s)": 0.080791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 295.125, "completions/min_length": 237.0, "epoch": 2.9495450785773367, "grad_norm": 0.21687503159046173, "kl": 0.0955810546875, "learning_rate": 8.615378540276497e-09, "loss": 0.0009542293846607208, "memory(GiB)": 38.13, "reward": 0.5337522625923157, "reward_std": 0.06227575242519379, "rewards/VisualizationJSONCombinedORM/mean": 0.5337522625923157, "rewards/VisualizationJSONCombinedORM/std": 0.2177911400794983, "step": 3566, "train_speed(iter/s)": 0.08077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 318.625, "completions/min_length": 245.0, "epoch": 2.9503722084367245, "grad_norm": 0.20748642086982727, "kl": 0.032379150390625, "learning_rate": 8.335300318201844e-09, "loss": 0.00032487139105796814, "memory(GiB)": 38.13, "reward": 0.6222192645072937, "reward_std": 0.04425038397312164, "rewards/VisualizationJSONCombinedORM/mean": 0.6222192645072937, "rewards/VisualizationJSONCombinedORM/std": 0.11407842487096786, "step": 3567, "train_speed(iter/s)": 0.080743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 272.0, "completions/min_length": 207.0, "epoch": 2.9511993382961124, "grad_norm": 0.2147454023361206, "kl": 0.0447998046875, "learning_rate": 8.059846383519088e-09, "loss": 0.0004495568573474884, "memory(GiB)": 38.13, "reward": 0.4314464032649994, "reward_std": 0.07125979661941528, "rewards/VisualizationJSONCombinedORM/mean": 0.4314464032649994, "rewards/VisualizationJSONCombinedORM/std": 0.0733131393790245, "step": 3568, "train_speed(iter/s)": 0.080725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 302.25, "completions/min_length": 236.0, "epoch": 2.9520264681555, "grad_norm": 0.25957930088043213, "kl": 0.17742919921875, "learning_rate": 7.789016991409104e-09, "loss": 0.0017770975828170776, "memory(GiB)": 38.13, "reward": 0.48911798000335693, "reward_std": 0.08582380414009094, "rewards/VisualizationJSONCombinedORM/mean": 0.48911798000335693, "rewards/VisualizationJSONCombinedORM/std": 0.12312779575586319, "step": 3569, "train_speed(iter/s)": 0.080701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 291.9375, "completions/min_length": 223.0, "epoch": 2.9528535980148884, "grad_norm": 0.18736886978149414, "kl": 0.05535888671875, "learning_rate": 7.52281239276842e-09, "loss": 0.0005542049184441566, "memory(GiB)": 38.13, "reward": 0.7354372143745422, "reward_std": 0.09090244770050049, "rewards/VisualizationJSONCombinedORM/mean": 0.7354372143745422, "rewards/VisualizationJSONCombinedORM/std": 0.08826054632663727, "step": 3570, "train_speed(iter/s)": 0.080679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 286.6875, "completions/min_length": 222.0, "epoch": 2.9536807278742763, "grad_norm": 0.2056695967912674, "kl": 0.06683349609375, "learning_rate": 7.261232834209208e-09, "loss": 0.0006681308150291443, "memory(GiB)": 38.13, "reward": 0.4703996181488037, "reward_std": 0.05532096326351166, "rewards/VisualizationJSONCombinedORM/mean": 0.4703996181488037, "rewards/VisualizationJSONCombinedORM/std": 0.2175675481557846, "step": 3571, "train_speed(iter/s)": 0.080662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 286.0625, "completions/min_length": 233.0, "epoch": 2.954507857733664, "grad_norm": 0.18750359117984772, "kl": 0.073486328125, "learning_rate": 7.0042785580598515e-09, "loss": 0.0007349774241447449, "memory(GiB)": 38.13, "reward": 0.5183374881744385, "reward_std": 0.05355556309223175, "rewards/VisualizationJSONCombinedORM/mean": 0.5183374881744385, "rewards/VisualizationJSONCombinedORM/std": 0.2122020274400711, "step": 3572, "train_speed(iter/s)": 0.080632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 297.125, "completions/min_length": 231.0, "epoch": 2.9553349875930524, "grad_norm": 0.16381289064884186, "kl": 0.1168212890625, "learning_rate": 6.751949802362712e-09, "loss": 0.0011704973876476288, "memory(GiB)": 38.13, "reward": 0.5003499984741211, "reward_std": 0.01775120198726654, "rewards/VisualizationJSONCombinedORM/mean": 0.5003499984741211, "rewards/VisualizationJSONCombinedORM/std": 0.1421278864145279, "step": 3573, "train_speed(iter/s)": 0.080616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 333.8125, "completions/min_length": 281.0, "epoch": 2.95616211745244, "grad_norm": 0.21620003879070282, "kl": 0.08013916015625, "learning_rate": 6.504246800875802e-09, "loss": 0.0008019804954528809, "memory(GiB)": 38.13, "reward": 0.6402187943458557, "reward_std": 0.08632323890924454, "rewards/VisualizationJSONCombinedORM/mean": 0.6402187943458557, "rewards/VisualizationJSONCombinedORM/std": 0.08518949896097183, "step": 3574, "train_speed(iter/s)": 0.080595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 285.375, "completions/min_length": 239.0, "epoch": 2.956989247311828, "grad_norm": 0.35809898376464844, "kl": 0.2122802734375, "learning_rate": 6.2611697830722295e-09, "loss": 0.0021235644817352295, "memory(GiB)": 38.13, "reward": 0.5324771404266357, "reward_std": 0.061510078608989716, "rewards/VisualizationJSONCombinedORM/mean": 0.5324771404266357, "rewards/VisualizationJSONCombinedORM/std": 0.23129712045192719, "step": 3575, "train_speed(iter/s)": 0.080568 }, { "epoch": 2.956989247311828, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 369.75, "eval_completions/mean_length": 306.4322916666667, "eval_completions/min_length": 255.79166666666666, "eval_kl": 0.07493082682291667, "eval_loss": 0.0007518927450291812, "eval_reward": 0.4643702494601409, "eval_reward_std": 0.061214659828692675, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4643702494601409, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06121466037196418, "eval_runtime": 314.039, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 3575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 281.4375, "completions/min_length": 249.0, "epoch": 2.957816377171216, "grad_norm": 0.19992384314537048, "kl": 0.0897216796875, "learning_rate": 6.022718974137976e-09, "loss": 0.000896107405424118, "memory(GiB)": 38.13, "reward": 0.47764694690704346, "reward_std": 0.03946945071220398, "rewards/VisualizationJSONCombinedORM/mean": 0.47764694690704346, "rewards/VisualizationJSONCombinedORM/std": 0.19441582262516022, "step": 3576, "train_speed(iter/s)": 0.07998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 308.4375, "completions/min_length": 219.0, "epoch": 2.9586435070306036, "grad_norm": 0.2165209949016571, "kl": 0.04083251953125, "learning_rate": 5.788894594975225e-09, "loss": 0.0004080049693584442, "memory(GiB)": 38.13, "reward": 0.5120777487754822, "reward_std": 0.07592940330505371, "rewards/VisualizationJSONCombinedORM/mean": 0.5120777487754822, "rewards/VisualizationJSONCombinedORM/std": 0.08183760195970535, "step": 3577, "train_speed(iter/s)": 0.079957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 338.8125, "completions/min_length": 298.0, "epoch": 2.959470636889992, "grad_norm": 0.1885930299758911, "kl": 0.1219482421875, "learning_rate": 5.559696862198483e-09, "loss": 0.0012190453708171844, "memory(GiB)": 38.13, "reward": 0.2017844021320343, "reward_std": 0.023483334109187126, "rewards/VisualizationJSONCombinedORM/mean": 0.2017844021320343, "rewards/VisualizationJSONCombinedORM/std": 0.03609045222401619, "step": 3578, "train_speed(iter/s)": 0.079934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 304.625, "completions/min_length": 242.0, "epoch": 2.9602977667493797, "grad_norm": 0.2111889272928238, "kl": 0.094970703125, "learning_rate": 5.3351259881379016e-09, "loss": 0.0009471476078033447, "memory(GiB)": 38.13, "reward": 0.5759634971618652, "reward_std": 0.07462795078754425, "rewards/VisualizationJSONCombinedORM/mean": 0.5759634971618652, "rewards/VisualizationJSONCombinedORM/std": 0.07695476710796356, "step": 3579, "train_speed(iter/s)": 0.079901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 310.875, "completions/min_length": 251.0, "epoch": 2.9611248966087675, "grad_norm": 0.2388867884874344, "kl": 0.04364013671875, "learning_rate": 5.115182180835398e-09, "loss": 0.0004363209009170532, "memory(GiB)": 38.13, "reward": 0.5619497299194336, "reward_std": 0.08913470804691315, "rewards/VisualizationJSONCombinedORM/mean": 0.5619497299194336, "rewards/VisualizationJSONCombinedORM/std": 0.15518954396247864, "step": 3580, "train_speed(iter/s)": 0.079887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 281.875, "completions/min_length": 207.0, "epoch": 2.9619520264681554, "grad_norm": 0.18917539715766907, "kl": 0.0693359375, "learning_rate": 4.899865644047985e-09, "loss": 0.0006935149431228638, "memory(GiB)": 38.13, "reward": 0.46422868967056274, "reward_std": 0.06565925478935242, "rewards/VisualizationJSONCombinedORM/mean": 0.46422868967056274, "rewards/VisualizationJSONCombinedORM/std": 0.10076652467250824, "step": 3581, "train_speed(iter/s)": 0.079861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 314.25, "completions/min_length": 243.0, "epoch": 2.962779156327543, "grad_norm": 0.19868044555187225, "kl": 0.03399658203125, "learning_rate": 4.689176577244992e-09, "loss": 0.0003399066627025604, "memory(GiB)": 38.13, "reward": 0.4003370404243469, "reward_std": 0.03119829297065735, "rewards/VisualizationJSONCombinedORM/mean": 0.4003370404243469, "rewards/VisualizationJSONCombinedORM/std": 0.130797877907753, "step": 3582, "train_speed(iter/s)": 0.079833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 312.4375, "completions/min_length": 264.0, "epoch": 2.9636062861869314, "grad_norm": 0.18010775744915009, "kl": 0.0528564453125, "learning_rate": 4.4831151756091766e-09, "loss": 0.0005281120538711548, "memory(GiB)": 38.13, "reward": 0.5532727837562561, "reward_std": 0.062351398169994354, "rewards/VisualizationJSONCombinedORM/mean": 0.5532727837562561, "rewards/VisualizationJSONCombinedORM/std": 0.13206471502780914, "step": 3583, "train_speed(iter/s)": 0.079802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 298.5625, "completions/min_length": 250.0, "epoch": 2.9644334160463193, "grad_norm": 0.21152357757091522, "kl": 0.1173095703125, "learning_rate": 4.281681630036727e-09, "loss": 0.0011748839169740677, "memory(GiB)": 38.13, "reward": 0.6087534427642822, "reward_std": 0.08458174765110016, "rewards/VisualizationJSONCombinedORM/mean": 0.6087534427642822, "rewards/VisualizationJSONCombinedORM/std": 0.08903753757476807, "step": 3584, "train_speed(iter/s)": 0.07978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 316.0625, "completions/min_length": 242.0, "epoch": 2.965260545905707, "grad_norm": 0.21383647620677948, "kl": 0.046142578125, "learning_rate": 4.0848761271350405e-09, "loss": 0.00046301260590553284, "memory(GiB)": 38.13, "reward": 0.4594941735267639, "reward_std": 0.06399751454591751, "rewards/VisualizationJSONCombinedORM/mean": 0.4594941735267639, "rewards/VisualizationJSONCombinedORM/std": 0.09660006314516068, "step": 3585, "train_speed(iter/s)": 0.07975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 305.625, "completions/min_length": 258.0, "epoch": 2.9660876757650954, "grad_norm": 0.18381740152835846, "kl": 0.13623046875, "learning_rate": 3.8926988492254955e-09, "loss": 0.0013614483177661896, "memory(GiB)": 38.13, "reward": 0.44323956966400146, "reward_std": 0.025166179984807968, "rewards/VisualizationJSONCombinedORM/mean": 0.44323956966400146, "rewards/VisualizationJSONCombinedORM/std": 0.12000883370637894, "step": 3586, "train_speed(iter/s)": 0.079716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 270.8125, "completions/min_length": 219.0, "epoch": 2.966914805624483, "grad_norm": 0.3261586129665375, "kl": 0.08453369140625, "learning_rate": 3.705149974342348e-09, "loss": 0.0008466858416795731, "memory(GiB)": 38.13, "reward": 0.42235273122787476, "reward_std": 0.09132982790470123, "rewards/VisualizationJSONCombinedORM/mean": 0.42235273122787476, "rewards/VisualizationJSONCombinedORM/std": 0.1371075063943863, "step": 3587, "train_speed(iter/s)": 0.079684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 293.375, "completions/min_length": 227.0, "epoch": 2.967741935483871, "grad_norm": 0.18860535323619843, "kl": 0.09130859375, "learning_rate": 3.522229676229949e-09, "loss": 0.0009112870320677757, "memory(GiB)": 38.13, "reward": 0.2871514558792114, "reward_std": 0.029997719451785088, "rewards/VisualizationJSONCombinedORM/mean": 0.2871514558792114, "rewards/VisualizationJSONCombinedORM/std": 0.12327554821968079, "step": 3588, "train_speed(iter/s)": 0.079663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 311.0625, "completions/min_length": 244.0, "epoch": 2.968569065343259, "grad_norm": 0.18571801483631134, "kl": 0.032440185546875, "learning_rate": 3.343938124346635e-09, "loss": 0.000323321670293808, "memory(GiB)": 38.13, "reward": 0.8448946475982666, "reward_std": 0.05683150142431259, "rewards/VisualizationJSONCombinedORM/mean": 0.8448946475982666, "rewards/VisualizationJSONCombinedORM/std": 0.07056403160095215, "step": 3589, "train_speed(iter/s)": 0.079634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 276.875, "completions/min_length": 226.0, "epoch": 2.9693961952026466, "grad_norm": 0.19066649675369263, "kl": 0.07177734375, "learning_rate": 3.170275483861951e-09, "loss": 0.0007175765931606293, "memory(GiB)": 38.13, "reward": 0.4805298149585724, "reward_std": 0.05703228712081909, "rewards/VisualizationJSONCombinedORM/mean": 0.4805298149585724, "rewards/VisualizationJSONCombinedORM/std": 0.10558388382196426, "step": 3590, "train_speed(iter/s)": 0.079614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 294.625, "completions/min_length": 239.0, "epoch": 2.970223325062035, "grad_norm": 0.1992058902978897, "kl": 0.1302490234375, "learning_rate": 3.0012419156572047e-09, "loss": 0.0013030506670475006, "memory(GiB)": 38.13, "reward": 0.41013675928115845, "reward_std": 0.05284417048096657, "rewards/VisualizationJSONCombinedORM/mean": 0.41013675928115845, "rewards/VisualizationJSONCombinedORM/std": 0.1722957193851471, "step": 3591, "train_speed(iter/s)": 0.07959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 302.5625, "completions/min_length": 238.0, "epoch": 2.9710504549214227, "grad_norm": 0.17558182775974274, "kl": 0.0797119140625, "learning_rate": 2.836837576326024e-09, "loss": 0.0007976144552230835, "memory(GiB)": 38.13, "reward": 0.4238588809967041, "reward_std": 0.05155452340841293, "rewards/VisualizationJSONCombinedORM/mean": 0.4238588809967041, "rewards/VisualizationJSONCombinedORM/std": 0.05427563562989235, "step": 3592, "train_speed(iter/s)": 0.079568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 306.8125, "completions/min_length": 256.0, "epoch": 2.9718775847808105, "grad_norm": 0.20267246663570404, "kl": 0.0614013671875, "learning_rate": 2.6770626181715776e-09, "loss": 0.0006149820983409882, "memory(GiB)": 38.13, "reward": 0.597980260848999, "reward_std": 0.08720419555902481, "rewards/VisualizationJSONCombinedORM/mean": 0.597980260848999, "rewards/VisualizationJSONCombinedORM/std": 0.14949245750904083, "step": 3593, "train_speed(iter/s)": 0.079543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 337.6875, "completions/min_length": 254.0, "epoch": 2.9727047146401984, "grad_norm": 0.2435750812292099, "kl": 0.05438232421875, "learning_rate": 2.5219171892110207e-09, "loss": 0.0005428716540336609, "memory(GiB)": 38.13, "reward": 0.5479328036308289, "reward_std": 0.056591108441352844, "rewards/VisualizationJSONCombinedORM/mean": 0.5479328036308289, "rewards/VisualizationJSONCombinedORM/std": 0.21654489636421204, "step": 3594, "train_speed(iter/s)": 0.079513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 284.5625, "completions/min_length": 243.0, "epoch": 2.973531844499586, "grad_norm": 0.17571134865283966, "kl": 0.06903076171875, "learning_rate": 2.371401433170495e-09, "loss": 0.0006896555423736572, "memory(GiB)": 38.13, "reward": 0.469411164522171, "reward_std": 0.04916481673717499, "rewards/VisualizationJSONCombinedORM/mean": 0.469411164522171, "rewards/VisualizationJSONCombinedORM/std": 0.052625685930252075, "step": 3595, "train_speed(iter/s)": 0.079492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 293.875, "completions/min_length": 248.0, "epoch": 2.9743589743589745, "grad_norm": 0.18070392310619354, "kl": 0.06097412109375, "learning_rate": 2.225515489488461e-09, "loss": 0.0006090402603149414, "memory(GiB)": 38.13, "reward": 0.4210697412490845, "reward_std": 0.04448989778757095, "rewards/VisualizationJSONCombinedORM/mean": 0.4210697412490845, "rewards/VisualizationJSONCombinedORM/std": 0.0733557865023613, "step": 3596, "train_speed(iter/s)": 0.079469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 329.375, "completions/min_length": 253.0, "epoch": 2.9751861042183623, "grad_norm": 0.19025924801826477, "kl": 0.08197021484375, "learning_rate": 2.0842594933140338e-09, "loss": 0.0008186101913452148, "memory(GiB)": 38.13, "reward": 0.31789153814315796, "reward_std": 0.041152410209178925, "rewards/VisualizationJSONCombinedORM/mean": 0.31789153814315796, "rewards/VisualizationJSONCombinedORM/std": 0.10990956425666809, "step": 3597, "train_speed(iter/s)": 0.079443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 288.75, "completions/min_length": 233.0, "epoch": 2.97601323407775, "grad_norm": 0.18419650197029114, "kl": 0.0408935546875, "learning_rate": 1.9476335755069795e-09, "loss": 0.00040973350405693054, "memory(GiB)": 38.13, "reward": 0.47179746627807617, "reward_std": 0.04389733821153641, "rewards/VisualizationJSONCombinedORM/mean": 0.47179746627807617, "rewards/VisualizationJSONCombinedORM/std": 0.14561180770397186, "step": 3598, "train_speed(iter/s)": 0.079419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 285.0, "completions/min_length": 218.0, "epoch": 2.9768403639371384, "grad_norm": 0.24914026260375977, "kl": 0.08355712890625, "learning_rate": 1.815637862637165e-09, "loss": 0.0008360594511032104, "memory(GiB)": 38.13, "reward": 0.6683365106582642, "reward_std": 0.12167681008577347, "rewards/VisualizationJSONCombinedORM/mean": 0.6683365106582642, "rewards/VisualizationJSONCombinedORM/std": 0.1465277224779129, "step": 3599, "train_speed(iter/s)": 0.079401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 328.1875, "completions/min_length": 267.0, "epoch": 2.977667493796526, "grad_norm": 0.16526266932487488, "kl": 0.07763671875, "learning_rate": 1.688272476986219e-09, "loss": 0.0007773041725158691, "memory(GiB)": 38.13, "reward": 0.632677435874939, "reward_std": 0.06053021550178528, "rewards/VisualizationJSONCombinedORM/mean": 0.632677435874939, "rewards/VisualizationJSONCombinedORM/std": 0.16974642872810364, "step": 3600, "train_speed(iter/s)": 0.079376 }, { "epoch": 2.977667493796526, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 367.125, "eval_completions/mean_length": 305.3333333333333, "eval_completions/min_length": 250.66666666666666, "eval_kl": 0.07140096028645833, "eval_loss": 0.0007162392139434814, "eval_reward": 0.4445510059595108, "eval_reward_std": 0.05578660871833563, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4445510059595108, "eval_rewards/VisualizationJSONCombinedORM/std": 0.055786610736201205, "eval_runtime": 312.4889, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 320.4375, "completions/min_length": 260.0, "epoch": 2.978494623655914, "grad_norm": 0.23252327740192413, "kl": 0.1361083984375, "learning_rate": 1.565537536545869e-09, "loss": 0.0013670027256011963, "memory(GiB)": 38.13, "reward": 0.4331561326980591, "reward_std": 0.07265764474868774, "rewards/VisualizationJSONCombinedORM/mean": 0.4331561326980591, "rewards/VisualizationJSONCombinedORM/std": 0.20504726469516754, "step": 3601, "train_speed(iter/s)": 0.078806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 284.6875, "completions/min_length": 218.0, "epoch": 2.979321753515302, "grad_norm": 0.21135801076889038, "kl": 0.06195068359375, "learning_rate": 1.4474331550173858e-09, "loss": 0.0006185062229633331, "memory(GiB)": 38.13, "reward": 0.6495664715766907, "reward_std": 0.04837650805711746, "rewards/VisualizationJSONCombinedORM/mean": 0.6495664715766907, "rewards/VisualizationJSONCombinedORM/std": 0.08156654238700867, "step": 3602, "train_speed(iter/s)": 0.078784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 311.0, "completions/min_length": 243.0, "epoch": 2.9801488833746896, "grad_norm": 0.1992279440164566, "kl": 0.04278564453125, "learning_rate": 1.3339594418138036e-09, "loss": 0.00042746588587760925, "memory(GiB)": 38.13, "reward": 0.4284287989139557, "reward_std": 0.03156236559152603, "rewards/VisualizationJSONCombinedORM/mean": 0.4284287989139557, "rewards/VisualizationJSONCombinedORM/std": 0.24422858655452728, "step": 3603, "train_speed(iter/s)": 0.078757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 313.8125, "completions/min_length": 265.0, "epoch": 2.980976013234078, "grad_norm": 0.23914310336112976, "kl": 0.1102294921875, "learning_rate": 1.225116502056589e-09, "loss": 0.0010994449257850647, "memory(GiB)": 38.13, "reward": 0.5363341569900513, "reward_std": 0.12397528439760208, "rewards/VisualizationJSONCombinedORM/mean": 0.5363341569900513, "rewards/VisualizationJSONCombinedORM/std": 0.12519696354866028, "step": 3604, "train_speed(iter/s)": 0.078731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 308.1875, "completions/min_length": 232.0, "epoch": 2.9818031430934657, "grad_norm": 0.18345974385738373, "kl": 0.1146240234375, "learning_rate": 1.1209044365778633e-09, "loss": 0.001147465780377388, "memory(GiB)": 38.13, "reward": 0.4088749587535858, "reward_std": 0.047674521803855896, "rewards/VisualizationJSONCombinedORM/mean": 0.4088749587535858, "rewards/VisualizationJSONCombinedORM/std": 0.14052064716815948, "step": 3605, "train_speed(iter/s)": 0.078711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 283.9375, "completions/min_length": 220.0, "epoch": 2.9826302729528535, "grad_norm": 0.16018137335777283, "kl": 0.1080322265625, "learning_rate": 1.0213233419203994e-09, "loss": 0.0010777954012155533, "memory(GiB)": 38.13, "reward": 0.5048230886459351, "reward_std": 0.017271514981985092, "rewards/VisualizationJSONCombinedORM/mean": 0.5048230886459351, "rewards/VisualizationJSONCombinedORM/std": 0.17067676782608032, "step": 3606, "train_speed(iter/s)": 0.078696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 277.8125, "completions/min_length": 228.0, "epoch": 2.9834574028122414, "grad_norm": 0.14990058541297913, "kl": 0.110107421875, "learning_rate": 9.263733103365147e-10, "loss": 0.0011005513370037079, "memory(GiB)": 38.13, "reward": 0.4657552242279053, "reward_std": 0.04068504273891449, "rewards/VisualizationJSONCombinedORM/mean": 0.4657552242279053, "rewards/VisualizationJSONCombinedORM/std": 0.22248271107673645, "step": 3607, "train_speed(iter/s)": 0.078673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 300.3125, "completions/min_length": 221.0, "epoch": 2.984284532671629, "grad_norm": 0.16058814525604248, "kl": 0.03826904296875, "learning_rate": 8.36054429787514e-10, "loss": 0.0003832876682281494, "memory(GiB)": 38.13, "reward": 0.4939121603965759, "reward_std": 0.04555053263902664, "rewards/VisualizationJSONCombinedORM/mean": 0.4939121603965759, "rewards/VisualizationJSONCombinedORM/std": 0.1318492740392685, "step": 3608, "train_speed(iter/s)": 0.078646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 298.375, "completions/min_length": 256.0, "epoch": 2.9851116625310175, "grad_norm": 0.18862412869930267, "kl": 0.1187744140625, "learning_rate": 7.503667839453555e-10, "loss": 0.0011839363723993301, "memory(GiB)": 38.13, "reward": 0.367533802986145, "reward_std": 0.029597174376249313, "rewards/VisualizationJSONCombinedORM/mean": 0.367533802986145, "rewards/VisualizationJSONCombinedORM/std": 0.0878165140748024, "step": 3609, "train_speed(iter/s)": 0.078617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 305.0, "completions/min_length": 248.0, "epoch": 2.9859387923904053, "grad_norm": 0.18309621512889862, "kl": 0.1219482421875, "learning_rate": 6.693104521909854e-10, "loss": 0.0012216046452522278, "memory(GiB)": 38.13, "reward": 0.4937114417552948, "reward_std": 0.10602942109107971, "rewards/VisualizationJSONCombinedORM/mean": 0.4937114417552948, "rewards/VisualizationJSONCombinedORM/std": 0.24816004931926727, "step": 3610, "train_speed(iter/s)": 0.0786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 296.5625, "completions/min_length": 237.0, "epoch": 2.986765922249793, "grad_norm": 0.1663699895143509, "kl": 0.06158447265625, "learning_rate": 5.928855096154485e-10, "loss": 0.0006173327565193176, "memory(GiB)": 38.13, "reward": 0.43588897585868835, "reward_std": 0.03593326732516289, "rewards/VisualizationJSONCombinedORM/mean": 0.43588897585868835, "rewards/VisualizationJSONCombinedORM/std": 0.12358102947473526, "step": 3611, "train_speed(iter/s)": 0.078575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 300.25, "completions/min_length": 249.0, "epoch": 2.9875930521091814, "grad_norm": 0.21531687676906586, "kl": 0.07794189453125, "learning_rate": 5.210920270187769e-10, "loss": 0.0007806196808815002, "memory(GiB)": 38.13, "reward": 0.7075058221817017, "reward_std": 0.09008066356182098, "rewards/VisualizationJSONCombinedORM/mean": 0.7075058221817017, "rewards/VisualizationJSONCombinedORM/std": 0.0996471643447876, "step": 3612, "train_speed(iter/s)": 0.078553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 295.3125, "completions/min_length": 240.0, "epoch": 2.988420181968569, "grad_norm": 0.18482358753681183, "kl": 0.1162109375, "learning_rate": 4.5393007090999143e-10, "loss": 0.0011598318815231323, "memory(GiB)": 38.13, "reward": 0.5877372622489929, "reward_std": 0.09121713042259216, "rewards/VisualizationJSONCombinedORM/mean": 0.5877372622489929, "rewards/VisualizationJSONCombinedORM/std": 0.13769055902957916, "step": 3613, "train_speed(iter/s)": 0.078529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 273.5625, "completions/min_length": 223.0, "epoch": 2.989247311827957, "grad_norm": 0.18868668377399445, "kl": 0.06396484375, "learning_rate": 3.913997035093209e-10, "loss": 0.0006395056843757629, "memory(GiB)": 38.13, "reward": 0.6395705938339233, "reward_std": 0.06594166904687881, "rewards/VisualizationJSONCombinedORM/mean": 0.6395705938339233, "rewards/VisualizationJSONCombinedORM/std": 0.06701846420764923, "step": 3614, "train_speed(iter/s)": 0.0785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 288.375, "completions/min_length": 227.0, "epoch": 2.990074441687345, "grad_norm": 0.16156131029129028, "kl": 0.03662109375, "learning_rate": 3.335009827437619e-10, "loss": 0.0003669746220111847, "memory(GiB)": 38.13, "reward": 0.5206418633460999, "reward_std": 0.05967588350176811, "rewards/VisualizationJSONCombinedORM/mean": 0.5206418633460999, "rewards/VisualizationJSONCombinedORM/std": 0.16281604766845703, "step": 3615, "train_speed(iter/s)": 0.078481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 320.0625, "completions/min_length": 262.0, "epoch": 2.9909015715467326, "grad_norm": 0.18316859006881714, "kl": 0.113525390625, "learning_rate": 2.802339622520744e-10, "loss": 0.0011343620717525482, "memory(GiB)": 38.13, "reward": 0.3608896732330322, "reward_std": 0.05926494300365448, "rewards/VisualizationJSONCombinedORM/mean": 0.3608896732330322, "rewards/VisualizationJSONCombinedORM/std": 0.10744081437587738, "step": 3616, "train_speed(iter/s)": 0.078446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 277.0, "completions/min_length": 216.0, "epoch": 2.991728701406121, "grad_norm": 0.20603249967098236, "kl": 0.0499267578125, "learning_rate": 2.315986913797863e-10, "loss": 0.0004994124174118042, "memory(GiB)": 38.13, "reward": 0.734100878238678, "reward_std": 0.07833951711654663, "rewards/VisualizationJSONCombinedORM/mean": 0.734100878238678, "rewards/VisualizationJSONCombinedORM/std": 0.07635468244552612, "step": 3617, "train_speed(iter/s)": 0.078424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 290.25, "completions/min_length": 212.0, "epoch": 2.9925558312655087, "grad_norm": 0.21719905734062195, "kl": 0.0821533203125, "learning_rate": 1.8759521518307845e-10, "loss": 0.0008216947317123413, "memory(GiB)": 38.13, "reward": 0.48429036140441895, "reward_std": 0.06170732527971268, "rewards/VisualizationJSONCombinedORM/mean": 0.48429036140441895, "rewards/VisualizationJSONCombinedORM/std": 0.10659243911504745, "step": 3618, "train_speed(iter/s)": 0.078402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 301.75, "completions/min_length": 240.0, "epoch": 2.9933829611248965, "grad_norm": 0.17580845952033997, "kl": 0.04638671875, "learning_rate": 1.4822357442656475e-10, "loss": 0.00046397559344768524, "memory(GiB)": 38.13, "reward": 0.4164672791957855, "reward_std": 0.047707781195640564, "rewards/VisualizationJSONCombinedORM/mean": 0.4164672791957855, "rewards/VisualizationJSONCombinedORM/std": 0.13252761960029602, "step": 3619, "train_speed(iter/s)": 0.078384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 308.6875, "completions/min_length": 249.0, "epoch": 2.9942100909842844, "grad_norm": 0.16353581845760345, "kl": 0.044921875, "learning_rate": 1.1348380558495742e-10, "loss": 0.00044973939657211304, "memory(GiB)": 38.13, "reward": 0.6821180582046509, "reward_std": 0.08461086452007294, "rewards/VisualizationJSONCombinedORM/mean": 0.6821180582046509, "rewards/VisualizationJSONCombinedORM/std": 0.09498949348926544, "step": 3620, "train_speed(iter/s)": 0.07836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 299.0, "completions/min_length": 225.0, "epoch": 2.995037220843672, "grad_norm": 0.20034103095531464, "kl": 0.076904296875, "learning_rate": 8.337594084084633e-11, "loss": 0.000769391655921936, "memory(GiB)": 38.13, "reward": 0.35301634669303894, "reward_std": 0.037809696048498154, "rewards/VisualizationJSONCombinedORM/mean": 0.35301634669303894, "rewards/VisualizationJSONCombinedORM/std": 0.04995657876133919, "step": 3621, "train_speed(iter/s)": 0.078341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 302.5, "completions/min_length": 255.0, "epoch": 2.9958643507030605, "grad_norm": 0.18474359810352325, "kl": 0.05462646484375, "learning_rate": 5.790000808580942e-11, "loss": 0.0005447231233119965, "memory(GiB)": 38.13, "reward": 0.7552766799926758, "reward_std": 0.09571404755115509, "rewards/VisualizationJSONCombinedORM/mean": 0.7552766799926758, "rewards/VisualizationJSONCombinedORM/std": 0.09742651134729385, "step": 3622, "train_speed(iter/s)": 0.078321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 304.625, "completions/min_length": 236.0, "epoch": 2.9966914805624483, "grad_norm": 0.19238540530204773, "kl": 0.2039794921875, "learning_rate": 3.7056030921522877e-11, "loss": 0.0020375847816467285, "memory(GiB)": 38.13, "reward": 0.5775811672210693, "reward_std": 0.06427878141403198, "rewards/VisualizationJSONCombinedORM/mean": 0.5775811672210693, "rewards/VisualizationJSONCombinedORM/std": 0.2788747251033783, "step": 3623, "train_speed(iter/s)": 0.078301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 289.1875, "completions/min_length": 237.0, "epoch": 2.997518610421836, "grad_norm": 0.20001991093158722, "kl": 0.1253662109375, "learning_rate": 2.084402865754065e-11, "loss": 0.001256205141544342, "memory(GiB)": 38.13, "reward": 0.4921573996543884, "reward_std": 0.07948292791843414, "rewards/VisualizationJSONCombinedORM/mean": 0.4921573996543884, "rewards/VisualizationJSONCombinedORM/std": 0.14166271686553955, "step": 3624, "train_speed(iter/s)": 0.07828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 264.8125, "completions/min_length": 201.0, "epoch": 2.9983457402812244, "grad_norm": 0.18180876970291138, "kl": 0.034149169921875, "learning_rate": 9.2640163124047e-12, "loss": 0.0003412291407585144, "memory(GiB)": 38.13, "reward": 0.24955040216445923, "reward_std": 0.0352117121219635, "rewards/VisualizationJSONCombinedORM/mean": 0.24955040216445923, "rewards/VisualizationJSONCombinedORM/std": 0.09369216859340668, "step": 3625, "train_speed(iter/s)": 0.07826 }, { "epoch": 2.9983457402812244, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 362.4583333333333, "eval_completions/mean_length": 302.6458333333333, "eval_completions/min_length": 250.0, "eval_kl": 0.08058675130208333, "eval_loss": 0.0008071499760262668, "eval_reward": 0.4510117657482624, "eval_reward_std": 0.05319595577505728, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4510117657482624, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05319595616310835, "eval_runtime": 309.4235, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 3625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 294.375, "completions/min_length": 250.0, "epoch": 2.999172870140612, "grad_norm": 0.16903002560138702, "kl": 0.2010498046875, "learning_rate": 2.3160046147552296e-12, "loss": 0.0020052827894687653, "memory(GiB)": 38.13, "reward": 0.5442919731140137, "reward_std": 0.08287329971790314, "rewards/VisualizationJSONCombinedORM/mean": 0.5442919731140137, "rewards/VisualizationJSONCombinedORM/std": 0.1345808357000351, "step": 3626, "train_speed(iter/s)": 0.077726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 319.3125, "completions/min_length": 241.0, "epoch": 3.0, "grad_norm": 0.19818416237831116, "kl": 0.06365966796875, "learning_rate": 0.0, "loss": 0.0006368234753608704, "memory(GiB)": 38.13, "reward": 0.5530165433883667, "reward_std": 0.04164047911763191, "rewards/VisualizationJSONCombinedORM/mean": 0.5530165433883667, "rewards/VisualizationJSONCombinedORM/std": 0.1916944533586502, "step": 3627, "train_speed(iter/s)": 0.077701 }, { "epoch": 3.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 356.7916666666667, "eval_completions/mean_length": 299.984375, "eval_completions/min_length": 248.04166666666666, "eval_kl": 0.074127197265625, "eval_loss": 0.0007325311307795346, "eval_reward": 0.458267110089461, "eval_reward_std": 0.06266414331427465, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.458267110089461, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0626641441291819, "eval_runtime": 306.1699, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 3627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 279.6875, "completions/min_length": 230.0, "epoch": 3.000827129859388, "grad_norm": 0.232681542634964, "kl": 0.05499267578125, "learning_rate": 4.129547490416706e-06, "loss": 0.0005490817129611969, "memory(GiB)": 36.61, "reward": 0.45598286390304565, "reward_std": 0.054496459662914276, "rewards/VisualizationJSONCombinedORM/mean": 0.45598286390304565, "rewards/VisualizationJSONCombinedORM/std": 0.054676175117492676, "step": 3628, "train_speed(iter/s)": 11.633835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 275.625, "completions/min_length": 219.0, "epoch": 3.0016542597187756, "grad_norm": 0.23873594403266907, "kl": 0.0615234375, "learning_rate": 4.126704235489606e-06, "loss": 0.0006158053874969482, "memory(GiB)": 37.15, "reward": 0.5151772499084473, "reward_std": 0.08605524152517319, "rewards/VisualizationJSONCombinedORM/mean": 0.5151772499084473, "rewards/VisualizationJSONCombinedORM/std": 0.08917613327503204, "step": 3629, "train_speed(iter/s)": 10.802242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 302.5, "completions/min_length": 235.0, "epoch": 3.002481389578164, "grad_norm": 0.16523537039756775, "kl": 0.025421142578125, "learning_rate": 4.123861271810735e-06, "loss": 0.00025444477796554565, "memory(GiB)": 37.35, "reward": 0.49337852001190186, "reward_std": 0.047953709959983826, "rewards/VisualizationJSONCombinedORM/mean": 0.49337852001190186, "rewards/VisualizationJSONCombinedORM/std": 0.24161919951438904, "step": 3630, "train_speed(iter/s)": 10.020754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 271.8125, "completions/min_length": 232.0, "epoch": 3.0033085194375517, "grad_norm": 0.1646922379732132, "kl": 0.0888671875, "learning_rate": 4.1210186003282275e-06, "loss": 0.0008884556591510773, "memory(GiB)": 37.35, "reward": 0.6056864261627197, "reward_std": 0.0684674009680748, "rewards/VisualizationJSONCombinedORM/mean": 0.6056864261627197, "rewards/VisualizationJSONCombinedORM/std": 0.15953585505485535, "step": 3631, "train_speed(iter/s)": 9.391434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 330.25, "completions/min_length": 261.0, "epoch": 3.0041356492969395, "grad_norm": 0.2024202197790146, "kl": 0.05694580078125, "learning_rate": 4.118176221990134e-06, "loss": 0.000570300966501236, "memory(GiB)": 37.35, "reward": 0.5651835203170776, "reward_std": 0.0438486710190773, "rewards/VisualizationJSONCombinedORM/mean": 0.5651835203170776, "rewards/VisualizationJSONCombinedORM/std": 0.05237523466348648, "step": 3632, "train_speed(iter/s)": 8.868001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 287.9375, "completions/min_length": 251.0, "epoch": 3.0049627791563274, "grad_norm": 0.1746111363172531, "kl": 0.1099853515625, "learning_rate": 4.115334137744397e-06, "loss": 0.0011003725230693817, "memory(GiB)": 37.35, "reward": 0.45992282032966614, "reward_std": 0.08192899823188782, "rewards/VisualizationJSONCombinedORM/mean": 0.45992282032966614, "rewards/VisualizationJSONCombinedORM/std": 0.08761128783226013, "step": 3633, "train_speed(iter/s)": 8.451436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 273.25, "completions/min_length": 241.0, "epoch": 3.0057899090157156, "grad_norm": 0.17727598547935486, "kl": 0.083251953125, "learning_rate": 4.112492348538867e-06, "loss": 0.0008347518742084503, "memory(GiB)": 37.35, "reward": 0.7174803018569946, "reward_std": 0.04317308962345123, "rewards/VisualizationJSONCombinedORM/mean": 0.7174803018569946, "rewards/VisualizationJSONCombinedORM/std": 0.06633505970239639, "step": 3634, "train_speed(iter/s)": 7.970679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 294.25, "completions/min_length": 244.0, "epoch": 3.0066170388751035, "grad_norm": 0.19828633964061737, "kl": 0.07025146484375, "learning_rate": 4.109650855321291e-06, "loss": 0.0007008658722043037, "memory(GiB)": 37.35, "reward": 0.31361275911331177, "reward_std": 0.045956868678331375, "rewards/VisualizationJSONCombinedORM/mean": 0.31361275911331177, "rewards/VisualizationJSONCombinedORM/std": 0.04581919312477112, "step": 3635, "train_speed(iter/s)": 7.608611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 289.0, "completions/min_length": 247.0, "epoch": 3.0074441687344913, "grad_norm": 0.2036546766757965, "kl": 0.04803466796875, "learning_rate": 4.106809659039325e-06, "loss": 0.0004811026155948639, "memory(GiB)": 37.66, "reward": 0.570777416229248, "reward_std": 0.09551619738340378, "rewards/VisualizationJSONCombinedORM/mean": 0.570777416229248, "rewards/VisualizationJSONCombinedORM/std": 0.24811838567256927, "step": 3636, "train_speed(iter/s)": 7.199449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 320.125, "completions/min_length": 246.0, "epoch": 3.008271298593879, "grad_norm": 0.15398690104484558, "kl": 0.04510498046875, "learning_rate": 4.103968760640516e-06, "loss": 0.0004505142569541931, "memory(GiB)": 37.66, "reward": 0.4302101731300354, "reward_std": 0.06473293900489807, "rewards/VisualizationJSONCombinedORM/mean": 0.4302101731300354, "rewards/VisualizationJSONCombinedORM/std": 0.08197331428527832, "step": 3637, "train_speed(iter/s)": 6.909329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 303.125, "completions/min_length": 236.0, "epoch": 3.0090984284532674, "grad_norm": 0.2206803262233734, "kl": 0.083984375, "learning_rate": 4.1011281610723235e-06, "loss": 0.0008391942828893661, "memory(GiB)": 37.66, "reward": 0.5662204027175903, "reward_std": 0.07348398119211197, "rewards/VisualizationJSONCombinedORM/mean": 0.5662204027175903, "rewards/VisualizationJSONCombinedORM/std": 0.23486091196537018, "step": 3638, "train_speed(iter/s)": 6.638805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 296.25, "completions/min_length": 242.0, "epoch": 3.009925558312655, "grad_norm": 0.22548159956932068, "kl": 0.08349609375, "learning_rate": 4.0982878612820934e-06, "loss": 0.0008351728320121765, "memory(GiB)": 37.66, "reward": 0.4286966919898987, "reward_std": 0.07639345526695251, "rewards/VisualizationJSONCombinedORM/mean": 0.4286966919898987, "rewards/VisualizationJSONCombinedORM/std": 0.13369125127792358, "step": 3639, "train_speed(iter/s)": 6.325752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 285.75, "completions/min_length": 234.0, "epoch": 3.010752688172043, "grad_norm": 0.19273212552070618, "kl": 0.131103515625, "learning_rate": 4.095447862217084e-06, "loss": 0.001310255378484726, "memory(GiB)": 37.66, "reward": 0.509604811668396, "reward_std": 0.07706703990697861, "rewards/VisualizationJSONCombinedORM/mean": 0.509604811668396, "rewards/VisualizationJSONCombinedORM/std": 0.11099959909915924, "step": 3640, "train_speed(iter/s)": 6.076006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 295.1875, "completions/min_length": 231.0, "epoch": 3.011579818031431, "grad_norm": 0.1720178723335266, "kl": 0.0482177734375, "learning_rate": 4.092608164824446e-06, "loss": 0.0004818905144929886, "memory(GiB)": 37.66, "reward": 0.5038402080535889, "reward_std": 0.04224628955125809, "rewards/VisualizationJSONCombinedORM/mean": 0.5038402080535889, "rewards/VisualizationJSONCombinedORM/std": 0.15233775973320007, "step": 3641, "train_speed(iter/s)": 5.834107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 287.6875, "completions/min_length": 221.0, "epoch": 3.0124069478908186, "grad_norm": 0.18677936494350433, "kl": 0.10595703125, "learning_rate": 4.089768770051233e-06, "loss": 0.0010589510202407837, "memory(GiB)": 37.66, "reward": 0.5996974110603333, "reward_std": 0.0884028822183609, "rewards/VisualizationJSONCombinedORM/mean": 0.5996974110603333, "rewards/VisualizationJSONCombinedORM/std": 0.11385300010442734, "step": 3642, "train_speed(iter/s)": 5.613902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 288.0625, "completions/min_length": 239.0, "epoch": 3.013234077750207, "grad_norm": 0.28025656938552856, "kl": 0.0933837890625, "learning_rate": 4.086929678844396e-06, "loss": 0.0009339526295661926, "memory(GiB)": 37.92, "reward": 0.6957410573959351, "reward_std": 0.10040765255689621, "rewards/VisualizationJSONCombinedORM/mean": 0.6957410573959351, "rewards/VisualizationJSONCombinedORM/std": 0.10739470273256302, "step": 3643, "train_speed(iter/s)": 5.363129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 291.4375, "completions/min_length": 238.0, "epoch": 3.0140612076095947, "grad_norm": 0.2250673621892929, "kl": 0.0543212890625, "learning_rate": 4.084090892150786e-06, "loss": 0.0005446672439575195, "memory(GiB)": 37.92, "reward": 0.6484887003898621, "reward_std": 0.09732864052057266, "rewards/VisualizationJSONCombinedORM/mean": 0.6484887003898621, "rewards/VisualizationJSONCombinedORM/std": 0.09593424201011658, "step": 3644, "train_speed(iter/s)": 5.171527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 287.875, "completions/min_length": 229.0, "epoch": 3.0148883374689825, "grad_norm": 0.2175181806087494, "kl": 0.1494140625, "learning_rate": 4.081252410917148e-06, "loss": 0.0014930367469787598, "memory(GiB)": 37.92, "reward": 0.6550692915916443, "reward_std": 0.08258463442325592, "rewards/VisualizationJSONCombinedORM/mean": 0.6550692915916443, "rewards/VisualizationJSONCombinedORM/std": 0.13255365192890167, "step": 3645, "train_speed(iter/s)": 5.007426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 302.875, "completions/min_length": 260.0, "epoch": 3.0157154673283704, "grad_norm": 0.2056775987148285, "kl": 0.10302734375, "learning_rate": 4.0784142360901355e-06, "loss": 0.001028645783662796, "memory(GiB)": 37.92, "reward": 0.7519092559814453, "reward_std": 0.07767722010612488, "rewards/VisualizationJSONCombinedORM/mean": 0.7519092559814453, "rewards/VisualizationJSONCombinedORM/std": 0.0906311571598053, "step": 3646, "train_speed(iter/s)": 4.83153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 323.6875, "completions/min_length": 258.0, "epoch": 3.0165425971877586, "grad_norm": 0.18496668338775635, "kl": 0.05999755859375, "learning_rate": 4.075576368616286e-06, "loss": 0.0006011426448822021, "memory(GiB)": 37.92, "reward": 0.5383440852165222, "reward_std": 0.05556916818022728, "rewards/VisualizationJSONCombinedORM/mean": 0.5383440852165222, "rewards/VisualizationJSONCombinedORM/std": 0.19940485060214996, "step": 3647, "train_speed(iter/s)": 4.657672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 339.5, "completions/min_length": 251.0, "epoch": 3.0173697270471465, "grad_norm": 0.2216198742389679, "kl": 0.08441162109375, "learning_rate": 4.072738809442046e-06, "loss": 0.0008442141115665436, "memory(GiB)": 37.92, "reward": 0.4814627766609192, "reward_std": 0.09281858801841736, "rewards/VisualizationJSONCombinedORM/mean": 0.4814627766609192, "rewards/VisualizationJSONCombinedORM/std": 0.11400105804204941, "step": 3648, "train_speed(iter/s)": 4.506672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 292.0625, "completions/min_length": 238.0, "epoch": 3.0181968569065343, "grad_norm": 0.1901719719171524, "kl": 0.06634521484375, "learning_rate": 4.0699015595137535e-06, "loss": 0.0006633996963500977, "memory(GiB)": 37.92, "reward": 0.4265768527984619, "reward_std": 0.058278605341911316, "rewards/VisualizationJSONCombinedORM/mean": 0.4265768527984619, "rewards/VisualizationJSONCombinedORM/std": 0.07293430715799332, "step": 3649, "train_speed(iter/s)": 4.362425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 308.0, "completions/min_length": 231.0, "epoch": 3.019023986765922, "grad_norm": 0.21475636959075928, "kl": 0.0706787109375, "learning_rate": 4.067064619777645e-06, "loss": 0.0007072535809129477, "memory(GiB)": 37.92, "reward": 0.6270104646682739, "reward_std": 0.07514585554599762, "rewards/VisualizationJSONCombinedORM/mean": 0.6270104646682739, "rewards/VisualizationJSONCombinedORM/std": 0.07307115197181702, "step": 3650, "train_speed(iter/s)": 4.225 }, { "epoch": 3.019023986765922, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 355.5416666666667, "eval_completions/mean_length": 302.203125, "eval_completions/min_length": 252.66666666666666, "eval_kl": 0.080108642578125, "eval_loss": 0.0007990089361555874, "eval_reward": 0.44949062789479893, "eval_reward_std": 0.0672242664732039, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44949062789479893, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06722427012088399, "eval_runtime": 305.2374, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 315.25, "completions/min_length": 232.0, "epoch": 3.0198511166253104, "grad_norm": 0.18612541258335114, "kl": 0.072509765625, "learning_rate": 4.064227991179852e-06, "loss": 0.0007239188998937607, "memory(GiB)": 37.92, "reward": 0.6665639281272888, "reward_std": 0.06144531071186066, "rewards/VisualizationJSONCombinedORM/mean": 0.6665639281272888, "rewards/VisualizationJSONCombinedORM/std": 0.06791917234659195, "step": 3651, "train_speed(iter/s)": 3.042805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 301.0, "completions/min_length": 242.0, "epoch": 3.020678246484698, "grad_norm": 0.1510016918182373, "kl": 0.0728759765625, "learning_rate": 4.0613916746664056e-06, "loss": 0.0007274765521287918, "memory(GiB)": 37.92, "reward": 0.5055396556854248, "reward_std": 0.05346349626779556, "rewards/VisualizationJSONCombinedORM/mean": 0.5055396556854248, "rewards/VisualizationJSONCombinedORM/std": 0.06335147470235825, "step": 3652, "train_speed(iter/s)": 2.969207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 322.1875, "completions/min_length": 258.0, "epoch": 3.021505376344086, "grad_norm": 0.1714593470096588, "kl": 0.072265625, "learning_rate": 4.058555671183227e-06, "loss": 0.0007237978279590607, "memory(GiB)": 37.92, "reward": 0.6362894773483276, "reward_std": 0.0336969830095768, "rewards/VisualizationJSONCombinedORM/mean": 0.6362894773483276, "rewards/VisualizationJSONCombinedORM/std": 0.1588803380727768, "step": 3653, "train_speed(iter/s)": 2.916469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 313.25, "completions/min_length": 242.0, "epoch": 3.022332506203474, "grad_norm": 0.17773830890655518, "kl": 0.0875244140625, "learning_rate": 4.055719981676142e-06, "loss": 0.0008741319179534912, "memory(GiB)": 37.92, "reward": 0.45828092098236084, "reward_std": 0.04911928251385689, "rewards/VisualizationJSONCombinedORM/mean": 0.45828092098236084, "rewards/VisualizationJSONCombinedORM/std": 0.20433233678340912, "step": 3654, "train_speed(iter/s)": 2.853049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 303.1875, "completions/min_length": 256.0, "epoch": 3.023159636062862, "grad_norm": 0.18128377199172974, "kl": 0.1109619140625, "learning_rate": 4.05288460709086e-06, "loss": 0.001108996570110321, "memory(GiB)": 37.92, "reward": 0.22952012717723846, "reward_std": 0.024289654567837715, "rewards/VisualizationJSONCombinedORM/mean": 0.22952012717723846, "rewards/VisualizationJSONCombinedORM/std": 0.07822084426879883, "step": 3655, "train_speed(iter/s)": 2.800627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 299.75, "completions/min_length": 220.0, "epoch": 3.02398676592225, "grad_norm": 0.17830446362495422, "kl": 0.0738525390625, "learning_rate": 4.050049548372998e-06, "loss": 0.0007385425269603729, "memory(GiB)": 37.92, "reward": 0.49863100051879883, "reward_std": 0.0664280354976654, "rewards/VisualizationJSONCombinedORM/mean": 0.49863100051879883, "rewards/VisualizationJSONCombinedORM/std": 0.07735804468393326, "step": 3656, "train_speed(iter/s)": 2.73501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 321.3125, "completions/min_length": 266.0, "epoch": 3.0248138957816377, "grad_norm": 0.16628752648830414, "kl": 0.088531494140625, "learning_rate": 4.047214806468056e-06, "loss": 0.0008892249315977097, "memory(GiB)": 37.92, "reward": 0.6870429515838623, "reward_std": 0.09071020781993866, "rewards/VisualizationJSONCombinedORM/mean": 0.6870429515838623, "rewards/VisualizationJSONCombinedORM/std": 0.18949344754219055, "step": 3657, "train_speed(iter/s)": 2.684134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 289.9375, "completions/min_length": 223.0, "epoch": 3.0256410256410255, "grad_norm": 0.2111404836177826, "kl": 0.08221435546875, "learning_rate": 4.044380382321437e-06, "loss": 0.0008231792598962784, "memory(GiB)": 37.92, "reward": 0.5524927377700806, "reward_std": 0.06338480114936829, "rewards/VisualizationJSONCombinedORM/mean": 0.5524927377700806, "rewards/VisualizationJSONCombinedORM/std": 0.17434117197990417, "step": 3658, "train_speed(iter/s)": 2.637604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 296.3125, "completions/min_length": 228.0, "epoch": 3.0264681555004134, "grad_norm": 0.2406189739704132, "kl": 0.1046142578125, "learning_rate": 4.041546276878433e-06, "loss": 0.0010458454489707947, "memory(GiB)": 37.92, "reward": 0.579572319984436, "reward_std": 0.11198629438877106, "rewards/VisualizationJSONCombinedORM/mean": 0.579572319984436, "rewards/VisualizationJSONCombinedORM/std": 0.10822964459657669, "step": 3659, "train_speed(iter/s)": 2.589819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 332.6875, "completions/min_length": 237.0, "epoch": 3.0272952853598016, "grad_norm": 0.20989350974559784, "kl": 0.0521240234375, "learning_rate": 4.038712491084234e-06, "loss": 0.0005225404165685177, "memory(GiB)": 37.92, "reward": 0.5729241371154785, "reward_std": 0.04825339466333389, "rewards/VisualizationJSONCombinedORM/mean": 0.5729241371154785, "rewards/VisualizationJSONCombinedORM/std": 0.1955355554819107, "step": 3660, "train_speed(iter/s)": 2.541909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 335.9375, "completions/min_length": 247.0, "epoch": 3.0281224152191895, "grad_norm": 0.2146928757429123, "kl": 0.06591796875, "learning_rate": 4.035879025883916e-06, "loss": 0.0006594154983758926, "memory(GiB)": 37.92, "reward": 0.475757360458374, "reward_std": 0.04850402846932411, "rewards/VisualizationJSONCombinedORM/mean": 0.475757360458374, "rewards/VisualizationJSONCombinedORM/std": 0.1873701512813568, "step": 3661, "train_speed(iter/s)": 2.498902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 294.625, "completions/min_length": 228.0, "epoch": 3.0289495450785773, "grad_norm": 0.20710283517837524, "kl": 0.03106689453125, "learning_rate": 4.033045882222459e-06, "loss": 0.00031106825917959213, "memory(GiB)": 37.92, "reward": 0.6556621789932251, "reward_std": 0.06084747239947319, "rewards/VisualizationJSONCombinedORM/mean": 0.6556621789932251, "rewards/VisualizationJSONCombinedORM/std": 0.16677211225032806, "step": 3662, "train_speed(iter/s)": 2.456063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 301.0625, "completions/min_length": 232.0, "epoch": 3.029776674937965, "grad_norm": 0.3494482934474945, "kl": 0.06842041015625, "learning_rate": 4.030213061044724e-06, "loss": 0.0006837248802185059, "memory(GiB)": 37.92, "reward": 0.5841824412345886, "reward_std": 0.06551536917686462, "rewards/VisualizationJSONCombinedORM/mean": 0.5841824412345886, "rewards/VisualizationJSONCombinedORM/std": 0.2771912217140198, "step": 3663, "train_speed(iter/s)": 2.417295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 327.6875, "completions/min_length": 254.0, "epoch": 3.0306038047973534, "grad_norm": 0.18307164311408997, "kl": 0.1607666015625, "learning_rate": 4.027380563295475e-06, "loss": 0.0016030073165893555, "memory(GiB)": 37.92, "reward": 0.42785221338272095, "reward_std": 0.06537552922964096, "rewards/VisualizationJSONCombinedORM/mean": 0.42785221338272095, "rewards/VisualizationJSONCombinedORM/std": 0.1345260888338089, "step": 3664, "train_speed(iter/s)": 2.376656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 289.0, "completions/min_length": 252.0, "epoch": 3.031430934656741, "grad_norm": 0.3142770528793335, "kl": 0.20782470703125, "learning_rate": 4.02454838991936e-06, "loss": 0.0020714551210403442, "memory(GiB)": 37.92, "reward": 0.5100129246711731, "reward_std": 0.046797268092632294, "rewards/VisualizationJSONCombinedORM/mean": 0.5100129246711731, "rewards/VisualizationJSONCombinedORM/std": 0.2556315064430237, "step": 3665, "train_speed(iter/s)": 2.338218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 283.125, "completions/min_length": 219.0, "epoch": 3.032258064516129, "grad_norm": 0.2134644240140915, "kl": 0.10784912109375, "learning_rate": 4.0217165418609215e-06, "loss": 0.001077122986316681, "memory(GiB)": 37.92, "reward": 0.6945209503173828, "reward_std": 0.06822288781404495, "rewards/VisualizationJSONCombinedORM/mean": 0.6945209503173828, "rewards/VisualizationJSONCombinedORM/std": 0.09667017310857773, "step": 3666, "train_speed(iter/s)": 2.305509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 269.0, "completions/min_length": 214.0, "epoch": 3.033085194375517, "grad_norm": 0.2210143655538559, "kl": 0.06231689453125, "learning_rate": 4.018885020064598e-06, "loss": 0.0006211884319782257, "memory(GiB)": 37.92, "reward": 0.43006327748298645, "reward_std": 0.06256872415542603, "rewards/VisualizationJSONCombinedORM/mean": 0.43006327748298645, "rewards/VisualizationJSONCombinedORM/std": 0.15408450365066528, "step": 3667, "train_speed(iter/s)": 2.269102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 321.0, "completions/min_length": 247.0, "epoch": 3.033912324234905, "grad_norm": 0.24642524123191833, "kl": 0.1153564453125, "learning_rate": 4.01605382547471e-06, "loss": 0.0011535026133060455, "memory(GiB)": 37.92, "reward": 0.5948686003684998, "reward_std": 0.06039271131157875, "rewards/VisualizationJSONCombinedORM/mean": 0.5948686003684998, "rewards/VisualizationJSONCombinedORM/std": 0.1570650190114975, "step": 3668, "train_speed(iter/s)": 2.236886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 302.0, "completions/min_length": 248.0, "epoch": 3.034739454094293, "grad_norm": 0.18206168711185455, "kl": 0.0513916015625, "learning_rate": 4.013222959035481e-06, "loss": 0.0005138255655765533, "memory(GiB)": 37.92, "reward": 0.5608614683151245, "reward_std": 0.041417304426431656, "rewards/VisualizationJSONCombinedORM/mean": 0.5608614683151245, "rewards/VisualizationJSONCombinedORM/std": 0.23124967515468597, "step": 3669, "train_speed(iter/s)": 2.202188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 281.125, "completions/min_length": 225.0, "epoch": 3.0355665839536807, "grad_norm": 0.19624201953411102, "kl": 0.0308837890625, "learning_rate": 4.0103924216910104e-06, "loss": 0.0003082975745201111, "memory(GiB)": 37.92, "reward": 0.6789692044258118, "reward_std": 0.03409622609615326, "rewards/VisualizationJSONCombinedORM/mean": 0.6789692044258118, "rewards/VisualizationJSONCombinedORM/std": 0.20105792582035065, "step": 3670, "train_speed(iter/s)": 2.16939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 270.375, "completions/min_length": 193.0, "epoch": 3.0363937138130686, "grad_norm": 0.16459912061691284, "kl": 0.0423583984375, "learning_rate": 4.0075622143853025e-06, "loss": 0.0004238635301589966, "memory(GiB)": 37.92, "reward": 0.4864968955516815, "reward_std": 0.06450193375349045, "rewards/VisualizationJSONCombinedORM/mean": 0.4864968955516815, "rewards/VisualizationJSONCombinedORM/std": 0.20942270755767822, "step": 3671, "train_speed(iter/s)": 2.135525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 293.75, "completions/min_length": 227.0, "epoch": 3.0372208436724564, "grad_norm": 0.17270682752132416, "kl": 0.107666015625, "learning_rate": 4.004732338062239e-06, "loss": 0.0010773763060569763, "memory(GiB)": 37.92, "reward": 0.474859356880188, "reward_std": 0.04360485076904297, "rewards/VisualizationJSONCombinedORM/mean": 0.474859356880188, "rewards/VisualizationJSONCombinedORM/std": 0.13934758305549622, "step": 3672, "train_speed(iter/s)": 2.105387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 288.5, "completions/min_length": 237.0, "epoch": 3.0380479735318446, "grad_norm": 0.1667853444814682, "kl": 0.05950927734375, "learning_rate": 4.001902793665602e-06, "loss": 0.0005944743752479553, "memory(GiB)": 37.92, "reward": 0.7219652533531189, "reward_std": 0.03457153961062431, "rewards/VisualizationJSONCombinedORM/mean": 0.7219652533531189, "rewards/VisualizationJSONCombinedORM/std": 0.0409105084836483, "step": 3673, "train_speed(iter/s)": 2.07826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 340.125, "completions/min_length": 246.0, "epoch": 3.0388751033912325, "grad_norm": 0.19665563106536865, "kl": 0.04547119140625, "learning_rate": 3.999073582139054e-06, "loss": 0.00045492593199014664, "memory(GiB)": 37.92, "reward": 0.7430731654167175, "reward_std": 0.049131087958812714, "rewards/VisualizationJSONCombinedORM/mean": 0.7430731654167175, "rewards/VisualizationJSONCombinedORM/std": 0.051364995539188385, "step": 3674, "train_speed(iter/s)": 2.044992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 296.3125, "completions/min_length": 256.0, "epoch": 3.0397022332506203, "grad_norm": 0.16909337043762207, "kl": 0.13427734375, "learning_rate": 3.996244704426153e-06, "loss": 0.0013464801013469696, "memory(GiB)": 37.92, "reward": 0.7189929485321045, "reward_std": 0.052697502076625824, "rewards/VisualizationJSONCombinedORM/mean": 0.7189929485321045, "rewards/VisualizationJSONCombinedORM/std": 0.11636806279420853, "step": 3675, "train_speed(iter/s)": 2.017589 }, { "epoch": 3.0397022332506203, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 370.375, "eval_completions/mean_length": 301.3489583333333, "eval_completions/min_length": 247.125, "eval_kl": 0.09916178385416667, "eval_loss": 0.0010361032327637076, "eval_reward": 0.4646853419641654, "eval_reward_std": 0.056930907109441854, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4646853419641654, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05693090796315422, "eval_runtime": 314.3681, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 3675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 269.25, "completions/min_length": 219.0, "epoch": 3.040529363110008, "grad_norm": 0.2125430852174759, "kl": 0.1156005859375, "learning_rate": 3.993416161470339e-06, "loss": 0.0011571720242500305, "memory(GiB)": 37.92, "reward": 0.36177194118499756, "reward_std": 0.06061915308237076, "rewards/VisualizationJSONCombinedORM/mean": 0.36177194118499756, "rewards/VisualizationJSONCombinedORM/std": 0.06410381197929382, "step": 3676, "train_speed(iter/s)": 1.69769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 321.25, "completions/min_length": 255.0, "epoch": 3.0413564929693964, "grad_norm": 0.19759885966777802, "kl": 0.212158203125, "learning_rate": 3.99058795421495e-06, "loss": 0.0021214187145233154, "memory(GiB)": 37.92, "reward": 0.46954530477523804, "reward_std": 0.04777764156460762, "rewards/VisualizationJSONCombinedORM/mean": 0.46954530477523804, "rewards/VisualizationJSONCombinedORM/std": 0.18236596882343292, "step": 3677, "train_speed(iter/s)": 1.67425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 285.0625, "completions/min_length": 228.0, "epoch": 3.042183622828784, "grad_norm": 0.20648318529129028, "kl": 0.071044921875, "learning_rate": 3.9877600836032004e-06, "loss": 0.000712025910615921, "memory(GiB)": 37.92, "reward": 0.3614036738872528, "reward_std": 0.04450615495443344, "rewards/VisualizationJSONCombinedORM/mean": 0.3614036738872528, "rewards/VisualizationJSONCombinedORM/std": 0.05036747083067894, "step": 3678, "train_speed(iter/s)": 1.655042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 309.0625, "completions/min_length": 237.0, "epoch": 3.043010752688172, "grad_norm": 0.199196919798851, "kl": 0.090576171875, "learning_rate": 3.984932550578204e-06, "loss": 0.0009073317050933838, "memory(GiB)": 37.92, "reward": 0.636583685874939, "reward_std": 0.102666474878788, "rewards/VisualizationJSONCombinedORM/mean": 0.636583685874939, "rewards/VisualizationJSONCombinedORM/std": 0.10728522390127182, "step": 3679, "train_speed(iter/s)": 1.635758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 310.875, "completions/min_length": 244.0, "epoch": 3.04383788254756, "grad_norm": 0.29041624069213867, "kl": 0.07550048828125, "learning_rate": 3.982105356082951e-06, "loss": 0.000756479799747467, "memory(GiB)": 37.92, "reward": 0.6435480713844299, "reward_std": 0.11358655989170074, "rewards/VisualizationJSONCombinedORM/mean": 0.6435480713844299, "rewards/VisualizationJSONCombinedORM/std": 0.11060253530740738, "step": 3680, "train_speed(iter/s)": 1.617927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 305.8125, "completions/min_length": 241.0, "epoch": 3.044665012406948, "grad_norm": 0.17297030985355377, "kl": 0.0894775390625, "learning_rate": 3.979278501060328e-06, "loss": 0.0008920095860958099, "memory(GiB)": 37.92, "reward": 0.6211329698562622, "reward_std": 0.1000019982457161, "rewards/VisualizationJSONCombinedORM/mean": 0.6211329698562622, "rewards/VisualizationJSONCombinedORM/std": 0.12341493368148804, "step": 3681, "train_speed(iter/s)": 1.600781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 306.4375, "completions/min_length": 233.0, "epoch": 3.045492142266336, "grad_norm": 0.18003319203853607, "kl": 0.1156005859375, "learning_rate": 3.9764519864531026e-06, "loss": 0.0011538490653038025, "memory(GiB)": 37.92, "reward": 0.5056173205375671, "reward_std": 0.022123700007796288, "rewards/VisualizationJSONCombinedORM/mean": 0.5056173205375671, "rewards/VisualizationJSONCombinedORM/std": 0.2782997786998749, "step": 3682, "train_speed(iter/s)": 1.581787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 292.0, "completions/min_length": 250.0, "epoch": 3.0463192721257237, "grad_norm": 0.17012803256511688, "kl": 0.2008056640625, "learning_rate": 3.9736258132039315e-06, "loss": 0.002002980560064316, "memory(GiB)": 37.92, "reward": 0.7159460783004761, "reward_std": 0.13070520758628845, "rewards/VisualizationJSONCombinedORM/mean": 0.7159460783004761, "rewards/VisualizationJSONCombinedORM/std": 0.1383042186498642, "step": 3683, "train_speed(iter/s)": 1.566109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 300.4375, "completions/min_length": 237.0, "epoch": 3.0471464019851116, "grad_norm": 0.1876666247844696, "kl": 0.069091796875, "learning_rate": 3.970799982255354e-06, "loss": 0.0006911158561706543, "memory(GiB)": 37.98, "reward": 0.4652632176876068, "reward_std": 0.05148187279701233, "rewards/VisualizationJSONCombinedORM/mean": 0.4652632176876068, "rewards/VisualizationJSONCombinedORM/std": 0.21614894270896912, "step": 3684, "train_speed(iter/s)": 1.544855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 316.1875, "completions/min_length": 252.0, "epoch": 3.0479735318444994, "grad_norm": 0.19635185599327087, "kl": 0.129638671875, "learning_rate": 3.967974494549803e-06, "loss": 0.0012971572577953339, "memory(GiB)": 37.98, "reward": 0.726999044418335, "reward_std": 0.07586903870105743, "rewards/VisualizationJSONCombinedORM/mean": 0.726999044418335, "rewards/VisualizationJSONCombinedORM/std": 0.07430793344974518, "step": 3685, "train_speed(iter/s)": 1.527586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 302.8125, "completions/min_length": 250.0, "epoch": 3.0488006617038876, "grad_norm": 0.17428411543369293, "kl": 0.0887451171875, "learning_rate": 3.9651493510295855e-06, "loss": 0.0008892342448234558, "memory(GiB)": 37.98, "reward": 0.7338019013404846, "reward_std": 0.08329527080059052, "rewards/VisualizationJSONCombinedORM/mean": 0.7338019013404846, "rewards/VisualizationJSONCombinedORM/std": 0.09977119415998459, "step": 3686, "train_speed(iter/s)": 1.515375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 319.25, "completions/min_length": 269.0, "epoch": 3.0496277915632755, "grad_norm": 0.290962278842926, "kl": 0.1878662109375, "learning_rate": 3.962324552636906e-06, "loss": 0.0018734484910964966, "memory(GiB)": 37.98, "reward": 0.5560338497161865, "reward_std": 0.11969995498657227, "rewards/VisualizationJSONCombinedORM/mean": 0.5560338497161865, "rewards/VisualizationJSONCombinedORM/std": 0.11863379180431366, "step": 3687, "train_speed(iter/s)": 1.500756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 308.1875, "completions/min_length": 219.0, "epoch": 3.0504549214226633, "grad_norm": 0.20485712587833405, "kl": 0.103759765625, "learning_rate": 3.959500100313845e-06, "loss": 0.001038936898112297, "memory(GiB)": 37.98, "reward": 0.49733179807662964, "reward_std": 0.05306107550859451, "rewards/VisualizationJSONCombinedORM/mean": 0.49733179807662964, "rewards/VisualizationJSONCombinedORM/std": 0.26238366961479187, "step": 3688, "train_speed(iter/s)": 1.483675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 323.0625, "completions/min_length": 259.0, "epoch": 3.051282051282051, "grad_norm": 0.1770845353603363, "kl": 0.05133056640625, "learning_rate": 3.956675995002372e-06, "loss": 0.0005116909742355347, "memory(GiB)": 37.98, "reward": 0.5627468824386597, "reward_std": 0.05366414785385132, "rewards/VisualizationJSONCombinedORM/mean": 0.5627468824386597, "rewards/VisualizationJSONCombinedORM/std": 0.14370787143707275, "step": 3689, "train_speed(iter/s)": 1.469456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 303.8125, "completions/min_length": 251.0, "epoch": 3.0521091811414394, "grad_norm": 0.2517356276512146, "kl": 0.11572265625, "learning_rate": 3.953852237644337e-06, "loss": 0.001157280057668686, "memory(GiB)": 37.98, "reward": 0.33830028772354126, "reward_std": 0.042680077254772186, "rewards/VisualizationJSONCombinedORM/mean": 0.33830028772354126, "rewards/VisualizationJSONCombinedORM/std": 0.16770945489406586, "step": 3690, "train_speed(iter/s)": 1.45412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 307.875, "completions/min_length": 220.0, "epoch": 3.052936311000827, "grad_norm": 0.2592047452926636, "kl": 0.1082763671875, "learning_rate": 3.951028829181479e-06, "loss": 0.001083660521544516, "memory(GiB)": 37.98, "reward": 0.533179759979248, "reward_std": 0.08045268058776855, "rewards/VisualizationJSONCombinedORM/mean": 0.533179759979248, "rewards/VisualizationJSONCombinedORM/std": 0.10288644582033157, "step": 3691, "train_speed(iter/s)": 1.439485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 317.875, "completions/min_length": 243.0, "epoch": 3.053763440860215, "grad_norm": 0.25706547498703003, "kl": 0.10565185546875, "learning_rate": 3.948205770555414e-06, "loss": 0.001057066023349762, "memory(GiB)": 37.98, "reward": 0.4082641303539276, "reward_std": 0.07876698672771454, "rewards/VisualizationJSONCombinedORM/mean": 0.4082641303539276, "rewards/VisualizationJSONCombinedORM/std": 0.24198304116725922, "step": 3692, "train_speed(iter/s)": 1.425753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 291.3125, "completions/min_length": 226.0, "epoch": 3.054590570719603, "grad_norm": 0.16532154381275177, "kl": 0.04638671875, "learning_rate": 3.945383062707652e-06, "loss": 0.000463852658867836, "memory(GiB)": 37.98, "reward": 0.41397571563720703, "reward_std": 0.01970015838742256, "rewards/VisualizationJSONCombinedORM/mean": 0.41397571563720703, "rewards/VisualizationJSONCombinedORM/std": 0.023476364091038704, "step": 3693, "train_speed(iter/s)": 1.410367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 286.6875, "completions/min_length": 248.0, "epoch": 3.055417700578991, "grad_norm": 0.16628611087799072, "kl": 0.07415771484375, "learning_rate": 3.942560706579571e-06, "loss": 0.0007414743304252625, "memory(GiB)": 37.98, "reward": 0.7007910013198853, "reward_std": 0.06518015265464783, "rewards/VisualizationJSONCombinedORM/mean": 0.7007910013198853, "rewards/VisualizationJSONCombinedORM/std": 0.10057547688484192, "step": 3694, "train_speed(iter/s)": 1.39632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 329.625, "completions/min_length": 277.0, "epoch": 3.056244830438379, "grad_norm": 0.15598797798156738, "kl": 0.072265625, "learning_rate": 3.939738703112447e-06, "loss": 0.000723525881767273, "memory(GiB)": 37.98, "reward": 0.6842085123062134, "reward_std": 0.08721044659614563, "rewards/VisualizationJSONCombinedORM/mean": 0.6842085123062134, "rewards/VisualizationJSONCombinedORM/std": 0.12059559673070908, "step": 3695, "train_speed(iter/s)": 1.382108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 314.5, "completions/min_length": 256.0, "epoch": 3.0570719602977667, "grad_norm": 0.18346306681632996, "kl": 0.1190185546875, "learning_rate": 3.936917053247428e-06, "loss": 0.001188628375530243, "memory(GiB)": 37.98, "reward": 0.7239639163017273, "reward_std": 0.06445953994989395, "rewards/VisualizationJSONCombinedORM/mean": 0.7239639163017273, "rewards/VisualizationJSONCombinedORM/std": 0.0691375732421875, "step": 3696, "train_speed(iter/s)": 1.36928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 310.5625, "completions/min_length": 253.0, "epoch": 3.0578990901571546, "grad_norm": 0.21428793668746948, "kl": 0.07806396484375, "learning_rate": 3.934095757925549e-06, "loss": 0.0007809735834598541, "memory(GiB)": 37.98, "reward": 0.8273327350616455, "reward_std": 0.06164763122797012, "rewards/VisualizationJSONCombinedORM/mean": 0.8273327350616455, "rewards/VisualizationJSONCombinedORM/std": 0.06736711412668228, "step": 3697, "train_speed(iter/s)": 1.355817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 267.0625, "completions/min_length": 219.0, "epoch": 3.058726220016543, "grad_norm": 0.17726187407970428, "kl": 0.09375, "learning_rate": 3.931274818087722e-06, "loss": 0.000939343124628067, "memory(GiB)": 37.98, "reward": 0.6635335683822632, "reward_std": 0.08222460746765137, "rewards/VisualizationJSONCombinedORM/mean": 0.6635335683822632, "rewards/VisualizationJSONCombinedORM/std": 0.09285573661327362, "step": 3698, "train_speed(iter/s)": 1.341158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 298.4375, "completions/min_length": 244.0, "epoch": 3.0595533498759306, "grad_norm": 0.19950780272483826, "kl": 0.1107177734375, "learning_rate": 3.928454234674748e-06, "loss": 0.0011061355471611023, "memory(GiB)": 37.98, "reward": 0.729204535484314, "reward_std": 0.06530507653951645, "rewards/VisualizationJSONCombinedORM/mean": 0.729204535484314, "rewards/VisualizationJSONCombinedORM/std": 0.09084636718034744, "step": 3699, "train_speed(iter/s)": 1.331533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 312.9375, "completions/min_length": 241.0, "epoch": 3.0603804797353185, "grad_norm": 0.18892289698123932, "kl": 0.082275390625, "learning_rate": 3.925634008627299e-06, "loss": 0.0008228272199630737, "memory(GiB)": 37.98, "reward": 0.41211971640586853, "reward_std": 0.054790087044239044, "rewards/VisualizationJSONCombinedORM/mean": 0.41211971640586853, "rewards/VisualizationJSONCombinedORM/std": 0.05400742217898369, "step": 3700, "train_speed(iter/s)": 1.318195 }, { "epoch": 3.0603804797353185, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 357.3333333333333, "eval_completions/mean_length": 297.7552083333333, "eval_completions/min_length": 249.29166666666666, "eval_kl": 0.09693400065104167, "eval_loss": 0.0009679955546744168, "eval_reward": 0.4933760116497676, "eval_reward_std": 0.07089999589758615, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4933760116497676, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0708999994288509, "eval_runtime": 305.8631, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 267.5, "completions/min_length": 223.0, "epoch": 3.0612076095947063, "grad_norm": 0.20403167605400085, "kl": 0.06103515625, "learning_rate": 3.922814140885942e-06, "loss": 0.0006107743829488754, "memory(GiB)": 37.98, "reward": 0.5882036685943604, "reward_std": 0.08238844573497772, "rewards/VisualizationJSONCombinedORM/mean": 0.5882036685943604, "rewards/VisualizationJSONCombinedORM/std": 0.18604440987110138, "step": 3701, "train_speed(iter/s)": 1.181611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 317.625, "completions/min_length": 244.0, "epoch": 3.062034739454094, "grad_norm": 0.15355226397514343, "kl": 0.0419921875, "learning_rate": 3.919994632391108e-06, "loss": 0.00042003393173217773, "memory(GiB)": 37.98, "reward": 0.49947160482406616, "reward_std": 0.04722989350557327, "rewards/VisualizationJSONCombinedORM/mean": 0.49947160482406616, "rewards/VisualizationJSONCombinedORM/std": 0.10561104863882065, "step": 3702, "train_speed(iter/s)": 1.171498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 303.5625, "completions/min_length": 198.0, "epoch": 3.0628618693134824, "grad_norm": 0.19686058163642883, "kl": 0.0555419921875, "learning_rate": 3.91717548408312e-06, "loss": 0.0005556158721446991, "memory(GiB)": 37.98, "reward": 0.5160407423973083, "reward_std": 0.04706989973783493, "rewards/VisualizationJSONCombinedORM/mean": 0.5160407423973083, "rewards/VisualizationJSONCombinedORM/std": 0.07578915357589722, "step": 3703, "train_speed(iter/s)": 1.160798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 319.1875, "completions/min_length": 226.0, "epoch": 3.06368899917287, "grad_norm": 0.2030734121799469, "kl": 0.068115234375, "learning_rate": 3.914356696902177e-06, "loss": 0.000680290162563324, "memory(GiB)": 37.98, "reward": 0.320448100566864, "reward_std": 0.03058362565934658, "rewards/VisualizationJSONCombinedORM/mean": 0.320448100566864, "rewards/VisualizationJSONCombinedORM/std": 0.08131097257137299, "step": 3704, "train_speed(iter/s)": 1.151249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 295.75, "completions/min_length": 229.0, "epoch": 3.064516129032258, "grad_norm": 0.17781813442707062, "kl": 0.1373291015625, "learning_rate": 3.911538271788359e-06, "loss": 0.0013696085661649704, "memory(GiB)": 37.98, "reward": 0.33164557814598083, "reward_std": 0.03160290792584419, "rewards/VisualizationJSONCombinedORM/mean": 0.33164557814598083, "rewards/VisualizationJSONCombinedORM/std": 0.1214892715215683, "step": 3705, "train_speed(iter/s)": 1.143161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 299.1875, "completions/min_length": 242.0, "epoch": 3.065343258891646, "grad_norm": 0.19072866439819336, "kl": 0.092041015625, "learning_rate": 3.90872020968162e-06, "loss": 0.0009222924709320068, "memory(GiB)": 37.98, "reward": 0.6706634759902954, "reward_std": 0.1037123054265976, "rewards/VisualizationJSONCombinedORM/mean": 0.6706634759902954, "rewards/VisualizationJSONCombinedORM/std": 0.1067834123969078, "step": 3706, "train_speed(iter/s)": 1.134316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 306.1875, "completions/min_length": 246.0, "epoch": 3.066170388751034, "grad_norm": 0.18800196051597595, "kl": 0.0396728515625, "learning_rate": 3.9059025115218016e-06, "loss": 0.0003963299095630646, "memory(GiB)": 37.98, "reward": 0.7197510004043579, "reward_std": 0.08262956142425537, "rewards/VisualizationJSONCombinedORM/mean": 0.7197510004043579, "rewards/VisualizationJSONCombinedORM/std": 0.1053634062409401, "step": 3707, "train_speed(iter/s)": 1.126295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 296.375, "completions/min_length": 222.0, "epoch": 3.066997518610422, "grad_norm": 0.2028558999300003, "kl": 0.0506591796875, "learning_rate": 3.9030851782486145e-06, "loss": 0.0005071447230875492, "memory(GiB)": 37.98, "reward": 0.564010739326477, "reward_std": 0.07262551784515381, "rewards/VisualizationJSONCombinedORM/mean": 0.564010739326477, "rewards/VisualizationJSONCombinedORM/std": 0.21428297460079193, "step": 3708, "train_speed(iter/s)": 1.116493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 313.8125, "completions/min_length": 252.0, "epoch": 3.0678246484698097, "grad_norm": 0.21430666744709015, "kl": 0.0899658203125, "learning_rate": 3.9002682108016585e-06, "loss": 0.0008994191884994507, "memory(GiB)": 37.98, "reward": 0.6313173770904541, "reward_std": 0.07505194842815399, "rewards/VisualizationJSONCombinedORM/mean": 0.6313173770904541, "rewards/VisualizationJSONCombinedORM/std": 0.11522391438484192, "step": 3709, "train_speed(iter/s)": 1.10819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 296.3125, "completions/min_length": 257.0, "epoch": 3.0686517783291976, "grad_norm": 0.18182054162025452, "kl": 0.0791015625, "learning_rate": 3.897451610120399e-06, "loss": 0.0007896348834037781, "memory(GiB)": 37.98, "reward": 0.5336146354675293, "reward_std": 0.0766029804944992, "rewards/VisualizationJSONCombinedORM/mean": 0.5336146354675293, "rewards/VisualizationJSONCombinedORM/std": 0.07492761313915253, "step": 3710, "train_speed(iter/s)": 1.100315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 298.5625, "completions/min_length": 249.0, "epoch": 3.069478908188586, "grad_norm": 0.17616166174411774, "kl": 0.04779052734375, "learning_rate": 3.894635377144189e-06, "loss": 0.00047829002141952515, "memory(GiB)": 37.98, "reward": 0.39552250504493713, "reward_std": 0.033242207020521164, "rewards/VisualizationJSONCombinedORM/mean": 0.39552250504493713, "rewards/VisualizationJSONCombinedORM/std": 0.21618613600730896, "step": 3711, "train_speed(iter/s)": 1.092246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 294.3125, "completions/min_length": 221.0, "epoch": 3.0703060380479736, "grad_norm": 0.1736905425786972, "kl": 0.095947265625, "learning_rate": 3.891819512812256e-06, "loss": 0.0009617134928703308, "memory(GiB)": 37.98, "reward": 0.6711243391036987, "reward_std": 0.07662692666053772, "rewards/VisualizationJSONCombinedORM/mean": 0.6711243391036987, "rewards/VisualizationJSONCombinedORM/std": 0.087867371737957, "step": 3712, "train_speed(iter/s)": 1.084185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 272.4375, "completions/min_length": 220.0, "epoch": 3.0711331679073615, "grad_norm": 0.217751607298851, "kl": 0.10009765625, "learning_rate": 3.889004018063702e-06, "loss": 0.0009990260004997253, "memory(GiB)": 37.98, "reward": 0.5635277032852173, "reward_std": 0.06835930049419403, "rewards/VisualizationJSONCombinedORM/mean": 0.5635277032852173, "rewards/VisualizationJSONCombinedORM/std": 0.25101548433303833, "step": 3713, "train_speed(iter/s)": 1.07504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 309.25, "completions/min_length": 244.0, "epoch": 3.0719602977667493, "grad_norm": 0.18121492862701416, "kl": 0.04193115234375, "learning_rate": 3.886188893837509e-06, "loss": 0.0004197265952825546, "memory(GiB)": 37.98, "reward": 0.3870454430580139, "reward_std": 0.07481656223535538, "rewards/VisualizationJSONCombinedORM/mean": 0.3870454430580139, "rewards/VisualizationJSONCombinedORM/std": 0.13401800394058228, "step": 3714, "train_speed(iter/s)": 1.066903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 313.625, "completions/min_length": 236.0, "epoch": 3.072787427626137, "grad_norm": 0.17640598118305206, "kl": 0.05120849609375, "learning_rate": 3.883374141072534e-06, "loss": 0.0005124285817146301, "memory(GiB)": 37.98, "reward": 0.5339611768722534, "reward_std": 0.05447734519839287, "rewards/VisualizationJSONCombinedORM/mean": 0.5339611768722534, "rewards/VisualizationJSONCombinedORM/std": 0.1662999540567398, "step": 3715, "train_speed(iter/s)": 1.05859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 310.625, "completions/min_length": 246.0, "epoch": 3.0736145574855254, "grad_norm": 0.21938809752464294, "kl": 0.0889892578125, "learning_rate": 3.880559760707508e-06, "loss": 0.0008919313549995422, "memory(GiB)": 37.98, "reward": 0.4354288578033447, "reward_std": 0.06717456877231598, "rewards/VisualizationJSONCombinedORM/mean": 0.4354288578033447, "rewards/VisualizationJSONCombinedORM/std": 0.09008937329053879, "step": 3716, "train_speed(iter/s)": 1.052213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 276.6875, "completions/min_length": 217.0, "epoch": 3.074441687344913, "grad_norm": 0.19779179990291595, "kl": 0.078857421875, "learning_rate": 3.8777457536810446e-06, "loss": 0.0007877424359321594, "memory(GiB)": 37.98, "reward": 0.4160986840724945, "reward_std": 0.045872729271650314, "rewards/VisualizationJSONCombinedORM/mean": 0.4160986840724945, "rewards/VisualizationJSONCombinedORM/std": 0.22665512561798096, "step": 3717, "train_speed(iter/s)": 1.044259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 320.625, "completions/min_length": 257.0, "epoch": 3.075268817204301, "grad_norm": 0.17235814034938812, "kl": 0.04345703125, "learning_rate": 3.874932120931622e-06, "loss": 0.00043415650725364685, "memory(GiB)": 37.98, "reward": 0.5126162767410278, "reward_std": 0.05347203090786934, "rewards/VisualizationJSONCombinedORM/mean": 0.5126162767410278, "rewards/VisualizationJSONCombinedORM/std": 0.10029163956642151, "step": 3718, "train_speed(iter/s)": 1.036767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 309.5, "completions/min_length": 239.0, "epoch": 3.076095947063689, "grad_norm": 0.2340659499168396, "kl": 0.06268310546875, "learning_rate": 3.872118863397607e-06, "loss": 0.0006268620491027832, "memory(GiB)": 37.98, "reward": 0.3796517252922058, "reward_std": 0.052134204655885696, "rewards/VisualizationJSONCombinedORM/mean": 0.3796517252922058, "rewards/VisualizationJSONCombinedORM/std": 0.10031183809041977, "step": 3719, "train_speed(iter/s)": 1.029914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 311.75, "completions/min_length": 253.0, "epoch": 3.076923076923077, "grad_norm": 0.18381120264530182, "kl": 0.04766845703125, "learning_rate": 3.869305982017229e-06, "loss": 0.0004774387925863266, "memory(GiB)": 37.98, "reward": 0.5467682480812073, "reward_std": 0.06525585800409317, "rewards/VisualizationJSONCombinedORM/mean": 0.5467682480812073, "rewards/VisualizationJSONCombinedORM/std": 0.15658196806907654, "step": 3720, "train_speed(iter/s)": 1.022849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 307.8125, "completions/min_length": 233.0, "epoch": 3.077750206782465, "grad_norm": 0.17106185853481293, "kl": 0.048828125, "learning_rate": 3.866493477728599e-06, "loss": 0.0004890859127044678, "memory(GiB)": 37.98, "reward": 0.7051622867584229, "reward_std": 0.043271198868751526, "rewards/VisualizationJSONCombinedORM/mean": 0.7051622867584229, "rewards/VisualizationJSONCombinedORM/std": 0.07276073843240738, "step": 3721, "train_speed(iter/s)": 1.015783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 280.5, "completions/min_length": 218.0, "epoch": 3.0785773366418527, "grad_norm": 0.17638294398784637, "kl": 0.04644775390625, "learning_rate": 3.8636813514697e-06, "loss": 0.00046421587467193604, "memory(GiB)": 37.98, "reward": 0.7249408960342407, "reward_std": 0.07787764817476273, "rewards/VisualizationJSONCombinedORM/mean": 0.7249408960342407, "rewards/VisualizationJSONCombinedORM/std": 0.07719367742538452, "step": 3722, "train_speed(iter/s)": 1.00994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 312.625, "completions/min_length": 221.0, "epoch": 3.0794044665012406, "grad_norm": 0.234904944896698, "kl": 0.04547119140625, "learning_rate": 3.86086960417839e-06, "loss": 0.0004556458443403244, "memory(GiB)": 37.98, "reward": 0.4718777537345886, "reward_std": 0.08173585683107376, "rewards/VisualizationJSONCombinedORM/mean": 0.4718777537345886, "rewards/VisualizationJSONCombinedORM/std": 0.08525233715772629, "step": 3723, "train_speed(iter/s)": 1.003285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 269.375, "completions/min_length": 224.0, "epoch": 3.080231596360629, "grad_norm": 0.16033495962619781, "kl": 0.08544921875, "learning_rate": 3.858058236792398e-06, "loss": 0.0008546076714992523, "memory(GiB)": 37.98, "reward": 0.6491361856460571, "reward_std": 0.07434844970703125, "rewards/VisualizationJSONCombinedORM/mean": 0.6491361856460571, "rewards/VisualizationJSONCombinedORM/std": 0.11810535937547684, "step": 3724, "train_speed(iter/s)": 0.995236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 292.25, "completions/min_length": 227.0, "epoch": 3.0810587262200166, "grad_norm": 0.17504169046878815, "kl": 0.06390380859375, "learning_rate": 3.855247250249331e-06, "loss": 0.0006376262754201889, "memory(GiB)": 37.98, "reward": 0.7753236889839172, "reward_std": 0.07122966647148132, "rewards/VisualizationJSONCombinedORM/mean": 0.7753236889839172, "rewards/VisualizationJSONCombinedORM/std": 0.07280239462852478, "step": 3725, "train_speed(iter/s)": 0.98924 }, { "epoch": 3.0810587262200166, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 375.5, "eval_completions/mean_length": 307.5989583333333, "eval_completions/min_length": 256.4583333333333, "eval_kl": 0.08185831705729167, "eval_loss": 0.0008234716951847076, "eval_reward": 0.4717661129931609, "eval_reward_std": 0.06505470940222342, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4717661129931609, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06505470816046, "eval_runtime": 317.0927, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 3725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 283.6875, "completions/min_length": 202.0, "epoch": 3.0818858560794045, "grad_norm": 0.15850669145584106, "kl": 0.02972412109375, "learning_rate": 3.852436645486662e-06, "loss": 0.00029720738530158997, "memory(GiB)": 37.98, "reward": 0.4970141649246216, "reward_std": 0.05574222654104233, "rewards/VisualizationJSONCombinedORM/mean": 0.4970141649246216, "rewards/VisualizationJSONCombinedORM/std": 0.2735748589038849, "step": 3726, "train_speed(iter/s)": 0.906186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 314.875, "completions/min_length": 238.0, "epoch": 3.0827129859387923, "grad_norm": 0.1922670602798462, "kl": 0.189453125, "learning_rate": 3.8496264234417465e-06, "loss": 0.0018983930349349976, "memory(GiB)": 37.98, "reward": 0.5003427863121033, "reward_std": 0.032344620674848557, "rewards/VisualizationJSONCombinedORM/mean": 0.5003427863121033, "rewards/VisualizationJSONCombinedORM/std": 0.2923291027545929, "step": 3727, "train_speed(iter/s)": 0.900832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 317.1875, "completions/min_length": 258.0, "epoch": 3.08354011579818, "grad_norm": 0.21366026997566223, "kl": 0.07470703125, "learning_rate": 3.846816585051802e-06, "loss": 0.0007470399141311646, "memory(GiB)": 37.98, "reward": 0.647899866104126, "reward_std": 0.06026436761021614, "rewards/VisualizationJSONCombinedORM/mean": 0.647899866104126, "rewards/VisualizationJSONCombinedORM/std": 0.1556122899055481, "step": 3728, "train_speed(iter/s)": 0.895416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 297.3125, "completions/min_length": 229.0, "epoch": 3.0843672456575684, "grad_norm": 0.23593728244304657, "kl": 0.134521484375, "learning_rate": 3.844007131253925e-06, "loss": 0.0013451837003231049, "memory(GiB)": 37.98, "reward": 0.5477480888366699, "reward_std": 0.08507543802261353, "rewards/VisualizationJSONCombinedORM/mean": 0.5477480888366699, "rewards/VisualizationJSONCombinedORM/std": 0.11226102709770203, "step": 3729, "train_speed(iter/s)": 0.889382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 311.625, "completions/min_length": 244.0, "epoch": 3.085194375516956, "grad_norm": 0.17008720338344574, "kl": 0.06951904296875, "learning_rate": 3.84119806298508e-06, "loss": 0.00069429911673069, "memory(GiB)": 37.98, "reward": 0.677010178565979, "reward_std": 0.06532920897006989, "rewards/VisualizationJSONCombinedORM/mean": 0.677010178565979, "rewards/VisualizationJSONCombinedORM/std": 0.08183697611093521, "step": 3730, "train_speed(iter/s)": 0.88393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 307.9375, "completions/min_length": 250.0, "epoch": 3.086021505376344, "grad_norm": 0.1739175021648407, "kl": 0.05938720703125, "learning_rate": 3.838389381182107e-06, "loss": 0.0005938448011875153, "memory(GiB)": 37.98, "reward": 0.7298154830932617, "reward_std": 0.08802482485771179, "rewards/VisualizationJSONCombinedORM/mean": 0.7298154830932617, "rewards/VisualizationJSONCombinedORM/std": 0.09143224358558655, "step": 3731, "train_speed(iter/s)": 0.878841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 299.0625, "completions/min_length": 233.0, "epoch": 3.086848635235732, "grad_norm": 0.17989128828048706, "kl": 0.052490234375, "learning_rate": 3.83558108678171e-06, "loss": 0.0005259718745946884, "memory(GiB)": 37.98, "reward": 0.28369542956352234, "reward_std": 0.02630610391497612, "rewards/VisualizationJSONCombinedORM/mean": 0.28369542956352234, "rewards/VisualizationJSONCombinedORM/std": 0.03438267484307289, "step": 3732, "train_speed(iter/s)": 0.873355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 278.375, "completions/min_length": 235.0, "epoch": 3.08767576509512, "grad_norm": 0.22948062419891357, "kl": 0.0809326171875, "learning_rate": 3.832773180720475e-06, "loss": 0.0008108764886856079, "memory(GiB)": 37.98, "reward": 0.5910813808441162, "reward_std": 0.08133678138256073, "rewards/VisualizationJSONCombinedORM/mean": 0.5910813808441162, "rewards/VisualizationJSONCombinedORM/std": 0.13020263612270355, "step": 3733, "train_speed(iter/s)": 0.867908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 324.3125, "completions/min_length": 260.0, "epoch": 3.088502894954508, "grad_norm": 0.20341932773590088, "kl": 0.080078125, "learning_rate": 3.829965663934844e-06, "loss": 0.0008019730448722839, "memory(GiB)": 37.98, "reward": 0.3206997215747833, "reward_std": 0.02595880627632141, "rewards/VisualizationJSONCombinedORM/mean": 0.3206997215747833, "rewards/VisualizationJSONCombinedORM/std": 0.040137890726327896, "step": 3734, "train_speed(iter/s)": 0.863578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 302.0, "completions/min_length": 269.0, "epoch": 3.0893300248138957, "grad_norm": 0.1857469528913498, "kl": 0.0897216796875, "learning_rate": 3.827158537361144e-06, "loss": 0.0008964613080024719, "memory(GiB)": 37.98, "reward": 0.5895817279815674, "reward_std": 0.045471180230379105, "rewards/VisualizationJSONCombinedORM/mean": 0.5895817279815674, "rewards/VisualizationJSONCombinedORM/std": 0.16344477236270905, "step": 3735, "train_speed(iter/s)": 0.858647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 277.625, "completions/min_length": 235.0, "epoch": 3.0901571546732836, "grad_norm": 0.17378731071949005, "kl": 0.0693359375, "learning_rate": 3.82435180193556e-06, "loss": 0.0006931610405445099, "memory(GiB)": 37.98, "reward": 0.48267054557800293, "reward_std": 0.02909594029188156, "rewards/VisualizationJSONCombinedORM/mean": 0.48267054557800293, "rewards/VisualizationJSONCombinedORM/std": 0.055067989975214005, "step": 3736, "train_speed(iter/s)": 0.854643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 310.375, "completions/min_length": 252.0, "epoch": 3.090984284532672, "grad_norm": 0.1998782604932785, "kl": 0.1116943359375, "learning_rate": 3.821545458594155e-06, "loss": 0.0011152401566505432, "memory(GiB)": 37.98, "reward": 0.7527846097946167, "reward_std": 0.09958325326442719, "rewards/VisualizationJSONCombinedORM/mean": 0.7527846097946167, "rewards/VisualizationJSONCombinedORM/std": 0.14109371602535248, "step": 3737, "train_speed(iter/s)": 0.84863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 310.4375, "completions/min_length": 247.0, "epoch": 3.0918114143920596, "grad_norm": 0.18333800137043, "kl": 0.084228515625, "learning_rate": 3.818739508272854e-06, "loss": 0.000841313973069191, "memory(GiB)": 37.98, "reward": 0.4876868724822998, "reward_std": 0.049308452755212784, "rewards/VisualizationJSONCombinedORM/mean": 0.4876868724822998, "rewards/VisualizationJSONCombinedORM/std": 0.09959196299314499, "step": 3738, "train_speed(iter/s)": 0.844111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 278.1875, "completions/min_length": 221.0, "epoch": 3.0926385442514475, "grad_norm": 0.2979108393192291, "kl": 0.3179931640625, "learning_rate": 3.815933951907458e-06, "loss": 0.0031720250844955444, "memory(GiB)": 37.98, "reward": 0.47056514024734497, "reward_std": 0.04555290937423706, "rewards/VisualizationJSONCombinedORM/mean": 0.47056514024734497, "rewards/VisualizationJSONCombinedORM/std": 0.1628977507352829, "step": 3739, "train_speed(iter/s)": 0.839331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 275.875, "completions/min_length": 234.0, "epoch": 3.0934656741108353, "grad_norm": 0.23432408273220062, "kl": 0.091552734375, "learning_rate": 3.8131287904336288e-06, "loss": 0.0009156018495559692, "memory(GiB)": 37.98, "reward": 0.5253063440322876, "reward_std": 0.06004806607961655, "rewards/VisualizationJSONCombinedORM/mean": 0.5253063440322876, "rewards/VisualizationJSONCombinedORM/std": 0.13534659147262573, "step": 3740, "train_speed(iter/s)": 0.83538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 278.5, "completions/min_length": 245.0, "epoch": 3.094292803970223, "grad_norm": 0.21033567190170288, "kl": 0.0965576171875, "learning_rate": 3.8103240247869077e-06, "loss": 0.0009660013020038605, "memory(GiB)": 37.98, "reward": 0.5830449461936951, "reward_std": 0.058751270174980164, "rewards/VisualizationJSONCombinedORM/mean": 0.5830449461936951, "rewards/VisualizationJSONCombinedORM/std": 0.11017882078886032, "step": 3741, "train_speed(iter/s)": 0.829889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/mean_length": 317.6875, "completions/min_length": 241.0, "epoch": 3.0951199338296114, "grad_norm": 0.20123933255672455, "kl": 0.1468505859375, "learning_rate": 3.80751965590269e-06, "loss": 0.0014640167355537415, "memory(GiB)": 37.98, "reward": 0.6935189962387085, "reward_std": 0.05279422178864479, "rewards/VisualizationJSONCombinedORM/mean": 0.6935189962387085, "rewards/VisualizationJSONCombinedORM/std": 0.06346695125102997, "step": 3742, "train_speed(iter/s)": 0.825248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 308.25, "completions/min_length": 239.0, "epoch": 3.095947063688999, "grad_norm": 0.16493871808052063, "kl": 0.05072021484375, "learning_rate": 3.804715684716251e-06, "loss": 0.0005072280764579773, "memory(GiB)": 37.98, "reward": 0.5992458462715149, "reward_std": 0.06755687296390533, "rewards/VisualizationJSONCombinedORM/mean": 0.5992458462715149, "rewards/VisualizationJSONCombinedORM/std": 0.2085234671831131, "step": 3743, "train_speed(iter/s)": 0.820607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 298.25, "completions/min_length": 236.0, "epoch": 3.096774193548387, "grad_norm": 0.18362529575824738, "kl": 0.088623046875, "learning_rate": 3.801912112162725e-06, "loss": 0.0008857995271682739, "memory(GiB)": 37.98, "reward": 0.6614412665367126, "reward_std": 0.046553902328014374, "rewards/VisualizationJSONCombinedORM/mean": 0.6614412665367126, "rewards/VisualizationJSONCombinedORM/std": 0.09559886157512665, "step": 3744, "train_speed(iter/s)": 0.816369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 303.125, "completions/min_length": 238.0, "epoch": 3.097601323407775, "grad_norm": 0.16966764628887177, "kl": 0.117431640625, "learning_rate": 3.7991089391771185e-06, "loss": 0.001171547919511795, "memory(GiB)": 37.98, "reward": 0.6450368165969849, "reward_std": 0.08316582441329956, "rewards/VisualizationJSONCombinedORM/mean": 0.6450368165969849, "rewards/VisualizationJSONCombinedORM/std": 0.08188163489103317, "step": 3745, "train_speed(iter/s)": 0.812195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 314.1875, "completions/min_length": 234.0, "epoch": 3.098428453267163, "grad_norm": 0.16486205160617828, "kl": 0.040130615234375, "learning_rate": 3.796306166694302e-06, "loss": 0.0004013478755950928, "memory(GiB)": 37.98, "reward": 0.4155820608139038, "reward_std": 0.05163147300481796, "rewards/VisualizationJSONCombinedORM/mean": 0.4155820608139038, "rewards/VisualizationJSONCombinedORM/std": 0.056532762944698334, "step": 3746, "train_speed(iter/s)": 0.807579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 296.0, "completions/min_length": 240.0, "epoch": 3.099255583126551, "grad_norm": 0.19105136394500732, "kl": 0.07763671875, "learning_rate": 3.793503795649014e-06, "loss": 0.0007758215069770813, "memory(GiB)": 37.98, "reward": 0.5592327117919922, "reward_std": 0.06010405346751213, "rewards/VisualizationJSONCombinedORM/mean": 0.5592327117919922, "rewards/VisualizationJSONCombinedORM/std": 0.07348591089248657, "step": 3747, "train_speed(iter/s)": 0.802886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 280.5625, "completions/min_length": 213.0, "epoch": 3.1000827129859387, "grad_norm": 0.1810222566127777, "kl": 0.074462890625, "learning_rate": 3.7907018269758557e-06, "loss": 0.0007437318563461304, "memory(GiB)": 37.98, "reward": 0.5959060192108154, "reward_std": 0.0496339350938797, "rewards/VisualizationJSONCombinedORM/mean": 0.5959060192108154, "rewards/VisualizationJSONCombinedORM/std": 0.24767416715621948, "step": 3748, "train_speed(iter/s)": 0.798707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 313.375, "completions/min_length": 191.0, "epoch": 3.1009098428453266, "grad_norm": 0.1624191701412201, "kl": 0.04962158203125, "learning_rate": 3.7879002616093015e-06, "loss": 0.0004967823624610901, "memory(GiB)": 37.98, "reward": 0.4559474289417267, "reward_std": 0.05816374719142914, "rewards/VisualizationJSONCombinedORM/mean": 0.4559474289417267, "rewards/VisualizationJSONCombinedORM/std": 0.06670037657022476, "step": 3749, "train_speed(iter/s)": 0.794102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 287.4375, "completions/min_length": 237.0, "epoch": 3.101736972704715, "grad_norm": 0.21838368475437164, "kl": 0.0948486328125, "learning_rate": 3.7850991004836813e-06, "loss": 0.000948835164308548, "memory(GiB)": 37.98, "reward": 0.4267025589942932, "reward_std": 0.06476110219955444, "rewards/VisualizationJSONCombinedORM/mean": 0.4267025589942932, "rewards/VisualizationJSONCombinedORM/std": 0.15485544502735138, "step": 3750, "train_speed(iter/s)": 0.790027 }, { "epoch": 3.101736972704715, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 344.4166666666667, "eval_completions/mean_length": 288.4375, "eval_completions/min_length": 241.25, "eval_kl": 0.070831298828125, "eval_loss": 0.0007072227890603244, "eval_reward": 0.4703984602044026, "eval_reward_std": 0.05404519472115984, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4703984602044026, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05404519278090447, "eval_runtime": 298.3316, "eval_samples_per_second": 0.08, "eval_steps_per_second": 0.01, "step": 3750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 301.625, "completions/min_length": 245.0, "epoch": 3.1025641025641026, "grad_norm": 0.17626364529132843, "kl": 0.07354736328125, "learning_rate": 3.7822983445331985e-06, "loss": 0.0007362216711044312, "memory(GiB)": 37.98, "reward": 0.5109649300575256, "reward_std": 0.1723533719778061, "rewards/VisualizationJSONCombinedORM/mean": 0.5109649300575256, "rewards/VisualizationJSONCombinedORM/std": 0.1719837486743927, "step": 3751, "train_speed(iter/s)": 0.739254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 291.125, "completions/min_length": 235.0, "epoch": 3.1033912324234905, "grad_norm": 0.20078600943088531, "kl": 0.0450439453125, "learning_rate": 3.7794979946919193e-06, "loss": 0.00045107677578926086, "memory(GiB)": 37.98, "reward": 0.6039096117019653, "reward_std": 0.06970903277397156, "rewards/VisualizationJSONCombinedORM/mean": 0.6039096117019653, "rewards/VisualizationJSONCombinedORM/std": 0.17889288067817688, "step": 3752, "train_speed(iter/s)": 0.735525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 262.4375, "completions/min_length": 194.0, "epoch": 3.1042183622828783, "grad_norm": 0.19426625967025757, "kl": 0.04644775390625, "learning_rate": 3.7766980518937705e-06, "loss": 0.0004650019109249115, "memory(GiB)": 37.98, "reward": 0.22513863444328308, "reward_std": 0.019009748473763466, "rewards/VisualizationJSONCombinedORM/mean": 0.22513863444328308, "rewards/VisualizationJSONCombinedORM/std": 0.022154631093144417, "step": 3753, "train_speed(iter/s)": 0.732326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 282.1875, "completions/min_length": 242.0, "epoch": 3.1050454921422665, "grad_norm": 0.2279680073261261, "kl": 0.0794677734375, "learning_rate": 3.773898517072549e-06, "loss": 0.0007945410907268524, "memory(GiB)": 37.98, "reward": 0.6875485181808472, "reward_std": 0.07040659338235855, "rewards/VisualizationJSONCombinedORM/mean": 0.6875485181808472, "rewards/VisualizationJSONCombinedORM/std": 0.135043203830719, "step": 3754, "train_speed(iter/s)": 0.729011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 318.375, "completions/min_length": 241.0, "epoch": 3.1058726220016544, "grad_norm": 0.18041864037513733, "kl": 0.05657958984375, "learning_rate": 3.7710993911619093e-06, "loss": 0.0005655959248542786, "memory(GiB)": 37.98, "reward": 0.5171014070510864, "reward_std": 0.04578910022974014, "rewards/VisualizationJSONCombinedORM/mean": 0.5171014070510864, "rewards/VisualizationJSONCombinedORM/std": 0.2782205641269684, "step": 3755, "train_speed(iter/s)": 0.725364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 285.125, "completions/min_length": 227.0, "epoch": 3.106699751861042, "grad_norm": 0.17179764807224274, "kl": 0.0831298828125, "learning_rate": 3.768300675095378e-06, "loss": 0.0008304864168167114, "memory(GiB)": 37.98, "reward": 0.7456798553466797, "reward_std": 0.061115555465221405, "rewards/VisualizationJSONCombinedORM/mean": 0.7456798553466797, "rewards/VisualizationJSONCombinedORM/std": 0.09687545895576477, "step": 3756, "train_speed(iter/s)": 0.722377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 296.3125, "completions/min_length": 235.0, "epoch": 3.10752688172043, "grad_norm": 0.18062105774879456, "kl": 0.1220703125, "learning_rate": 3.765502369806334e-06, "loss": 0.0012193117290735245, "memory(GiB)": 37.98, "reward": 0.48817670345306396, "reward_std": 0.05820954218506813, "rewards/VisualizationJSONCombinedORM/mean": 0.48817670345306396, "rewards/VisualizationJSONCombinedORM/std": 0.26967161893844604, "step": 3757, "train_speed(iter/s)": 0.719085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 249.0, "completions/min_length": 205.0, "epoch": 3.108354011579818, "grad_norm": 0.13172481954097748, "kl": 0.0382080078125, "learning_rate": 3.7627044762280307e-06, "loss": 0.0003822594881057739, "memory(GiB)": 37.98, "reward": 0.6018322706222534, "reward_std": 0.06773265451192856, "rewards/VisualizationJSONCombinedORM/mean": 0.6018322706222534, "rewards/VisualizationJSONCombinedORM/std": 0.1290796846151352, "step": 3758, "train_speed(iter/s)": 0.71648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 268.8125, "completions/min_length": 221.0, "epoch": 3.109181141439206, "grad_norm": 0.21610000729560852, "kl": 0.08453369140625, "learning_rate": 3.759906995293575e-06, "loss": 0.0008437409996986389, "memory(GiB)": 37.98, "reward": 0.6856998205184937, "reward_std": 0.059688158333301544, "rewards/VisualizationJSONCombinedORM/mean": 0.6856998205184937, "rewards/VisualizationJSONCombinedORM/std": 0.10438578575849533, "step": 3759, "train_speed(iter/s)": 0.712871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 278.375, "completions/min_length": 208.0, "epoch": 3.110008271298594, "grad_norm": 0.17916980385780334, "kl": 0.05645751953125, "learning_rate": 3.757109927935943e-06, "loss": 0.0005635321140289307, "memory(GiB)": 37.98, "reward": 0.4265143573284149, "reward_std": 0.045093536376953125, "rewards/VisualizationJSONCombinedORM/mean": 0.4265143573284149, "rewards/VisualizationJSONCombinedORM/std": 0.049910739064216614, "step": 3760, "train_speed(iter/s)": 0.709365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 288.5625, "completions/min_length": 246.0, "epoch": 3.1108354011579817, "grad_norm": 0.21445351839065552, "kl": 0.0576171875, "learning_rate": 3.7543132750879663e-06, "loss": 0.0005770623683929443, "memory(GiB)": 37.98, "reward": 0.47985005378723145, "reward_std": 0.06396549940109253, "rewards/VisualizationJSONCombinedORM/mean": 0.47985005378723145, "rewards/VisualizationJSONCombinedORM/std": 0.06658914685249329, "step": 3761, "train_speed(iter/s)": 0.705652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 293.375, "completions/min_length": 230.0, "epoch": 3.1116625310173696, "grad_norm": 0.20967832207679749, "kl": 0.054443359375, "learning_rate": 3.7515170376823446e-06, "loss": 0.0005435943603515625, "memory(GiB)": 37.98, "reward": 0.2822408676147461, "reward_std": 0.03682943433523178, "rewards/VisualizationJSONCombinedORM/mean": 0.2822408676147461, "rewards/VisualizationJSONCombinedORM/std": 0.049726586788892746, "step": 3762, "train_speed(iter/s)": 0.702786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 297.125, "completions/min_length": 225.0, "epoch": 3.112489660876758, "grad_norm": 0.2029821276664734, "kl": 0.07061767578125, "learning_rate": 3.7487212166516327e-06, "loss": 0.0007046535611152649, "memory(GiB)": 37.98, "reward": 0.5394610166549683, "reward_std": 0.0787552073597908, "rewards/VisualizationJSONCombinedORM/mean": 0.5394610166549683, "rewards/VisualizationJSONCombinedORM/std": 0.07627803087234497, "step": 3763, "train_speed(iter/s)": 0.69937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 291.125, "completions/min_length": 216.0, "epoch": 3.1133167907361456, "grad_norm": 0.17633666098117828, "kl": 0.087158203125, "learning_rate": 3.745925812928255e-06, "loss": 0.0008704196661710739, "memory(GiB)": 37.98, "reward": 0.752690315246582, "reward_std": 0.09790787100791931, "rewards/VisualizationJSONCombinedORM/mean": 0.752690315246582, "rewards/VisualizationJSONCombinedORM/std": 0.11422424763441086, "step": 3764, "train_speed(iter/s)": 0.696653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 311.25, "completions/min_length": 252.0, "epoch": 3.1141439205955335, "grad_norm": 0.1621565818786621, "kl": 0.07720947265625, "learning_rate": 3.743130827444487e-06, "loss": 0.0007714293897151947, "memory(GiB)": 37.98, "reward": 0.44630956649780273, "reward_std": 0.05874037742614746, "rewards/VisualizationJSONCombinedORM/mean": 0.44630956649780273, "rewards/VisualizationJSONCombinedORM/std": 0.11049465090036392, "step": 3765, "train_speed(iter/s)": 0.693331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 292.5625, "completions/min_length": 231.0, "epoch": 3.1149710504549213, "grad_norm": 0.18655017018318176, "kl": 0.0709228515625, "learning_rate": 3.7403362611324723e-06, "loss": 0.00070977583527565, "memory(GiB)": 37.98, "reward": 0.41939833760261536, "reward_std": 0.06508767604827881, "rewards/VisualizationJSONCombinedORM/mean": 0.41939833760261536, "rewards/VisualizationJSONCombinedORM/std": 0.08482097089290619, "step": 3766, "train_speed(iter/s)": 0.690463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 273.9375, "completions/min_length": 216.0, "epoch": 3.1157981803143096, "grad_norm": 0.20927660167217255, "kl": 0.0826416015625, "learning_rate": 3.7375421149242102e-06, "loss": 0.0008265003561973572, "memory(GiB)": 37.98, "reward": 0.43191784620285034, "reward_std": 0.04777432605624199, "rewards/VisualizationJSONCombinedORM/mean": 0.43191784620285034, "rewards/VisualizationJSONCombinedORM/std": 0.13748995959758759, "step": 3767, "train_speed(iter/s)": 0.687416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 307.6875, "completions/min_length": 238.0, "epoch": 3.1166253101736974, "grad_norm": 0.17682240903377533, "kl": 0.05865478515625, "learning_rate": 3.7347483897515635e-06, "loss": 0.0005864053964614868, "memory(GiB)": 37.98, "reward": 0.4154987931251526, "reward_std": 0.03903408721089363, "rewards/VisualizationJSONCombinedORM/mean": 0.4154987931251526, "rewards/VisualizationJSONCombinedORM/std": 0.053040046244859695, "step": 3768, "train_speed(iter/s)": 0.684532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 297.0, "completions/min_length": 231.0, "epoch": 3.117452440033085, "grad_norm": 0.19146794080734253, "kl": 0.06280517578125, "learning_rate": 3.7319550865462506e-06, "loss": 0.0006266683340072632, "memory(GiB)": 37.98, "reward": 0.7109565734863281, "reward_std": 0.07622949033975601, "rewards/VisualizationJSONCombinedORM/mean": 0.7109565734863281, "rewards/VisualizationJSONCombinedORM/std": 0.14555148780345917, "step": 3769, "train_speed(iter/s)": 0.681086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 271.8125, "completions/min_length": 229.0, "epoch": 3.118279569892473, "grad_norm": 0.2529069781303406, "kl": 0.06689453125, "learning_rate": 3.7291622062398523e-06, "loss": 0.0006691217422485352, "memory(GiB)": 37.98, "reward": 0.5660558938980103, "reward_std": 0.09386945515871048, "rewards/VisualizationJSONCombinedORM/mean": 0.5660558938980103, "rewards/VisualizationJSONCombinedORM/std": 0.2374545931816101, "step": 3770, "train_speed(iter/s)": 0.67765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 327.3125, "completions/min_length": 234.0, "epoch": 3.119106699751861, "grad_norm": 0.16466458141803741, "kl": 0.05206298828125, "learning_rate": 3.7263697497638063e-06, "loss": 0.0005215350538492203, "memory(GiB)": 37.98, "reward": 0.6848024129867554, "reward_std": 0.04806579276919365, "rewards/VisualizationJSONCombinedORM/mean": 0.6848024129867554, "rewards/VisualizationJSONCombinedORM/std": 0.19016695022583008, "step": 3771, "train_speed(iter/s)": 0.674333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 285.1875, "completions/min_length": 221.0, "epoch": 3.119933829611249, "grad_norm": 0.1969330906867981, "kl": 0.0831298828125, "learning_rate": 3.7235777180494126e-06, "loss": 0.0008311979472637177, "memory(GiB)": 37.98, "reward": 0.6423004269599915, "reward_std": 0.07390792667865753, "rewards/VisualizationJSONCombinedORM/mean": 0.6423004269599915, "rewards/VisualizationJSONCombinedORM/std": 0.17821791768074036, "step": 3772, "train_speed(iter/s)": 0.671349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 296.4375, "completions/min_length": 242.0, "epoch": 3.120760959470637, "grad_norm": 0.16805234551429749, "kl": 0.04095458984375, "learning_rate": 3.720786112027822e-06, "loss": 0.00040984898805618286, "memory(GiB)": 37.98, "reward": 0.5179681777954102, "reward_std": 0.05818341672420502, "rewards/VisualizationJSONCombinedORM/mean": 0.5179681777954102, "rewards/VisualizationJSONCombinedORM/std": 0.21671104431152344, "step": 3773, "train_speed(iter/s)": 0.668472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 275.9375, "completions/min_length": 230.0, "epoch": 3.1215880893300247, "grad_norm": 0.22655217349529266, "kl": 0.05523681640625, "learning_rate": 3.717994932630053e-06, "loss": 0.0005531013011932373, "memory(GiB)": 37.98, "reward": 0.3154674470424652, "reward_std": 0.04455330967903137, "rewards/VisualizationJSONCombinedORM/mean": 0.3154674470424652, "rewards/VisualizationJSONCombinedORM/std": 0.04635785520076752, "step": 3774, "train_speed(iter/s)": 0.665031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 294.3125, "completions/min_length": 238.0, "epoch": 3.1224152191894126, "grad_norm": 0.180097296833992, "kl": 0.04278564453125, "learning_rate": 3.7152041807869744e-06, "loss": 0.00042767077684402466, "memory(GiB)": 37.98, "reward": 0.7093600630760193, "reward_std": 0.06094612926244736, "rewards/VisualizationJSONCombinedORM/mean": 0.7093600630760193, "rewards/VisualizationJSONCombinedORM/std": 0.16499675810337067, "step": 3775, "train_speed(iter/s)": 0.661939 }, { "epoch": 3.1224152191894126, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 355.125, "eval_completions/mean_length": 292.7916666666667, "eval_completions/min_length": 245.66666666666666, "eval_kl": 0.06959025065104167, "eval_loss": 0.000700727105140686, "eval_reward": 0.4578051306307316, "eval_reward_std": 0.05782150011509657, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4578051306307316, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05782150042553743, "eval_runtime": 304.9404, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 3775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 297.375, "completions/min_length": 229.0, "epoch": 3.123242349048801, "grad_norm": 0.20544546842575073, "kl": 0.0648193359375, "learning_rate": 3.7124138574293157e-06, "loss": 0.0006490014493465424, "memory(GiB)": 37.98, "reward": 0.3435877561569214, "reward_std": 0.04856621474027634, "rewards/VisualizationJSONCombinedORM/mean": 0.3435877561569214, "rewards/VisualizationJSONCombinedORM/std": 0.05060682073235512, "step": 3776, "train_speed(iter/s)": 0.625647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 318.5, "completions/min_length": 256.0, "epoch": 3.1240694789081886, "grad_norm": 0.16011488437652588, "kl": 0.04876708984375, "learning_rate": 3.7096239634876625e-06, "loss": 0.00048661231994628906, "memory(GiB)": 37.98, "reward": 0.5953419208526611, "reward_std": 0.05097249522805214, "rewards/VisualizationJSONCombinedORM/mean": 0.5953419208526611, "rewards/VisualizationJSONCombinedORM/std": 0.1325242668390274, "step": 3777, "train_speed(iter/s)": 0.62306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 308.75, "completions/min_length": 242.0, "epoch": 3.1248966087675765, "grad_norm": 0.18038536608219147, "kl": 0.095703125, "learning_rate": 3.706834499892458e-06, "loss": 0.0009565195068717003, "memory(GiB)": 37.98, "reward": 0.5366458892822266, "reward_std": 0.08742943406105042, "rewards/VisualizationJSONCombinedORM/mean": 0.5366458892822266, "rewards/VisualizationJSONCombinedORM/std": 0.13120965659618378, "step": 3778, "train_speed(iter/s)": 0.620426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 306.25, "completions/min_length": 230.0, "epoch": 3.1257237386269643, "grad_norm": 0.1721189022064209, "kl": 0.05426025390625, "learning_rate": 3.7040454675739994e-06, "loss": 0.0005428045988082886, "memory(GiB)": 37.98, "reward": 0.517831027507782, "reward_std": 0.09392555058002472, "rewards/VisualizationJSONCombinedORM/mean": 0.517831027507782, "rewards/VisualizationJSONCombinedORM/std": 0.14316807687282562, "step": 3779, "train_speed(iter/s)": 0.618136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 297.625, "completions/min_length": 229.0, "epoch": 3.1265508684863526, "grad_norm": 0.1941763311624527, "kl": 0.04815673828125, "learning_rate": 3.7012568674624473e-06, "loss": 0.0004806816577911377, "memory(GiB)": 37.98, "reward": 0.6022756099700928, "reward_std": 0.06369107961654663, "rewards/VisualizationJSONCombinedORM/mean": 0.6022756099700928, "rewards/VisualizationJSONCombinedORM/std": 0.16913364827632904, "step": 3780, "train_speed(iter/s)": 0.615478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 253.75, "completions/min_length": 223.0, "epoch": 3.1273779983457404, "grad_norm": 0.1710314005613327, "kl": 0.0870361328125, "learning_rate": 3.6984687004878052e-06, "loss": 0.0008712857961654663, "memory(GiB)": 37.98, "reward": 0.7850576639175415, "reward_std": 0.0747501328587532, "rewards/VisualizationJSONCombinedORM/mean": 0.7850576639175415, "rewards/VisualizationJSONCombinedORM/std": 0.11911695450544357, "step": 3781, "train_speed(iter/s)": 0.612844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 284.1875, "completions/min_length": 237.0, "epoch": 3.128205128205128, "grad_norm": 0.18128088116645813, "kl": 0.0465087890625, "learning_rate": 3.6956809675799467e-06, "loss": 0.00046479329466819763, "memory(GiB)": 37.98, "reward": 0.5629682540893555, "reward_std": 0.055260926485061646, "rewards/VisualizationJSONCombinedORM/mean": 0.5629682540893555, "rewards/VisualizationJSONCombinedORM/std": 0.06441212445497513, "step": 3782, "train_speed(iter/s)": 0.610277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 298.0, "completions/min_length": 229.0, "epoch": 3.129032258064516, "grad_norm": 0.1677902638912201, "kl": 0.0537109375, "learning_rate": 3.69289366966859e-06, "loss": 0.0005361754447221756, "memory(GiB)": 37.98, "reward": 0.6584773063659668, "reward_std": 0.0516030453145504, "rewards/VisualizationJSONCombinedORM/mean": 0.6584773063659668, "rewards/VisualizationJSONCombinedORM/std": 0.13450735807418823, "step": 3783, "train_speed(iter/s)": 0.607506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 287.0625, "completions/min_length": 224.0, "epoch": 3.1298593879239043, "grad_norm": 0.21724651753902435, "kl": 0.106201171875, "learning_rate": 3.6901068076833136e-06, "loss": 0.0010615885257720947, "memory(GiB)": 37.98, "reward": 0.4186411499977112, "reward_std": 0.06295615434646606, "rewards/VisualizationJSONCombinedORM/mean": 0.4186411499977112, "rewards/VisualizationJSONCombinedORM/std": 0.06267832964658737, "step": 3784, "train_speed(iter/s)": 0.605001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 271.875, "completions/min_length": 235.0, "epoch": 3.130686517783292, "grad_norm": 0.1854674369096756, "kl": 0.0908203125, "learning_rate": 3.6873203825535473e-06, "loss": 0.0009075235575437546, "memory(GiB)": 37.98, "reward": 0.7793766260147095, "reward_std": 0.06250659376382828, "rewards/VisualizationJSONCombinedORM/mean": 0.7793766260147095, "rewards/VisualizationJSONCombinedORM/std": 0.06468440592288971, "step": 3785, "train_speed(iter/s)": 0.602665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 293.5, "completions/min_length": 222.0, "epoch": 3.13151364764268, "grad_norm": 0.1931637078523636, "kl": 0.065673828125, "learning_rate": 3.6845343952085793e-06, "loss": 0.0006571710109710693, "memory(GiB)": 38.04, "reward": 0.4799307882785797, "reward_std": 0.06375105679035187, "rewards/VisualizationJSONCombinedORM/mean": 0.4799307882785797, "rewards/VisualizationJSONCombinedORM/std": 0.09678950160741806, "step": 3786, "train_speed(iter/s)": 0.599832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 287.75, "completions/min_length": 227.0, "epoch": 3.1323407775020677, "grad_norm": 0.17822265625, "kl": 0.11376953125, "learning_rate": 3.6817488465775462e-06, "loss": 0.0011386461555957794, "memory(GiB)": 38.04, "reward": 0.4401988387107849, "reward_std": 0.04776473343372345, "rewards/VisualizationJSONCombinedORM/mean": 0.4401988387107849, "rewards/VisualizationJSONCombinedORM/std": 0.07645699381828308, "step": 3787, "train_speed(iter/s)": 0.59761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 328.0625, "completions/min_length": 241.0, "epoch": 3.1331679073614556, "grad_norm": 0.15661945939064026, "kl": 0.12890625, "learning_rate": 3.6789637375894467e-06, "loss": 0.0012886682525277138, "memory(GiB)": 38.04, "reward": 0.538836658000946, "reward_std": 0.06576558947563171, "rewards/VisualizationJSONCombinedORM/mean": 0.538836658000946, "rewards/VisualizationJSONCombinedORM/std": 0.06622505933046341, "step": 3788, "train_speed(iter/s)": 0.595616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 297.125, "completions/min_length": 232.0, "epoch": 3.133995037220844, "grad_norm": 0.17239399254322052, "kl": 0.07159423828125, "learning_rate": 3.6761790691731207e-06, "loss": 0.0007145553827285767, "memory(GiB)": 38.04, "reward": 0.8085697293281555, "reward_std": 0.08043447136878967, "rewards/VisualizationJSONCombinedORM/mean": 0.8085697293281555, "rewards/VisualizationJSONCombinedORM/std": 0.08062063157558441, "step": 3789, "train_speed(iter/s)": 0.593431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 301.4375, "completions/min_length": 220.0, "epoch": 3.1348221670802316, "grad_norm": 0.20033687353134155, "kl": 0.06884765625, "learning_rate": 3.673394842257275e-06, "loss": 0.0006890445947647095, "memory(GiB)": 38.04, "reward": 0.5693666338920593, "reward_std": 0.06421968340873718, "rewards/VisualizationJSONCombinedORM/mean": 0.5693666338920593, "rewards/VisualizationJSONCombinedORM/std": 0.18745718896389008, "step": 3790, "train_speed(iter/s)": 0.590692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 298.6875, "completions/min_length": 238.0, "epoch": 3.1356492969396195, "grad_norm": 0.19274994730949402, "kl": 0.1593017578125, "learning_rate": 3.6706110577704568e-06, "loss": 0.0015943031758069992, "memory(GiB)": 38.04, "reward": 0.21809916198253632, "reward_std": 0.032934028655290604, "rewards/VisualizationJSONCombinedORM/mean": 0.21809916198253632, "rewards/VisualizationJSONCombinedORM/std": 0.03196802735328674, "step": 3791, "train_speed(iter/s)": 0.587983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 300.5625, "completions/min_length": 241.0, "epoch": 3.1364764267990073, "grad_norm": 0.16688503324985504, "kl": 0.1439208984375, "learning_rate": 3.6678277166410756e-06, "loss": 0.0014401078224182129, "memory(GiB)": 38.04, "reward": 0.3359782099723816, "reward_std": 0.030116595327854156, "rewards/VisualizationJSONCombinedORM/mean": 0.3359782099723816, "rewards/VisualizationJSONCombinedORM/std": 0.056232985109090805, "step": 3792, "train_speed(iter/s)": 0.585877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 281.5, "completions/min_length": 236.0, "epoch": 3.1373035566583956, "grad_norm": 0.18089789152145386, "kl": 0.09429931640625, "learning_rate": 3.6650448197973855e-06, "loss": 0.0009421315044164658, "memory(GiB)": 38.04, "reward": 0.5384659171104431, "reward_std": 0.05838339030742645, "rewards/VisualizationJSONCombinedORM/mean": 0.5384659171104431, "rewards/VisualizationJSONCombinedORM/std": 0.13706009089946747, "step": 3793, "train_speed(iter/s)": 0.583608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 291.6875, "completions/min_length": 227.0, "epoch": 3.1381306865177834, "grad_norm": 0.20613409578800201, "kl": 0.0615234375, "learning_rate": 3.6622623681674986e-06, "loss": 0.0006146803498268127, "memory(GiB)": 38.04, "reward": 0.5697369575500488, "reward_std": 0.08076991885900497, "rewards/VisualizationJSONCombinedORM/mean": 0.5697369575500488, "rewards/VisualizationJSONCombinedORM/std": 0.1438443958759308, "step": 3794, "train_speed(iter/s)": 0.581853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 269.25, "completions/min_length": 192.0, "epoch": 3.138957816377171, "grad_norm": 0.17740167677402496, "kl": 0.0560302734375, "learning_rate": 3.659480362679371e-06, "loss": 0.0005609765648841858, "memory(GiB)": 38.04, "reward": 0.6430713534355164, "reward_std": 0.07709235697984695, "rewards/VisualizationJSONCombinedORM/mean": 0.6430713534355164, "rewards/VisualizationJSONCombinedORM/std": 0.13547967374324799, "step": 3795, "train_speed(iter/s)": 0.580135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 272.1875, "completions/min_length": 230.0, "epoch": 3.139784946236559, "grad_norm": 0.22123616933822632, "kl": 0.1124267578125, "learning_rate": 3.6566988042608205e-06, "loss": 0.0011221105232834816, "memory(GiB)": 38.04, "reward": 0.37360700964927673, "reward_std": 0.04219319671392441, "rewards/VisualizationJSONCombinedORM/mean": 0.37360700964927673, "rewards/VisualizationJSONCombinedORM/std": 0.15631595253944397, "step": 3796, "train_speed(iter/s)": 0.578158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 291.25, "completions/min_length": 217.0, "epoch": 3.1406120760959473, "grad_norm": 0.22396023571491241, "kl": 0.0484619140625, "learning_rate": 3.6539176938395037e-06, "loss": 0.00048451125621795654, "memory(GiB)": 38.04, "reward": 0.7294775247573853, "reward_std": 0.07346563786268234, "rewards/VisualizationJSONCombinedORM/mean": 0.7294775247573853, "rewards/VisualizationJSONCombinedORM/std": 0.09849976748228073, "step": 3797, "train_speed(iter/s)": 0.576094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 334.6875, "completions/min_length": 273.0, "epoch": 3.141439205955335, "grad_norm": 0.16434447467327118, "kl": 0.0804443359375, "learning_rate": 3.6511370323429392e-06, "loss": 0.000804118812084198, "memory(GiB)": 38.04, "reward": 0.7019826173782349, "reward_std": 0.08577413856983185, "rewards/VisualizationJSONCombinedORM/mean": 0.7019826173782349, "rewards/VisualizationJSONCombinedORM/std": 0.12867730855941772, "step": 3798, "train_speed(iter/s)": 0.573846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 264.625, "completions/min_length": 210.0, "epoch": 3.142266335814723, "grad_norm": 0.2046751081943512, "kl": 0.1016845703125, "learning_rate": 3.6483568206984872e-06, "loss": 0.0010158047080039978, "memory(GiB)": 38.04, "reward": 0.7516867518424988, "reward_std": 0.07942669093608856, "rewards/VisualizationJSONCombinedORM/mean": 0.7516867518424988, "rewards/VisualizationJSONCombinedORM/std": 0.1569875329732895, "step": 3799, "train_speed(iter/s)": 0.571852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 299.9375, "completions/min_length": 233.0, "epoch": 3.1430934656741107, "grad_norm": 0.19102166593074799, "kl": 0.1141357421875, "learning_rate": 3.6455770598333633e-06, "loss": 0.0011384468525648117, "memory(GiB)": 38.04, "reward": 0.4993349313735962, "reward_std": 0.05666626617312431, "rewards/VisualizationJSONCombinedORM/mean": 0.4993349313735962, "rewards/VisualizationJSONCombinedORM/std": 0.11973187327384949, "step": 3800, "train_speed(iter/s)": 0.569819 }, { "epoch": 3.1430934656741107, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 341.5416666666667, "eval_completions/mean_length": 287.78125, "eval_completions/min_length": 245.83333333333334, "eval_kl": 0.082916259765625, "eval_loss": 0.0008322795038111508, "eval_reward": 0.4906271683673064, "eval_reward_std": 0.06305394616598885, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4906271683673064, "eval_rewards/VisualizationJSONCombinedORM/std": 0.06305394822265953, "eval_runtime": 297.8963, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 3800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 291.1875, "completions/min_length": 191.0, "epoch": 3.1439205955334986, "grad_norm": 0.2392943799495697, "kl": 0.1884765625, "learning_rate": 3.6427977506746293e-06, "loss": 0.0018893256783485413, "memory(GiB)": 38.04, "reward": 0.36504289507865906, "reward_std": 0.07192055135965347, "rewards/VisualizationJSONCombinedORM/mean": 0.36504289507865906, "rewards/VisualizationJSONCombinedORM/std": 0.08675254136323929, "step": 3801, "train_speed(iter/s)": 0.543669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 277.25, "completions/min_length": 245.0, "epoch": 3.144747725392887, "grad_norm": 0.20782561600208282, "kl": 0.1162109375, "learning_rate": 3.6400188941492e-06, "loss": 0.0011610761284828186, "memory(GiB)": 38.04, "reward": 0.37906667590141296, "reward_std": 0.06205248087644577, "rewards/VisualizationJSONCombinedORM/mean": 0.37906667590141296, "rewards/VisualizationJSONCombinedORM/std": 0.06549612432718277, "step": 3802, "train_speed(iter/s)": 0.541687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 289.0, "completions/min_length": 234.0, "epoch": 3.1455748552522746, "grad_norm": 0.16649751365184784, "kl": 0.04443359375, "learning_rate": 3.637240491183832e-06, "loss": 0.00044558942317962646, "memory(GiB)": 38.04, "reward": 0.6552492380142212, "reward_std": 0.06516920030117035, "rewards/VisualizationJSONCombinedORM/mean": 0.6552492380142212, "rewards/VisualizationJSONCombinedORM/std": 0.08643550425767899, "step": 3803, "train_speed(iter/s)": 0.53973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 265.0, "completions/min_length": 231.0, "epoch": 3.1464019851116625, "grad_norm": 0.16367363929748535, "kl": 0.083740234375, "learning_rate": 3.634462542705144e-06, "loss": 0.0008393712341785431, "memory(GiB)": 38.04, "reward": 0.5407541990280151, "reward_std": 0.06583720445632935, "rewards/VisualizationJSONCombinedORM/mean": 0.5407541990280151, "rewards/VisualizationJSONCombinedORM/std": 0.2147350013256073, "step": 3804, "train_speed(iter/s)": 0.537927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 248.375, "completions/min_length": 213.0, "epoch": 3.1472291149710503, "grad_norm": 0.17869892716407776, "kl": 0.1002197265625, "learning_rate": 3.6316850496395863e-06, "loss": 0.0010054241865873337, "memory(GiB)": 38.04, "reward": 0.4402740001678467, "reward_std": 0.06710869073867798, "rewards/VisualizationJSONCombinedORM/mean": 0.4402740001678467, "rewards/VisualizationJSONCombinedORM/std": 0.15823636949062347, "step": 3805, "train_speed(iter/s)": 0.536165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 280.4375, "completions/min_length": 233.0, "epoch": 3.1480562448304386, "grad_norm": 0.17391234636306763, "kl": 0.1190185546875, "learning_rate": 3.628908012913471e-06, "loss": 0.0011894665658473969, "memory(GiB)": 38.04, "reward": 0.6000410914421082, "reward_std": 0.07715633511543274, "rewards/VisualizationJSONCombinedORM/mean": 0.6000410914421082, "rewards/VisualizationJSONCombinedORM/std": 0.20787610113620758, "step": 3806, "train_speed(iter/s)": 0.534843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 291.3125, "completions/min_length": 254.0, "epoch": 3.1488833746898264, "grad_norm": 0.1638537347316742, "kl": 0.067138671875, "learning_rate": 3.6261314334529508e-06, "loss": 0.0006702393293380737, "memory(GiB)": 38.04, "reward": 0.5831048488616943, "reward_std": 0.032633647322654724, "rewards/VisualizationJSONCombinedORM/mean": 0.5831048488616943, "rewards/VisualizationJSONCombinedORM/std": 0.2735528349876404, "step": 3807, "train_speed(iter/s)": 0.53289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 268.1875, "completions/min_length": 216.0, "epoch": 3.149710504549214, "grad_norm": 0.20417572557926178, "kl": 0.06304931640625, "learning_rate": 3.62335531218403e-06, "loss": 0.000629934947937727, "memory(GiB)": 38.04, "reward": 0.5994335412979126, "reward_std": 0.051697492599487305, "rewards/VisualizationJSONCombinedORM/mean": 0.5994335412979126, "rewards/VisualizationJSONCombinedORM/std": 0.1749952882528305, "step": 3808, "train_speed(iter/s)": 0.530877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 290.0, "completions/min_length": 237.0, "epoch": 3.150537634408602, "grad_norm": 0.1683516949415207, "kl": 0.05194091796875, "learning_rate": 3.620579650032555e-06, "loss": 0.0005180295556783676, "memory(GiB)": 38.04, "reward": 0.3428310751914978, "reward_std": 0.028750132769346237, "rewards/VisualizationJSONCombinedORM/mean": 0.3428310751914978, "rewards/VisualizationJSONCombinedORM/std": 0.08483117818832397, "step": 3809, "train_speed(iter/s)": 0.529239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 269.5625, "completions/min_length": 205.0, "epoch": 3.1513647642679903, "grad_norm": 0.17551682889461517, "kl": 0.036468505859375, "learning_rate": 3.6178044479242256e-06, "loss": 0.0003646835684776306, "memory(GiB)": 38.04, "reward": 0.4719008803367615, "reward_std": 0.05911166965961456, "rewards/VisualizationJSONCombinedORM/mean": 0.4719008803367615, "rewards/VisualizationJSONCombinedORM/std": 0.11964093148708344, "step": 3810, "train_speed(iter/s)": 0.527366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 272.8125, "completions/min_length": 216.0, "epoch": 3.152191894127378, "grad_norm": 0.19011394679546356, "kl": 0.0911865234375, "learning_rate": 3.61502970678458e-06, "loss": 0.0009109508246183395, "memory(GiB)": 38.04, "reward": 0.6323325634002686, "reward_std": 0.07657486200332642, "rewards/VisualizationJSONCombinedORM/mean": 0.6323325634002686, "rewards/VisualizationJSONCombinedORM/std": 0.07464369386434555, "step": 3811, "train_speed(iter/s)": 0.525666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 274.25, "completions/min_length": 234.0, "epoch": 3.153019023986766, "grad_norm": 0.16652518510818481, "kl": 0.06939697265625, "learning_rate": 3.6122554275390136e-06, "loss": 0.0006931684911251068, "memory(GiB)": 38.04, "reward": 0.5951818227767944, "reward_std": 0.02987869828939438, "rewards/VisualizationJSONCombinedORM/mean": 0.5951818227767944, "rewards/VisualizationJSONCombinedORM/std": 0.25513365864753723, "step": 3812, "train_speed(iter/s)": 0.523977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 257.3125, "completions/min_length": 220.0, "epoch": 3.1538461538461537, "grad_norm": 0.20953607559204102, "kl": 0.07183837890625, "learning_rate": 3.609481611112755e-06, "loss": 0.0007195323705673218, "memory(GiB)": 38.04, "reward": 0.39515072107315063, "reward_std": 0.05086224153637886, "rewards/VisualizationJSONCombinedORM/mean": 0.39515072107315063, "rewards/VisualizationJSONCombinedORM/std": 0.0530364066362381, "step": 3813, "train_speed(iter/s)": 0.522633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 288.375, "completions/min_length": 226.0, "epoch": 3.1546732837055416, "grad_norm": 0.22913414239883423, "kl": 0.110107421875, "learning_rate": 3.6067082584308897e-06, "loss": 0.0010983925312757492, "memory(GiB)": 38.04, "reward": 0.3736076354980469, "reward_std": 0.04697500914335251, "rewards/VisualizationJSONCombinedORM/mean": 0.3736076354980469, "rewards/VisualizationJSONCombinedORM/std": 0.04769871011376381, "step": 3814, "train_speed(iter/s)": 0.520854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 274.625, "completions/min_length": 241.0, "epoch": 3.15550041356493, "grad_norm": 0.20121212303638458, "kl": 0.0799560546875, "learning_rate": 3.603935370418342e-06, "loss": 0.000798950670287013, "memory(GiB)": 38.04, "reward": 0.4635702073574066, "reward_std": 0.07280935347080231, "rewards/VisualizationJSONCombinedORM/mean": 0.4635702073574066, "rewards/VisualizationJSONCombinedORM/std": 0.10458339005708694, "step": 3815, "train_speed(iter/s)": 0.519341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 280.5, "completions/min_length": 221.0, "epoch": 3.1563275434243176, "grad_norm": 0.16483652591705322, "kl": 0.1024169921875, "learning_rate": 3.6011629479998837e-06, "loss": 0.0010254234075546265, "memory(GiB)": 38.04, "reward": 0.7353309392929077, "reward_std": 0.04989565163850784, "rewards/VisualizationJSONCombinedORM/mean": 0.7353309392929077, "rewards/VisualizationJSONCombinedORM/std": 0.05846633017063141, "step": 3816, "train_speed(iter/s)": 0.517412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 275.75, "completions/min_length": 212.0, "epoch": 3.1571546732837055, "grad_norm": 0.190158411860466, "kl": 0.166259765625, "learning_rate": 3.598390992100129e-06, "loss": 0.0016639754176139832, "memory(GiB)": 38.04, "reward": 0.6139290928840637, "reward_std": 0.07445968687534332, "rewards/VisualizationJSONCombinedORM/mean": 0.6139290928840637, "rewards/VisualizationJSONCombinedORM/std": 0.07269354164600372, "step": 3817, "train_speed(iter/s)": 0.515781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 297.375, "completions/min_length": 247.0, "epoch": 3.1579818031430933, "grad_norm": 0.20890425145626068, "kl": 0.095947265625, "learning_rate": 3.595619503643541e-06, "loss": 0.0009583160281181335, "memory(GiB)": 38.04, "reward": 0.5229008197784424, "reward_std": 0.05912094563245773, "rewards/VisualizationJSONCombinedORM/mean": 0.5229008197784424, "rewards/VisualizationJSONCombinedORM/std": 0.2515571415424347, "step": 3818, "train_speed(iter/s)": 0.51418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 267.375, "completions/min_length": 205.0, "epoch": 3.1588089330024816, "grad_norm": 0.186253160238266, "kl": 0.0635986328125, "learning_rate": 3.59284848355442e-06, "loss": 0.0006354600191116333, "memory(GiB)": 38.04, "reward": 0.49004247784614563, "reward_std": 0.051582083106040955, "rewards/VisualizationJSONCombinedORM/mean": 0.49004247784614563, "rewards/VisualizationJSONCombinedORM/std": 0.10087873041629791, "step": 3819, "train_speed(iter/s)": 0.512787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 303.5, "completions/min_length": 226.0, "epoch": 3.1596360628618694, "grad_norm": 0.23254092037677765, "kl": 0.0477294921875, "learning_rate": 3.59007793275692e-06, "loss": 0.00047675520181655884, "memory(GiB)": 38.04, "reward": 0.6391397714614868, "reward_std": 0.06754090636968613, "rewards/VisualizationJSONCombinedORM/mean": 0.6391397714614868, "rewards/VisualizationJSONCombinedORM/std": 0.11451607197523117, "step": 3820, "train_speed(iter/s)": 0.5111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 274.375, "completions/min_length": 207.0, "epoch": 3.160463192721257, "grad_norm": 0.18669751286506653, "kl": 0.04901123046875, "learning_rate": 3.587307852175025e-06, "loss": 0.0004900321364402771, "memory(GiB)": 38.04, "reward": 0.7150412797927856, "reward_std": 0.05517761409282684, "rewards/VisualizationJSONCombinedORM/mean": 0.7150412797927856, "rewards/VisualizationJSONCombinedORM/std": 0.060052137821912766, "step": 3821, "train_speed(iter/s)": 0.50957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 300.875, "completions/min_length": 231.0, "epoch": 3.161290322580645, "grad_norm": 0.19336646795272827, "kl": 0.0888671875, "learning_rate": 3.584538242732577e-06, "loss": 0.0008888617157936096, "memory(GiB)": 38.04, "reward": 0.5820775032043457, "reward_std": 0.050240516662597656, "rewards/VisualizationJSONCombinedORM/mean": 0.5820775032043457, "rewards/VisualizationJSONCombinedORM/std": 0.20367097854614258, "step": 3822, "train_speed(iter/s)": 0.508039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 269.0625, "completions/min_length": 227.0, "epoch": 3.1621174524400333, "grad_norm": 0.20087569952011108, "kl": 0.037841796875, "learning_rate": 3.58176910535325e-06, "loss": 0.00037893280386924744, "memory(GiB)": 38.04, "reward": 0.5080586075782776, "reward_std": 0.10341120511293411, "rewards/VisualizationJSONCombinedORM/mean": 0.5080586075782776, "rewards/VisualizationJSONCombinedORM/std": 0.11630459874868393, "step": 3823, "train_speed(iter/s)": 0.506443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 279.25, "completions/min_length": 214.0, "epoch": 3.162944582299421, "grad_norm": 0.20275090634822845, "kl": 0.127197265625, "learning_rate": 3.5790004409605657e-06, "loss": 0.0012731216847896576, "memory(GiB)": 38.04, "reward": 0.4102393388748169, "reward_std": 0.03491292893886566, "rewards/VisualizationJSONCombinedORM/mean": 0.4102393388748169, "rewards/VisualizationJSONCombinedORM/std": 0.03726915270090103, "step": 3824, "train_speed(iter/s)": 0.504864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 282.75, "completions/min_length": 241.0, "epoch": 3.163771712158809, "grad_norm": 0.2010219693183899, "kl": 0.06451416015625, "learning_rate": 3.5762322504778846e-06, "loss": 0.0006458088755607605, "memory(GiB)": 38.04, "reward": 0.5645256042480469, "reward_std": 0.07749168574810028, "rewards/VisualizationJSONCombinedORM/mean": 0.5645256042480469, "rewards/VisualizationJSONCombinedORM/std": 0.07574817538261414, "step": 3825, "train_speed(iter/s)": 0.503303 }, { "epoch": 3.163771712158809, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 339.9583333333333, "eval_completions/mean_length": 284.3020833333333, "eval_completions/min_length": 239.58333333333334, "eval_kl": 0.09041341145833333, "eval_loss": 0.0009153746068477631, "eval_reward": 0.48105370874206227, "eval_reward_std": 0.0573711635855337, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.48105370874206227, "eval_rewards/VisualizationJSONCombinedORM/std": 0.057371163818364344, "eval_runtime": 297.5895, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 3825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 260.875, "completions/min_length": 201.0, "epoch": 3.1645988420181967, "grad_norm": 0.2816896140575409, "kl": 0.0792236328125, "learning_rate": 3.573464534828414e-06, "loss": 0.0007910877466201782, "memory(GiB)": 38.04, "reward": 0.6948882937431335, "reward_std": 0.06608626991510391, "rewards/VisualizationJSONCombinedORM/mean": 0.6948882937431335, "rewards/VisualizationJSONCombinedORM/std": 0.14976854622364044, "step": 3826, "train_speed(iter/s)": 0.48304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 294.0, "completions/min_length": 242.0, "epoch": 3.1654259718775846, "grad_norm": 0.1917026787996292, "kl": 0.108154296875, "learning_rate": 3.5706972949351965e-06, "loss": 0.001084985677152872, "memory(GiB)": 38.04, "reward": 0.4153928756713867, "reward_std": 0.0369497574865818, "rewards/VisualizationJSONCombinedORM/mean": 0.4153928756713867, "rewards/VisualizationJSONCombinedORM/std": 0.05747479200363159, "step": 3827, "train_speed(iter/s)": 0.481676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 240.8125, "completions/min_length": 216.0, "epoch": 3.166253101736973, "grad_norm": 0.2355727106332779, "kl": 0.06976318359375, "learning_rate": 3.567930531721125e-06, "loss": 0.0006974302232265472, "memory(GiB)": 38.04, "reward": 0.5858874320983887, "reward_std": 0.047118764370679855, "rewards/VisualizationJSONCombinedORM/mean": 0.5858874320983887, "rewards/VisualizationJSONCombinedORM/std": 0.17578794062137604, "step": 3828, "train_speed(iter/s)": 0.480468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 269.75, "completions/min_length": 209.0, "epoch": 3.1670802315963607, "grad_norm": 0.2899084985256195, "kl": 0.0927734375, "learning_rate": 3.5651642461089207e-06, "loss": 0.0009274743497371674, "memory(GiB)": 38.04, "reward": 0.5879879593849182, "reward_std": 0.08612831681966782, "rewards/VisualizationJSONCombinedORM/mean": 0.5879879593849182, "rewards/VisualizationJSONCombinedORM/std": 0.09085755795240402, "step": 3829, "train_speed(iter/s)": 0.479036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 311.1875, "completions/min_length": 238.0, "epoch": 3.1679073614557485, "grad_norm": 0.18466094136238098, "kl": 0.1192626953125, "learning_rate": 3.5623984390211597e-06, "loss": 0.0011951811611652374, "memory(GiB)": 38.04, "reward": 0.633686900138855, "reward_std": 0.06487025320529938, "rewards/VisualizationJSONCombinedORM/mean": 0.633686900138855, "rewards/VisualizationJSONCombinedORM/std": 0.14763545989990234, "step": 3830, "train_speed(iter/s)": 0.477618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 283.3125, "completions/min_length": 227.0, "epoch": 3.1687344913151363, "grad_norm": 0.19981402158737183, "kl": 0.0689697265625, "learning_rate": 3.559633111380247e-06, "loss": 0.0006885994225740433, "memory(GiB)": 38.04, "reward": 0.5979323387145996, "reward_std": 0.04547690600156784, "rewards/VisualizationJSONCombinedORM/mean": 0.5979323387145996, "rewards/VisualizationJSONCombinedORM/std": 0.16081808507442474, "step": 3831, "train_speed(iter/s)": 0.476222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 262.6875, "completions/min_length": 219.0, "epoch": 3.1695616211745246, "grad_norm": 0.2326311469078064, "kl": 0.0760498046875, "learning_rate": 3.556868264108436e-06, "loss": 0.0007606782019138336, "memory(GiB)": 38.04, "reward": 0.421353280544281, "reward_std": 0.054102905094623566, "rewards/VisualizationJSONCombinedORM/mean": 0.421353280544281, "rewards/VisualizationJSONCombinedORM/std": 0.06773450970649719, "step": 3832, "train_speed(iter/s)": 0.474599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 293.5625, "completions/min_length": 246.0, "epoch": 3.1703887510339124, "grad_norm": 0.1932854950428009, "kl": 0.072021484375, "learning_rate": 3.5541038981278126e-06, "loss": 0.000720784068107605, "memory(GiB)": 38.04, "reward": 0.42776596546173096, "reward_std": 0.05082811042666435, "rewards/VisualizationJSONCombinedORM/mean": 0.42776596546173096, "rewards/VisualizationJSONCombinedORM/std": 0.057856347411870956, "step": 3833, "train_speed(iter/s)": 0.473471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 289.625, "completions/min_length": 231.0, "epoch": 3.1712158808933, "grad_norm": 0.19591714441776276, "kl": 0.0882568359375, "learning_rate": 3.551340014360308e-06, "loss": 0.0008828556165099144, "memory(GiB)": 38.04, "reward": 0.3852844834327698, "reward_std": 0.04866240546107292, "rewards/VisualizationJSONCombinedORM/mean": 0.3852844834327698, "rewards/VisualizationJSONCombinedORM/std": 0.08980713039636612, "step": 3834, "train_speed(iter/s)": 0.472377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 267.9375, "completions/min_length": 201.0, "epoch": 3.172043010752688, "grad_norm": 0.16536098718643188, "kl": 0.06011962890625, "learning_rate": 3.5485766137276894e-06, "loss": 0.0006018839776515961, "memory(GiB)": 38.04, "reward": 0.49322831630706787, "reward_std": 0.05902906879782677, "rewards/VisualizationJSONCombinedORM/mean": 0.49322831630706787, "rewards/VisualizationJSONCombinedORM/std": 0.31750062108039856, "step": 3835, "train_speed(iter/s)": 0.471105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 268.6875, "completions/min_length": 219.0, "epoch": 3.1728701406120763, "grad_norm": 0.22102554142475128, "kl": 0.0556640625, "learning_rate": 3.5458136971515626e-06, "loss": 0.0005569048225879669, "memory(GiB)": 38.04, "reward": 0.3881637454032898, "reward_std": 0.047981392592191696, "rewards/VisualizationJSONCombinedORM/mean": 0.3881637454032898, "rewards/VisualizationJSONCombinedORM/std": 0.052195511758327484, "step": 3836, "train_speed(iter/s)": 0.469786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 262.3125, "completions/min_length": 236.0, "epoch": 3.173697270471464, "grad_norm": 0.25336068868637085, "kl": 0.07586669921875, "learning_rate": 3.5430512655533774e-06, "loss": 0.0007585175335407257, "memory(GiB)": 38.04, "reward": 0.5825207829475403, "reward_std": 0.07849615812301636, "rewards/VisualizationJSONCombinedORM/mean": 0.5825207829475403, "rewards/VisualizationJSONCombinedORM/std": 0.13138066232204437, "step": 3837, "train_speed(iter/s)": 0.468429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 287.5625, "completions/min_length": 233.0, "epoch": 3.174524400330852, "grad_norm": 0.16066890954971313, "kl": 0.07440185546875, "learning_rate": 3.5402893198544137e-06, "loss": 0.000745970755815506, "memory(GiB)": 38.04, "reward": 0.47245728969573975, "reward_std": 0.04040009155869484, "rewards/VisualizationJSONCombinedORM/mean": 0.47245728969573975, "rewards/VisualizationJSONCombinedORM/std": 0.1072150245308876, "step": 3838, "train_speed(iter/s)": 0.467026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 264.75, "completions/min_length": 227.0, "epoch": 3.1753515301902397, "grad_norm": 0.2119133621454239, "kl": 0.119384765625, "learning_rate": 3.5375278609757958e-06, "loss": 0.0011914819478988647, "memory(GiB)": 38.04, "reward": 0.6955567002296448, "reward_std": 0.12515495717525482, "rewards/VisualizationJSONCombinedORM/mean": 0.6955567002296448, "rewards/VisualizationJSONCombinedORM/std": 0.12152216583490372, "step": 3839, "train_speed(iter/s)": 0.465926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 268.875, "completions/min_length": 221.0, "epoch": 3.1761786600496276, "grad_norm": 0.1918838620185852, "kl": 0.04815673828125, "learning_rate": 3.5347668898384805e-06, "loss": 0.00048283860087394714, "memory(GiB)": 38.04, "reward": 0.5649232864379883, "reward_std": 0.08617211878299713, "rewards/VisualizationJSONCombinedORM/mean": 0.5649232864379883, "rewards/VisualizationJSONCombinedORM/std": 0.10581807047128677, "step": 3840, "train_speed(iter/s)": 0.464287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 297.1875, "completions/min_length": 247.0, "epoch": 3.177005789909016, "grad_norm": 0.2963334023952484, "kl": 0.0948486328125, "learning_rate": 3.5320064073632677e-06, "loss": 0.0009507313370704651, "memory(GiB)": 38.04, "reward": 0.6422560214996338, "reward_std": 0.06837029755115509, "rewards/VisualizationJSONCombinedORM/mean": 0.6422560214996338, "rewards/VisualizationJSONCombinedORM/std": 0.15466825664043427, "step": 3841, "train_speed(iter/s)": 0.463281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 282.25, "completions/min_length": 229.0, "epoch": 3.1778329197684037, "grad_norm": 0.18830029666423798, "kl": 0.06744384765625, "learning_rate": 3.5292464144707877e-06, "loss": 0.0006739422678947449, "memory(GiB)": 38.04, "reward": 0.33536866307258606, "reward_std": 0.03530503064393997, "rewards/VisualizationJSONCombinedORM/mean": 0.33536866307258606, "rewards/VisualizationJSONCombinedORM/std": 0.08062558621168137, "step": 3842, "train_speed(iter/s)": 0.461829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 300.5, "completions/min_length": 223.0, "epoch": 3.1786600496277915, "grad_norm": 0.17399382591247559, "kl": 0.056884765625, "learning_rate": 3.526486912081516e-06, "loss": 0.000569663941860199, "memory(GiB)": 38.04, "reward": 0.5267460346221924, "reward_std": 0.0919417068362236, "rewards/VisualizationJSONCombinedORM/mean": 0.5267460346221924, "rewards/VisualizationJSONCombinedORM/std": 0.0902881771326065, "step": 3843, "train_speed(iter/s)": 0.460094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 292.625, "completions/min_length": 211.0, "epoch": 3.1794871794871793, "grad_norm": 0.16222991049289703, "kl": 0.0521240234375, "learning_rate": 3.523727901115753e-06, "loss": 0.0005221068859100342, "memory(GiB)": 38.04, "reward": 0.5743781924247742, "reward_std": 0.06539803743362427, "rewards/VisualizationJSONCombinedORM/mean": 0.5743781924247742, "rewards/VisualizationJSONCombinedORM/std": 0.13367381691932678, "step": 3844, "train_speed(iter/s)": 0.458871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 267.125, "completions/min_length": 213.0, "epoch": 3.1803143093465676, "grad_norm": 0.1855793595314026, "kl": 0.040283203125, "learning_rate": 3.5209693824936486e-06, "loss": 0.00040386244654655457, "memory(GiB)": 38.04, "reward": 0.5667942762374878, "reward_std": 0.03933820500969887, "rewards/VisualizationJSONCombinedORM/mean": 0.5667942762374878, "rewards/VisualizationJSONCombinedORM/std": 0.09655173867940903, "step": 3845, "train_speed(iter/s)": 0.457647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 285.0625, "completions/min_length": 235.0, "epoch": 3.1811414392059554, "grad_norm": 0.27703049778938293, "kl": 0.0989990234375, "learning_rate": 3.5182113571351763e-06, "loss": 0.0009895488619804382, "memory(GiB)": 38.04, "reward": 0.4681202173233032, "reward_std": 0.08703558146953583, "rewards/VisualizationJSONCombinedORM/mean": 0.4681202173233032, "rewards/VisualizationJSONCombinedORM/std": 0.15474584698677063, "step": 3846, "train_speed(iter/s)": 0.456416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 274.6875, "completions/min_length": 217.0, "epoch": 3.181968569065343, "grad_norm": 0.16777963936328888, "kl": 0.03167724609375, "learning_rate": 3.5154538259601544e-06, "loss": 0.00031532347202301025, "memory(GiB)": 38.04, "reward": 0.758212685585022, "reward_std": 0.02945217862725258, "rewards/VisualizationJSONCombinedORM/mean": 0.758212685585022, "rewards/VisualizationJSONCombinedORM/std": 0.04168045520782471, "step": 3847, "train_speed(iter/s)": 0.455103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 266.75, "completions/min_length": 213.0, "epoch": 3.182795698924731, "grad_norm": 0.1847844123840332, "kl": 0.038818359375, "learning_rate": 3.512696789888229e-06, "loss": 0.0003875121474266052, "memory(GiB)": 38.04, "reward": 0.43547698855400085, "reward_std": 0.03376257047057152, "rewards/VisualizationJSONCombinedORM/mean": 0.43547698855400085, "rewards/VisualizationJSONCombinedORM/std": 0.06393671780824661, "step": 3848, "train_speed(iter/s)": 0.453599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 274.6875, "completions/min_length": 224.0, "epoch": 3.1836228287841193, "grad_norm": 0.20268848538398743, "kl": 0.0908203125, "learning_rate": 3.5099402498388877e-06, "loss": 0.0009085014462471008, "memory(GiB)": 38.04, "reward": 0.5187842845916748, "reward_std": 0.07643510401248932, "rewards/VisualizationJSONCombinedORM/mean": 0.5187842845916748, "rewards/VisualizationJSONCombinedORM/std": 0.14373232424259186, "step": 3849, "train_speed(iter/s)": 0.452398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 276.1875, "completions/min_length": 240.0, "epoch": 3.184449958643507, "grad_norm": 0.2475263774394989, "kl": 0.0885009765625, "learning_rate": 3.5071842067314453e-06, "loss": 0.0008849110454320908, "memory(GiB)": 38.04, "reward": 0.4603883624076843, "reward_std": 0.07598012685775757, "rewards/VisualizationJSONCombinedORM/mean": 0.4603883624076843, "rewards/VisualizationJSONCombinedORM/std": 0.2707795798778534, "step": 3850, "train_speed(iter/s)": 0.451224 }, { "epoch": 3.184449958643507, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 329.9166666666667, "eval_completions/mean_length": 279.5, "eval_completions/min_length": 235.58333333333334, "eval_kl": 0.07190958658854167, "eval_loss": 0.0007255297969095409, "eval_reward": 0.437617842728893, "eval_reward_std": 0.048967672240299485, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.437617842728893, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04896767272536332, "eval_runtime": 289.5323, "eval_samples_per_second": 0.083, "eval_steps_per_second": 0.01, "step": 3850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 283.6875, "completions/min_length": 244.0, "epoch": 3.185277088502895, "grad_norm": 0.17658497393131256, "kl": 0.05206298828125, "learning_rate": 3.504428661485062e-06, "loss": 0.0005209706723690033, "memory(GiB)": 38.04, "reward": 0.46025753021240234, "reward_std": 0.043738704174757004, "rewards/VisualizationJSONCombinedORM/mean": 0.46025753021240234, "rewards/VisualizationJSONCombinedORM/std": 0.09183065593242645, "step": 3851, "train_speed(iter/s)": 0.435206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 250.375, "completions/min_length": 218.0, "epoch": 3.1861042183622827, "grad_norm": 0.16828420758247375, "kl": 0.03411865234375, "learning_rate": 3.501673615018717e-06, "loss": 0.0003403499722480774, "memory(GiB)": 38.04, "reward": 0.692747950553894, "reward_std": 0.07381458580493927, "rewards/VisualizationJSONCombinedORM/mean": 0.692747950553894, "rewards/VisualizationJSONCombinedORM/std": 0.14340637624263763, "step": 3852, "train_speed(iter/s)": 0.434315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 302.625, "completions/min_length": 238.0, "epoch": 3.1869313482216706, "grad_norm": 0.19069820642471313, "kl": 0.0301513671875, "learning_rate": 3.498919068251237e-06, "loss": 0.0003018788993358612, "memory(GiB)": 38.04, "reward": 0.38667595386505127, "reward_std": 0.02421395294368267, "rewards/VisualizationJSONCombinedORM/mean": 0.38667595386505127, "rewards/VisualizationJSONCombinedORM/std": 0.041075099259614944, "step": 3853, "train_speed(iter/s)": 0.43295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 276.375, "completions/min_length": 215.0, "epoch": 3.187758478081059, "grad_norm": 0.22279128432273865, "kl": 0.06927490234375, "learning_rate": 3.4961650221012734e-06, "loss": 0.0006924644112586975, "memory(GiB)": 38.04, "reward": 0.5246273279190063, "reward_std": 0.048861172050237656, "rewards/VisualizationJSONCombinedORM/mean": 0.5246273279190063, "rewards/VisualizationJSONCombinedORM/std": 0.2651064097881317, "step": 3854, "train_speed(iter/s)": 0.431767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 278.625, "completions/min_length": 225.0, "epoch": 3.1885856079404467, "grad_norm": 0.1606416553258896, "kl": 0.03936767578125, "learning_rate": 3.4934114774873153e-06, "loss": 0.0003939680755138397, "memory(GiB)": 38.04, "reward": 0.7295806407928467, "reward_std": 0.024987783282995224, "rewards/VisualizationJSONCombinedORM/mean": 0.7295806407928467, "rewards/VisualizationJSONCombinedORM/std": 0.08889268338680267, "step": 3855, "train_speed(iter/s)": 0.430469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 288.0, "completions/min_length": 220.0, "epoch": 3.1894127377998345, "grad_norm": 0.1663598120212555, "kl": 0.06170654296875, "learning_rate": 3.4906584353276806e-06, "loss": 0.0006172545254230499, "memory(GiB)": 38.04, "reward": 0.6652592420578003, "reward_std": 0.07155944406986237, "rewards/VisualizationJSONCombinedORM/mean": 0.6652592420578003, "rewards/VisualizationJSONCombinedORM/std": 0.17020496726036072, "step": 3856, "train_speed(iter/s)": 0.42931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 256.125, "completions/min_length": 220.0, "epoch": 3.1902398676592223, "grad_norm": 0.14992211759090424, "kl": 0.1356201171875, "learning_rate": 3.4879058965405242e-06, "loss": 0.0013569965958595276, "memory(GiB)": 38.04, "reward": 0.3303791284561157, "reward_std": 0.038201190531253815, "rewards/VisualizationJSONCombinedORM/mean": 0.3303791284561157, "rewards/VisualizationJSONCombinedORM/std": 0.03845744952559471, "step": 3857, "train_speed(iter/s)": 0.428337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 281.0, "completions/min_length": 224.0, "epoch": 3.1910669975186106, "grad_norm": 0.17232103645801544, "kl": 0.04278564453125, "learning_rate": 3.4851538620438274e-06, "loss": 0.0004280880093574524, "memory(GiB)": 38.04, "reward": 0.4906931221485138, "reward_std": 0.06886221468448639, "rewards/VisualizationJSONCombinedORM/mean": 0.4906931221485138, "rewards/VisualizationJSONCombinedORM/std": 0.24339668452739716, "step": 3858, "train_speed(iter/s)": 0.427199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 297.6875, "completions/min_length": 244.0, "epoch": 3.1918941273779984, "grad_norm": 0.27326276898384094, "kl": 0.10821533203125, "learning_rate": 3.4824023327554114e-06, "loss": 0.0010859370231628418, "memory(GiB)": 38.09, "reward": 0.611451268196106, "reward_std": 0.10161539912223816, "rewards/VisualizationJSONCombinedORM/mean": 0.611451268196106, "rewards/VisualizationJSONCombinedORM/std": 0.09936785697937012, "step": 3859, "train_speed(iter/s)": 0.425813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 285.5625, "completions/min_length": 243.0, "epoch": 3.192721257237386, "grad_norm": 0.24811692535877228, "kl": 0.0447998046875, "learning_rate": 3.4796513095929178e-06, "loss": 0.0004478767514228821, "memory(GiB)": 38.09, "reward": 0.5374768972396851, "reward_std": 0.0734933465719223, "rewards/VisualizationJSONCombinedORM/mean": 0.5374768972396851, "rewards/VisualizationJSONCombinedORM/std": 0.22484657168388367, "step": 3860, "train_speed(iter/s)": 0.424769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 278.25, "completions/min_length": 218.0, "epoch": 3.193548387096774, "grad_norm": 0.16604191064834595, "kl": 0.05596923828125, "learning_rate": 3.476900793473832e-06, "loss": 0.0005601490847766399, "memory(GiB)": 38.09, "reward": 0.5625568628311157, "reward_std": 0.04967442899942398, "rewards/VisualizationJSONCombinedORM/mean": 0.5625568628311157, "rewards/VisualizationJSONCombinedORM/std": 0.15886926651000977, "step": 3861, "train_speed(iter/s)": 0.423548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 302.5, "completions/min_length": 250.0, "epoch": 3.1943755169561623, "grad_norm": 0.22439393401145935, "kl": 0.054443359375, "learning_rate": 3.47415078531546e-06, "loss": 0.0005441997200250626, "memory(GiB)": 38.09, "reward": 0.4707184433937073, "reward_std": 0.05805092677474022, "rewards/VisualizationJSONCombinedORM/mean": 0.4707184433937073, "rewards/VisualizationJSONCombinedORM/std": 0.06075066700577736, "step": 3862, "train_speed(iter/s)": 0.422577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 281.5, "completions/min_length": 238.0, "epoch": 3.19520264681555, "grad_norm": 0.16034089028835297, "kl": 0.033538818359375, "learning_rate": 3.4714012860349445e-06, "loss": 0.00033493898808956146, "memory(GiB)": 38.09, "reward": 0.5160409808158875, "reward_std": 0.04201855883002281, "rewards/VisualizationJSONCombinedORM/mean": 0.5160409808158875, "rewards/VisualizationJSONCombinedORM/std": 0.28908491134643555, "step": 3863, "train_speed(iter/s)": 0.421511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 267.5, "completions/min_length": 213.0, "epoch": 3.196029776674938, "grad_norm": 0.2233075499534607, "kl": 0.1302490234375, "learning_rate": 3.4686522965492544e-06, "loss": 0.0013020560145378113, "memory(GiB)": 38.09, "reward": 0.34921935200691223, "reward_std": 0.03618454188108444, "rewards/VisualizationJSONCombinedORM/mean": 0.34921935200691223, "rewards/VisualizationJSONCombinedORM/std": 0.057938892394304276, "step": 3864, "train_speed(iter/s)": 0.420452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 261.6875, "completions/min_length": 212.0, "epoch": 3.1968569065343257, "grad_norm": 0.24383041262626648, "kl": 0.05322265625, "learning_rate": 3.4659038177751918e-06, "loss": 0.0005314722657203674, "memory(GiB)": 38.09, "reward": 0.7510879039764404, "reward_std": 0.07462666928768158, "rewards/VisualizationJSONCombinedORM/mean": 0.7510879039764404, "rewards/VisualizationJSONCombinedORM/std": 0.07549804449081421, "step": 3865, "train_speed(iter/s)": 0.41949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 291.75, "completions/min_length": 188.0, "epoch": 3.197684036393714, "grad_norm": 0.2113429754972458, "kl": 0.04449462890625, "learning_rate": 3.463155850629386e-06, "loss": 0.00044386833906173706, "memory(GiB)": 38.09, "reward": 0.51993727684021, "reward_std": 0.12547358870506287, "rewards/VisualizationJSONCombinedORM/mean": 0.51993727684021, "rewards/VisualizationJSONCombinedORM/std": 0.19589364528656006, "step": 3866, "train_speed(iter/s)": 0.418376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 295.8125, "completions/min_length": 239.0, "epoch": 3.198511166253102, "grad_norm": 0.18826113641262054, "kl": 0.090087890625, "learning_rate": 3.460408396028301e-06, "loss": 0.0008988510817289352, "memory(GiB)": 38.09, "reward": 0.535198986530304, "reward_std": 0.052366603165864944, "rewards/VisualizationJSONCombinedORM/mean": 0.535198986530304, "rewards/VisualizationJSONCombinedORM/std": 0.12966856360435486, "step": 3867, "train_speed(iter/s)": 0.417345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 275.1875, "completions/min_length": 247.0, "epoch": 3.1993382961124897, "grad_norm": 0.19428537786006927, "kl": 0.041290283203125, "learning_rate": 3.4576614548882187e-06, "loss": 0.00041353702545166016, "memory(GiB)": 38.09, "reward": 0.39961642026901245, "reward_std": 0.04145099222660065, "rewards/VisualizationJSONCombinedORM/mean": 0.39961642026901245, "rewards/VisualizationJSONCombinedORM/std": 0.14065641164779663, "step": 3868, "train_speed(iter/s)": 0.416299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 256.6875, "completions/min_length": 189.0, "epoch": 3.2001654259718775, "grad_norm": 0.19633129239082336, "kl": 0.03924560546875, "learning_rate": 3.4549150281252635e-06, "loss": 0.00039207562804222107, "memory(GiB)": 38.09, "reward": 0.6960625648498535, "reward_std": 0.06283605843782425, "rewards/VisualizationJSONCombinedORM/mean": 0.6960625648498535, "rewards/VisualizationJSONCombinedORM/std": 0.06344300508499146, "step": 3869, "train_speed(iter/s)": 0.415456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 281.125, "completions/min_length": 218.0, "epoch": 3.2009925558312657, "grad_norm": 0.1935979574918747, "kl": 0.04071044921875, "learning_rate": 3.4521691166553777e-06, "loss": 0.000406801700592041, "memory(GiB)": 38.09, "reward": 0.6786290407180786, "reward_std": 0.051822468638420105, "rewards/VisualizationJSONCombinedORM/mean": 0.6786290407180786, "rewards/VisualizationJSONCombinedORM/std": 0.1959783136844635, "step": 3870, "train_speed(iter/s)": 0.414356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 258.1875, "completions/min_length": 216.0, "epoch": 3.2018196856906536, "grad_norm": 0.2574310302734375, "kl": 0.131103515625, "learning_rate": 3.4494237213943382e-06, "loss": 0.001313634216785431, "memory(GiB)": 38.09, "reward": 0.4810100793838501, "reward_std": 0.04159865528345108, "rewards/VisualizationJSONCombinedORM/mean": 0.4810100793838501, "rewards/VisualizationJSONCombinedORM/std": 0.31511130928993225, "step": 3871, "train_speed(iter/s)": 0.413289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 295.6875, "completions/min_length": 244.0, "epoch": 3.2026468155500414, "grad_norm": 0.19479705393314362, "kl": 0.0601806640625, "learning_rate": 3.446678843257745e-06, "loss": 0.0006010960787534714, "memory(GiB)": 38.09, "reward": 0.6076359748840332, "reward_std": 0.06015067920088768, "rewards/VisualizationJSONCombinedORM/mean": 0.6076359748840332, "rewards/VisualizationJSONCombinedORM/std": 0.19734394550323486, "step": 3872, "train_speed(iter/s)": 0.412051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 252.9375, "completions/min_length": 229.0, "epoch": 3.203473945409429, "grad_norm": 0.13921472430229187, "kl": 0.025177001953125, "learning_rate": 3.443934483161029e-06, "loss": 0.0002512335777282715, "memory(GiB)": 38.09, "reward": 0.5842815041542053, "reward_std": 0.05839543789625168, "rewards/VisualizationJSONCombinedORM/mean": 0.5842815041542053, "rewards/VisualizationJSONCombinedORM/std": 0.15329110622406006, "step": 3873, "train_speed(iter/s)": 0.411322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 288.375, "completions/min_length": 227.0, "epoch": 3.204301075268817, "grad_norm": 0.19058136641979218, "kl": 0.0606689453125, "learning_rate": 3.4411906420194452e-06, "loss": 0.0006078099831938744, "memory(GiB)": 38.09, "reward": 0.39938783645629883, "reward_std": 0.03741952031850815, "rewards/VisualizationJSONCombinedORM/mean": 0.39938783645629883, "rewards/VisualizationJSONCombinedORM/std": 0.08237389475107193, "step": 3874, "train_speed(iter/s)": 0.410403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 281.4375, "completions/min_length": 212.0, "epoch": 3.2051282051282053, "grad_norm": 0.20044541358947754, "kl": 0.078125, "learning_rate": 3.438447320748082e-06, "loss": 0.0007814094424247742, "memory(GiB)": 38.09, "reward": 0.7799001932144165, "reward_std": 0.07334110885858536, "rewards/VisualizationJSONCombinedORM/mean": 0.7799001932144165, "rewards/VisualizationJSONCombinedORM/std": 0.0715806856751442, "step": 3875, "train_speed(iter/s)": 0.409581 }, { "epoch": 3.2051282051282053, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 333.4583333333333, "eval_completions/mean_length": 279.296875, "eval_completions/min_length": 235.625, "eval_kl": 0.06480916341145833, "eval_loss": 0.0006456797127611935, "eval_reward": 0.4214073146382968, "eval_reward_std": 0.04716845751439299, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4214073146382968, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04716846061880157, "eval_runtime": 291.6011, "eval_samples_per_second": 0.082, "eval_steps_per_second": 0.01, "step": 3875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 267.5625, "completions/min_length": 230.0, "epoch": 3.205955334987593, "grad_norm": 0.19480302929878235, "kl": 0.13134765625, "learning_rate": 3.435704520261843e-06, "loss": 0.0013166926801204681, "memory(GiB)": 38.09, "reward": 0.569713294506073, "reward_std": 0.08972389250993729, "rewards/VisualizationJSONCombinedORM/mean": 0.569713294506073, "rewards/VisualizationJSONCombinedORM/std": 0.1725240796804428, "step": 3876, "train_speed(iter/s)": 0.396641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 279.1875, "completions/min_length": 225.0, "epoch": 3.206782464846981, "grad_norm": 0.20594584941864014, "kl": 0.0650634765625, "learning_rate": 3.4329622414754728e-06, "loss": 0.0006490927189588547, "memory(GiB)": 38.09, "reward": 0.641503095626831, "reward_std": 0.08749669790267944, "rewards/VisualizationJSONCombinedORM/mean": 0.641503095626831, "rewards/VisualizationJSONCombinedORM/std": 0.08910570293664932, "step": 3877, "train_speed(iter/s)": 0.395784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 268.0625, "completions/min_length": 238.0, "epoch": 3.2076095947063687, "grad_norm": 0.17212295532226562, "kl": 0.085693359375, "learning_rate": 3.4302204853035278e-06, "loss": 0.000859186053276062, "memory(GiB)": 38.09, "reward": 0.5273275375366211, "reward_std": 0.07237230241298676, "rewards/VisualizationJSONCombinedORM/mean": 0.5273275375366211, "rewards/VisualizationJSONCombinedORM/std": 0.0756276547908783, "step": 3878, "train_speed(iter/s)": 0.394813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 274.5625, "completions/min_length": 219.0, "epoch": 3.208436724565757, "grad_norm": 0.2157818228006363, "kl": 0.034210205078125, "learning_rate": 3.4274792526604006e-06, "loss": 0.00034149736166000366, "memory(GiB)": 38.09, "reward": 0.27855658531188965, "reward_std": 0.022999826818704605, "rewards/VisualizationJSONCombinedORM/mean": 0.27855658531188965, "rewards/VisualizationJSONCombinedORM/std": 0.02280455082654953, "step": 3879, "train_speed(iter/s)": 0.39372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 264.3125, "completions/min_length": 205.0, "epoch": 3.209263854425145, "grad_norm": 0.21801240742206573, "kl": 0.05450439453125, "learning_rate": 3.424738544460302e-06, "loss": 0.0005438588559627533, "memory(GiB)": 38.09, "reward": 0.539077877998352, "reward_std": 0.061372801661491394, "rewards/VisualizationJSONCombinedORM/mean": 0.539077877998352, "rewards/VisualizationJSONCombinedORM/std": 0.14017288386821747, "step": 3880, "train_speed(iter/s)": 0.392809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 302.4375, "completions/min_length": 223.0, "epoch": 3.2100909842845327, "grad_norm": 0.15880391001701355, "kl": 0.07757568359375, "learning_rate": 3.4219983616172743e-06, "loss": 0.0007766559720039368, "memory(GiB)": 38.09, "reward": 0.49193012714385986, "reward_std": 0.07638963311910629, "rewards/VisualizationJSONCombinedORM/mean": 0.49193012714385986, "rewards/VisualizationJSONCombinedORM/std": 0.2793150544166565, "step": 3881, "train_speed(iter/s)": 0.391885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 295.8125, "completions/min_length": 198.0, "epoch": 3.2109181141439205, "grad_norm": 0.16595667600631714, "kl": 0.026824951171875, "learning_rate": 3.4192587050451774e-06, "loss": 0.0002669692039489746, "memory(GiB)": 38.09, "reward": 0.3840792775154114, "reward_std": 0.024691157042980194, "rewards/VisualizationJSONCombinedORM/mean": 0.3840792775154114, "rewards/VisualizationJSONCombinedORM/std": 0.10649388283491135, "step": 3882, "train_speed(iter/s)": 0.390884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 292.3125, "completions/min_length": 243.0, "epoch": 3.2117452440033087, "grad_norm": 0.20323461294174194, "kl": 0.104248046875, "learning_rate": 3.4165195756577048e-06, "loss": 0.0010432898998260498, "memory(GiB)": 38.09, "reward": 0.5430492758750916, "reward_std": 0.04712961986660957, "rewards/VisualizationJSONCombinedORM/mean": 0.5430492758750916, "rewards/VisualizationJSONCombinedORM/std": 0.049929503351449966, "step": 3883, "train_speed(iter/s)": 0.389746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 258.0, "completions/min_length": 191.0, "epoch": 3.2125723738626966, "grad_norm": 0.25336843729019165, "kl": 0.06463623046875, "learning_rate": 3.4137809743683624e-06, "loss": 0.0006464198231697083, "memory(GiB)": 38.09, "reward": 0.6939576864242554, "reward_std": 0.09773174673318863, "rewards/VisualizationJSONCombinedORM/mean": 0.6939576864242554, "rewards/VisualizationJSONCombinedORM/std": 0.1145845577120781, "step": 3884, "train_speed(iter/s)": 0.388956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 288.9375, "completions/min_length": 236.0, "epoch": 3.2133995037220844, "grad_norm": 0.2091442495584488, "kl": 0.05291748046875, "learning_rate": 3.4110429020904924e-06, "loss": 0.0005290098488330841, "memory(GiB)": 38.09, "reward": 0.6500855684280396, "reward_std": 0.09975937008857727, "rewards/VisualizationJSONCombinedORM/mean": 0.6500855684280396, "rewards/VisualizationJSONCombinedORM/std": 0.12298014760017395, "step": 3885, "train_speed(iter/s)": 0.387946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 312.375, "completions/min_length": 242.0, "epoch": 3.214226633581472, "grad_norm": 0.16857276856899261, "kl": 0.0399169921875, "learning_rate": 3.4083053597372517e-06, "loss": 0.00039879599353298545, "memory(GiB)": 38.09, "reward": 0.6636320352554321, "reward_std": 0.03882291913032532, "rewards/VisualizationJSONCombinedORM/mean": 0.6636320352554321, "rewards/VisualizationJSONCombinedORM/std": 0.1248127818107605, "step": 3886, "train_speed(iter/s)": 0.387037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 275.875, "completions/min_length": 225.0, "epoch": 3.21505376344086, "grad_norm": 0.192679762840271, "kl": 0.0458984375, "learning_rate": 3.405568348221625e-06, "loss": 0.0004598274827003479, "memory(GiB)": 38.09, "reward": 0.44342106580734253, "reward_std": 0.06980234384536743, "rewards/VisualizationJSONCombinedORM/mean": 0.44342106580734253, "rewards/VisualizationJSONCombinedORM/std": 0.265255868434906, "step": 3887, "train_speed(iter/s)": 0.386114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 272.625, "completions/min_length": 228.0, "epoch": 3.2158808933002483, "grad_norm": 0.23217351734638214, "kl": 0.241455078125, "learning_rate": 3.4028318684564168e-06, "loss": 0.0024086683988571167, "memory(GiB)": 38.09, "reward": 0.3150331676006317, "reward_std": 0.03286166489124298, "rewards/VisualizationJSONCombinedORM/mean": 0.3150331676006317, "rewards/VisualizationJSONCombinedORM/std": 0.08056965470314026, "step": 3888, "train_speed(iter/s)": 0.385206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 295.9375, "completions/min_length": 255.0, "epoch": 3.216708023159636, "grad_norm": 0.18023091554641724, "kl": 0.10223388671875, "learning_rate": 3.4000959213542573e-06, "loss": 0.0010198578238487244, "memory(GiB)": 38.09, "reward": 0.3964884877204895, "reward_std": 0.07901504635810852, "rewards/VisualizationJSONCombinedORM/mean": 0.3964884877204895, "rewards/VisualizationJSONCombinedORM/std": 0.21016091108322144, "step": 3889, "train_speed(iter/s)": 0.384277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 284.375, "completions/min_length": 241.0, "epoch": 3.217535153019024, "grad_norm": 0.15114563703536987, "kl": 0.059326171875, "learning_rate": 3.3973605078275955e-06, "loss": 0.0005939528346061707, "memory(GiB)": 38.09, "reward": 0.4387819170951843, "reward_std": 0.03291592746973038, "rewards/VisualizationJSONCombinedORM/mean": 0.4387819170951843, "rewards/VisualizationJSONCombinedORM/std": 0.04303594306111336, "step": 3890, "train_speed(iter/s)": 0.383382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 280.625, "completions/min_length": 229.0, "epoch": 3.2183622828784118, "grad_norm": 0.29522454738616943, "kl": 0.04681396484375, "learning_rate": 3.3946256287887095e-06, "loss": 0.00046785827726125717, "memory(GiB)": 38.09, "reward": 0.556613028049469, "reward_std": 0.07412241399288177, "rewards/VisualizationJSONCombinedORM/mean": 0.556613028049469, "rewards/VisualizationJSONCombinedORM/std": 0.10373975336551666, "step": 3891, "train_speed(iter/s)": 0.382373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 309.1875, "completions/min_length": 223.0, "epoch": 3.2191894127378, "grad_norm": 0.20746275782585144, "kl": 0.09722900390625, "learning_rate": 3.391891285149688e-06, "loss": 0.0009738225489854813, "memory(GiB)": 38.09, "reward": 0.4783738851547241, "reward_std": 0.06924742460250854, "rewards/VisualizationJSONCombinedORM/mean": 0.4783738851547241, "rewards/VisualizationJSONCombinedORM/std": 0.0876987874507904, "step": 3892, "train_speed(iter/s)": 0.381276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 297.25, "completions/min_length": 235.0, "epoch": 3.220016542597188, "grad_norm": 0.18810592591762543, "kl": 0.0888671875, "learning_rate": 3.3891574778224524e-06, "loss": 0.0008893720805644989, "memory(GiB)": 38.09, "reward": 0.6903256773948669, "reward_std": 0.06969361007213593, "rewards/VisualizationJSONCombinedORM/mean": 0.6903256773948669, "rewards/VisualizationJSONCombinedORM/std": 0.06851959973573685, "step": 3893, "train_speed(iter/s)": 0.380405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 277.625, "completions/min_length": 209.0, "epoch": 3.2208436724565757, "grad_norm": 0.2198665589094162, "kl": 0.09332275390625, "learning_rate": 3.3864242077187364e-06, "loss": 0.0009314902126789093, "memory(GiB)": 38.09, "reward": 0.250996857881546, "reward_std": 0.03386756032705307, "rewards/VisualizationJSONCombinedORM/mean": 0.250996857881546, "rewards/VisualizationJSONCombinedORM/std": 0.04701066389679909, "step": 3894, "train_speed(iter/s)": 0.379548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 340.75, "completions/min_length": 276.0, "epoch": 3.2216708023159635, "grad_norm": 0.19013802707195282, "kl": 0.030242919921875, "learning_rate": 3.3836914757501023e-06, "loss": 0.00030203163623809814, "memory(GiB)": 38.09, "reward": 0.7635180354118347, "reward_std": 0.08827364444732666, "rewards/VisualizationJSONCombinedORM/mean": 0.7635180354118347, "rewards/VisualizationJSONCombinedORM/std": 0.08544217795133591, "step": 3895, "train_speed(iter/s)": 0.378708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 292.4375, "completions/min_length": 244.0, "epoch": 3.2224979321753517, "grad_norm": 0.1719502955675125, "kl": 0.05108642578125, "learning_rate": 3.3809592828279254e-06, "loss": 0.0005115419626235962, "memory(GiB)": 38.09, "reward": 0.4042167663574219, "reward_std": 0.03978923335671425, "rewards/VisualizationJSONCombinedORM/mean": 0.4042167663574219, "rewards/VisualizationJSONCombinedORM/std": 0.05416806414723396, "step": 3896, "train_speed(iter/s)": 0.377856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 294.625, "completions/min_length": 244.0, "epoch": 3.2233250620347396, "grad_norm": 0.2010744959115982, "kl": 0.0697021484375, "learning_rate": 3.3782276298634076e-06, "loss": 0.0006955601274967194, "memory(GiB)": 38.09, "reward": 0.5088704824447632, "reward_std": 0.04974967613816261, "rewards/VisualizationJSONCombinedORM/mean": 0.5088704824447632, "rewards/VisualizationJSONCombinedORM/std": 0.10327069461345673, "step": 3897, "train_speed(iter/s)": 0.377088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 274.0625, "completions/min_length": 216.0, "epoch": 3.2241521918941274, "grad_norm": 0.23928192257881165, "kl": 0.190185546875, "learning_rate": 3.375496517767564e-06, "loss": 0.0018997788429260254, "memory(GiB)": 38.09, "reward": 0.5908402800559998, "reward_std": 0.07841864973306656, "rewards/VisualizationJSONCombinedORM/mean": 0.5908402800559998, "rewards/VisualizationJSONCombinedORM/std": 0.09948690235614777, "step": 3898, "train_speed(iter/s)": 0.376333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 267.0, "completions/min_length": 211.0, "epoch": 3.224979321753515, "grad_norm": 0.2191995531320572, "kl": 0.18701171875, "learning_rate": 3.37276594745124e-06, "loss": 0.0018691495060920715, "memory(GiB)": 38.09, "reward": 0.4697924554347992, "reward_std": 0.0639309510588646, "rewards/VisualizationJSONCombinedORM/mean": 0.4697924554347992, "rewards/VisualizationJSONCombinedORM/std": 0.19960156083106995, "step": 3899, "train_speed(iter/s)": 0.375565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 283.875, "completions/min_length": 217.0, "epoch": 3.225806451612903, "grad_norm": 0.1818678379058838, "kl": 0.15618896484375, "learning_rate": 3.3700359198250854e-06, "loss": 0.0015591811388731003, "memory(GiB)": 38.09, "reward": 0.6877628564834595, "reward_std": 0.05743515491485596, "rewards/VisualizationJSONCombinedORM/mean": 0.6877628564834595, "rewards/VisualizationJSONCombinedORM/std": 0.07201993465423584, "step": 3900, "train_speed(iter/s)": 0.37475 }, { "epoch": 3.225806451612903, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 355.7916666666667, "eval_completions/mean_length": 291.8385416666667, "eval_completions/min_length": 240.875, "eval_kl": 0.10759480794270833, "eval_loss": 0.001078678877092898, "eval_reward": 0.4914383056263129, "eval_reward_std": 0.05497633902511249, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4914383056263129, "eval_rewards/VisualizationJSONCombinedORM/std": 0.054976338132595025, "eval_runtime": 304.7645, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 3900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 292.4375, "completions/min_length": 228.0, "epoch": 3.2266335814722913, "grad_norm": 0.18776264786720276, "kl": 0.09173583984375, "learning_rate": 3.3673064357995844e-06, "loss": 0.0009189732372760773, "memory(GiB)": 38.09, "reward": 0.6665408611297607, "reward_std": 0.078091099858284, "rewards/VisualizationJSONCombinedORM/mean": 0.6665408611297607, "rewards/VisualizationJSONCombinedORM/std": 0.10341262072324753, "step": 3901, "train_speed(iter/s)": 0.363364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 297.375, "completions/min_length": 250.0, "epoch": 3.227460711331679, "grad_norm": 0.20861086249351501, "kl": 0.2025146484375, "learning_rate": 3.3645774962850287e-06, "loss": 0.002031080424785614, "memory(GiB)": 38.09, "reward": 0.537220299243927, "reward_std": 0.04911094158887863, "rewards/VisualizationJSONCombinedORM/mean": 0.537220299243927, "rewards/VisualizationJSONCombinedORM/std": 0.14580559730529785, "step": 3902, "train_speed(iter/s)": 0.362498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 261.6875, "completions/min_length": 220.0, "epoch": 3.228287841191067, "grad_norm": 0.42271897196769714, "kl": 0.310791015625, "learning_rate": 3.3618491021915334e-06, "loss": 0.00310361385345459, "memory(GiB)": 38.09, "reward": 0.5119221806526184, "reward_std": 0.06268922239542007, "rewards/VisualizationJSONCombinedORM/mean": 0.5119221806526184, "rewards/VisualizationJSONCombinedORM/std": 0.19169750809669495, "step": 3903, "train_speed(iter/s)": 0.361922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 287.25, "completions/min_length": 213.0, "epoch": 3.2291149710504548, "grad_norm": 0.17461960017681122, "kl": 0.05010986328125, "learning_rate": 3.35912125442903e-06, "loss": 0.0005012676119804382, "memory(GiB)": 38.09, "reward": 0.5487709641456604, "reward_std": 0.047255996614694595, "rewards/VisualizationJSONCombinedORM/mean": 0.5487709641456604, "rewards/VisualizationJSONCombinedORM/std": 0.2849633991718292, "step": 3904, "train_speed(iter/s)": 0.361198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 313.3125, "completions/min_length": 264.0, "epoch": 3.229942100909843, "grad_norm": 0.21571412682533264, "kl": 0.1319580078125, "learning_rate": 3.356393953907271e-06, "loss": 0.001323893666267395, "memory(GiB)": 38.09, "reward": 0.7030558586120605, "reward_std": 0.07723242044448853, "rewards/VisualizationJSONCombinedORM/mean": 0.7030558586120605, "rewards/VisualizationJSONCombinedORM/std": 0.08224521577358246, "step": 3905, "train_speed(iter/s)": 0.360257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 298.9375, "completions/min_length": 234.0, "epoch": 3.230769230769231, "grad_norm": 0.17073437571525574, "kl": 0.10400390625, "learning_rate": 3.353667201535819e-06, "loss": 0.0010370761156082153, "memory(GiB)": 38.09, "reward": 0.5009043216705322, "reward_std": 0.057745859026908875, "rewards/VisualizationJSONCombinedORM/mean": 0.5009043216705322, "rewards/VisualizationJSONCombinedORM/std": 0.06810910999774933, "step": 3906, "train_speed(iter/s)": 0.359492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 281.6875, "completions/min_length": 214.0, "epoch": 3.2315963606286187, "grad_norm": 0.24344402551651, "kl": 0.085693359375, "learning_rate": 3.3509409982240654e-06, "loss": 0.0008556470274925232, "memory(GiB)": 38.09, "reward": 0.4710869789123535, "reward_std": 0.0662900060415268, "rewards/VisualizationJSONCombinedORM/mean": 0.4710869789123535, "rewards/VisualizationJSONCombinedORM/std": 0.08418548107147217, "step": 3907, "train_speed(iter/s)": 0.358744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 260.8125, "completions/min_length": 217.0, "epoch": 3.2324234904880065, "grad_norm": 0.21370379626750946, "kl": 0.0831298828125, "learning_rate": 3.348215344881205e-06, "loss": 0.0008321031928062439, "memory(GiB)": 38.09, "reward": 0.42940178513526917, "reward_std": 0.05284809321165085, "rewards/VisualizationJSONCombinedORM/mean": 0.42940178513526917, "rewards/VisualizationJSONCombinedORM/std": 0.1412433534860611, "step": 3908, "train_speed(iter/s)": 0.357957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 316.1875, "completions/min_length": 226.0, "epoch": 3.2332506203473947, "grad_norm": 0.1893322616815567, "kl": 0.086181640625, "learning_rate": 3.3454902424162603e-06, "loss": 0.0008610598742961884, "memory(GiB)": 38.09, "reward": 0.48448115587234497, "reward_std": 0.04830528795719147, "rewards/VisualizationJSONCombinedORM/mean": 0.48448115587234497, "rewards/VisualizationJSONCombinedORM/std": 0.2977508306503296, "step": 3909, "train_speed(iter/s)": 0.357272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 297.0, "completions/min_length": 240.0, "epoch": 3.2340777502067826, "grad_norm": 0.20149628818035126, "kl": 0.0379638671875, "learning_rate": 3.342765691738064e-06, "loss": 0.00038058310747146606, "memory(GiB)": 38.09, "reward": 0.5675271153450012, "reward_std": 0.035316139459609985, "rewards/VisualizationJSONCombinedORM/mean": 0.5675271153450012, "rewards/VisualizationJSONCombinedORM/std": 0.29695695638656616, "step": 3910, "train_speed(iter/s)": 0.356495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 296.5625, "completions/min_length": 239.0, "epoch": 3.2349048800661704, "grad_norm": 0.18739038705825806, "kl": 0.078369140625, "learning_rate": 3.340041693755268e-06, "loss": 0.000784851610660553, "memory(GiB)": 38.09, "reward": 0.36978018283843994, "reward_std": 0.031937215477228165, "rewards/VisualizationJSONCombinedORM/mean": 0.36978018283843994, "rewards/VisualizationJSONCombinedORM/std": 0.033738717436790466, "step": 3911, "train_speed(iter/s)": 0.355913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 281.3125, "completions/min_length": 242.0, "epoch": 3.235732009925558, "grad_norm": 0.15709498524665833, "kl": 0.12554931640625, "learning_rate": 3.3373182493763366e-06, "loss": 0.0012581069022417068, "memory(GiB)": 38.09, "reward": 0.5075410008430481, "reward_std": 0.047947220504283905, "rewards/VisualizationJSONCombinedORM/mean": 0.5075410008430481, "rewards/VisualizationJSONCombinedORM/std": 0.23024989664554596, "step": 3912, "train_speed(iter/s)": 0.355213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 286.5, "completions/min_length": 242.0, "epoch": 3.236559139784946, "grad_norm": 0.278872013092041, "kl": 0.10833740234375, "learning_rate": 3.3345953595095526e-06, "loss": 0.0010865777730941772, "memory(GiB)": 38.09, "reward": 0.5311813354492188, "reward_std": 0.06209708750247955, "rewards/VisualizationJSONCombinedORM/mean": 0.5311813354492188, "rewards/VisualizationJSONCombinedORM/std": 0.13830095529556274, "step": 3913, "train_speed(iter/s)": 0.354516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 303.0, "completions/min_length": 246.0, "epoch": 3.2373862696443343, "grad_norm": 0.1918768286705017, "kl": 0.08251953125, "learning_rate": 3.3318730250630105e-06, "loss": 0.00082381721585989, "memory(GiB)": 38.09, "reward": 0.4780036211013794, "reward_std": 0.0708676129579544, "rewards/VisualizationJSONCombinedORM/mean": 0.4780036211013794, "rewards/VisualizationJSONCombinedORM/std": 0.10050535947084427, "step": 3914, "train_speed(iter/s)": 0.353736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 313.375, "completions/min_length": 261.0, "epoch": 3.238213399503722, "grad_norm": 0.17938023805618286, "kl": 0.08380126953125, "learning_rate": 3.3291512469446253e-06, "loss": 0.0008395686745643616, "memory(GiB)": 38.09, "reward": 0.6379745006561279, "reward_std": 0.07106837630271912, "rewards/VisualizationJSONCombinedORM/mean": 0.6379745006561279, "rewards/VisualizationJSONCombinedORM/std": 0.11279087513685226, "step": 3915, "train_speed(iter/s)": 0.353079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 317.125, "completions/min_length": 243.0, "epoch": 3.23904052936311, "grad_norm": 0.1781550645828247, "kl": 0.1181640625, "learning_rate": 3.3264300260621175e-06, "loss": 0.0011826790869235992, "memory(GiB)": 38.09, "reward": 0.54201340675354, "reward_std": 0.02895347774028778, "rewards/VisualizationJSONCombinedORM/mean": 0.54201340675354, "rewards/VisualizationJSONCombinedORM/std": 0.22731126844882965, "step": 3916, "train_speed(iter/s)": 0.352278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 314.625, "completions/min_length": 231.0, "epoch": 3.2398676592224978, "grad_norm": 0.21863694489002228, "kl": 0.05035400390625, "learning_rate": 3.3237093633230323e-06, "loss": 0.0005036965012550354, "memory(GiB)": 38.09, "reward": 0.6978554725646973, "reward_std": 0.0917125940322876, "rewards/VisualizationJSONCombinedORM/mean": 0.6978554725646973, "rewards/VisualizationJSONCombinedORM/std": 0.12665385007858276, "step": 3917, "train_speed(iter/s)": 0.351319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 301.4375, "completions/min_length": 243.0, "epoch": 3.240694789081886, "grad_norm": 0.19734922051429749, "kl": 0.0723876953125, "learning_rate": 3.32098925963472e-06, "loss": 0.0007241889834403992, "memory(GiB)": 38.09, "reward": 0.3583793640136719, "reward_std": 0.023190852254629135, "rewards/VisualizationJSONCombinedORM/mean": 0.3583793640136719, "rewards/VisualizationJSONCombinedORM/std": 0.08837185055017471, "step": 3918, "train_speed(iter/s)": 0.350649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 290.5, "completions/min_length": 215.0, "epoch": 3.241521918941274, "grad_norm": 0.23838593065738678, "kl": 0.154296875, "learning_rate": 3.31826971590435e-06, "loss": 0.0015397127717733383, "memory(GiB)": 38.09, "reward": 0.4176177978515625, "reward_std": 0.0454772524535656, "rewards/VisualizationJSONCombinedORM/mean": 0.4176177978515625, "rewards/VisualizationJSONCombinedORM/std": 0.2702499032020569, "step": 3919, "train_speed(iter/s)": 0.350038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/mean_length": 315.25, "completions/min_length": 227.0, "epoch": 3.2423490488006617, "grad_norm": 0.20837117731571198, "kl": 0.0460205078125, "learning_rate": 3.3155507330389004e-06, "loss": 0.0004611164331436157, "memory(GiB)": 38.09, "reward": 0.6095705032348633, "reward_std": 0.05402890592813492, "rewards/VisualizationJSONCombinedORM/mean": 0.6095705032348633, "rewards/VisualizationJSONCombinedORM/std": 0.17293956875801086, "step": 3920, "train_speed(iter/s)": 0.349275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 315.5625, "completions/min_length": 238.0, "epoch": 3.2431761786600495, "grad_norm": 0.20386241376399994, "kl": 0.046142578125, "learning_rate": 3.3128323119451654e-06, "loss": 0.0004623234272003174, "memory(GiB)": 38.09, "reward": 0.6365994215011597, "reward_std": 0.07106654345989227, "rewards/VisualizationJSONCombinedORM/mean": 0.6365994215011597, "rewards/VisualizationJSONCombinedORM/std": 0.13143950700759888, "step": 3921, "train_speed(iter/s)": 0.348614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 306.9375, "completions/min_length": 250.0, "epoch": 3.2440033085194377, "grad_norm": 0.35865116119384766, "kl": 0.176513671875, "learning_rate": 3.3101144535297557e-06, "loss": 0.0017686448991298676, "memory(GiB)": 38.09, "reward": 0.6780471205711365, "reward_std": 0.08922889828681946, "rewards/VisualizationJSONCombinedORM/mean": 0.6780471205711365, "rewards/VisualizationJSONCombinedORM/std": 0.10330678522586823, "step": 3922, "train_speed(iter/s)": 0.347896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 298.5625, "completions/min_length": 224.0, "epoch": 3.2448304383788256, "grad_norm": 0.19895684719085693, "kl": 0.04901123046875, "learning_rate": 3.307397158699083e-06, "loss": 0.0004897825419902802, "memory(GiB)": 38.09, "reward": 0.6492924690246582, "reward_std": 0.05259978026151657, "rewards/VisualizationJSONCombinedORM/mean": 0.6492924690246582, "rewards/VisualizationJSONCombinedORM/std": 0.2194262444972992, "step": 3923, "train_speed(iter/s)": 0.347198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 298.9375, "completions/min_length": 245.0, "epoch": 3.2456575682382134, "grad_norm": 0.17578576505184174, "kl": 0.06195068359375, "learning_rate": 3.3046804283593813e-06, "loss": 0.000620722770690918, "memory(GiB)": 38.09, "reward": 0.59543776512146, "reward_std": 0.041768353432416916, "rewards/VisualizationJSONCombinedORM/mean": 0.59543776512146, "rewards/VisualizationJSONCombinedORM/std": 0.24112671613693237, "step": 3924, "train_speed(iter/s)": 0.34647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 294.9375, "completions/min_length": 215.0, "epoch": 3.246484698097601, "grad_norm": 0.20237910747528076, "kl": 0.04443359375, "learning_rate": 3.301964263416693e-06, "loss": 0.00044420361518859863, "memory(GiB)": 38.09, "reward": 0.48188796639442444, "reward_std": 0.05900809168815613, "rewards/VisualizationJSONCombinedORM/mean": 0.48188796639442444, "rewards/VisualizationJSONCombinedORM/std": 0.18927231431007385, "step": 3925, "train_speed(iter/s)": 0.345748 }, { "epoch": 3.246484698097601, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 367.4583333333333, "eval_completions/mean_length": 298.65625, "eval_completions/min_length": 247.25, "eval_kl": 0.07325236002604167, "eval_loss": 0.0007306523621082306, "eval_reward": 0.44987330958247185, "eval_reward_std": 0.057432539373015366, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44987330958247185, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05743254108044008, "eval_runtime": 312.0233, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 3925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 305.875, "completions/min_length": 241.0, "epoch": 3.247311827956989, "grad_norm": 0.18272265791893005, "kl": 0.05718994140625, "learning_rate": 3.299248664776872e-06, "loss": 0.0005704984068870544, "memory(GiB)": 38.09, "reward": 0.41570186614990234, "reward_std": 0.02766098827123642, "rewards/VisualizationJSONCombinedORM/mean": 0.41570186614990234, "rewards/VisualizationJSONCombinedORM/std": 0.15726017951965332, "step": 3926, "train_speed(iter/s)": 0.335791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 308.375, "completions/min_length": 239.0, "epoch": 3.2481389578163773, "grad_norm": 0.18384774029254913, "kl": 0.0684814453125, "learning_rate": 3.296533633345581e-06, "loss": 0.0006841300055384636, "memory(GiB)": 38.09, "reward": 0.3605901896953583, "reward_std": 0.03720253333449364, "rewards/VisualizationJSONCombinedORM/mean": 0.3605901896953583, "rewards/VisualizationJSONCombinedORM/std": 0.04843946918845177, "step": 3927, "train_speed(iter/s)": 0.335207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 283.75, "completions/min_length": 235.0, "epoch": 3.248966087675765, "grad_norm": 0.14484667778015137, "kl": 0.0211944580078125, "learning_rate": 3.293819170028299e-06, "loss": 0.00020940974354743958, "memory(GiB)": 38.09, "reward": 0.5296220183372498, "reward_std": 0.045990973711013794, "rewards/VisualizationJSONCombinedORM/mean": 0.5296220183372498, "rewards/VisualizationJSONCombinedORM/std": 0.23852337896823883, "step": 3928, "train_speed(iter/s)": 0.334525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 306.25, "completions/min_length": 241.0, "epoch": 3.249793217535153, "grad_norm": 0.18070095777511597, "kl": 0.05908203125, "learning_rate": 3.2911052757303075e-06, "loss": 0.0005915649235248566, "memory(GiB)": 38.09, "reward": 0.6383959650993347, "reward_std": 0.0679004043340683, "rewards/VisualizationJSONCombinedORM/mean": 0.6383959650993347, "rewards/VisualizationJSONCombinedORM/std": 0.16345642507076263, "step": 3929, "train_speed(iter/s)": 0.333869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 303.75, "completions/min_length": 241.0, "epoch": 3.2506203473945408, "grad_norm": 0.17316772043704987, "kl": 0.03558349609375, "learning_rate": 3.2883919513567096e-06, "loss": 0.0003562420606613159, "memory(GiB)": 38.09, "reward": 0.7268398404121399, "reward_std": 0.05265241116285324, "rewards/VisualizationJSONCombinedORM/mean": 0.7268398404121399, "rewards/VisualizationJSONCombinedORM/std": 0.12707163393497467, "step": 3930, "train_speed(iter/s)": 0.333286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 295.1875, "completions/min_length": 209.0, "epoch": 3.251447477253929, "grad_norm": 0.1760164201259613, "kl": 0.0615234375, "learning_rate": 3.285679197812405e-06, "loss": 0.0006146281957626343, "memory(GiB)": 38.09, "reward": 0.5006356239318848, "reward_std": 0.06332559883594513, "rewards/VisualizationJSONCombinedORM/mean": 0.5006356239318848, "rewards/VisualizationJSONCombinedORM/std": 0.1736287623643875, "step": 3931, "train_speed(iter/s)": 0.332763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 323.4375, "completions/min_length": 269.0, "epoch": 3.252274607113317, "grad_norm": 0.20449717342853546, "kl": 0.0716552734375, "learning_rate": 3.2829670160021137e-06, "loss": 0.0007164943963289261, "memory(GiB)": 38.09, "reward": 0.5342609882354736, "reward_std": 0.049470700323581696, "rewards/VisualizationJSONCombinedORM/mean": 0.5342609882354736, "rewards/VisualizationJSONCombinedORM/std": 0.20173312723636627, "step": 3932, "train_speed(iter/s)": 0.33207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 312.6875, "completions/min_length": 242.0, "epoch": 3.2531017369727047, "grad_norm": 0.21101684868335724, "kl": 0.0390625, "learning_rate": 3.2802554068303595e-06, "loss": 0.00039118528366088867, "memory(GiB)": 38.09, "reward": 0.5827499628067017, "reward_std": 0.0849158763885498, "rewards/VisualizationJSONCombinedORM/mean": 0.5827499628067017, "rewards/VisualizationJSONCombinedORM/std": 0.1873796582221985, "step": 3933, "train_speed(iter/s)": 0.331415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 298.75, "completions/min_length": 245.0, "epoch": 3.2539288668320925, "grad_norm": 0.19970820844173431, "kl": 0.03668212890625, "learning_rate": 3.2775443712014775e-06, "loss": 0.0003672987222671509, "memory(GiB)": 38.09, "reward": 0.6850647926330566, "reward_std": 0.03597213700413704, "rewards/VisualizationJSONCombinedORM/mean": 0.6850647926330566, "rewards/VisualizationJSONCombinedORM/std": 0.09814826399087906, "step": 3934, "train_speed(iter/s)": 0.330789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 277.25, "completions/min_length": 225.0, "epoch": 3.2547559966914807, "grad_norm": 0.265522837638855, "kl": 0.047119140625, "learning_rate": 3.2748339100196105e-06, "loss": 0.0004714950919151306, "memory(GiB)": 38.09, "reward": 0.7025071382522583, "reward_std": 0.11067010462284088, "rewards/VisualizationJSONCombinedORM/mean": 0.7025071382522583, "rewards/VisualizationJSONCombinedORM/std": 0.10726971179246902, "step": 3935, "train_speed(iter/s)": 0.33011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 299.375, "completions/min_length": 233.0, "epoch": 3.2555831265508686, "grad_norm": 0.1966657042503357, "kl": 0.0869140625, "learning_rate": 3.2721240241887108e-06, "loss": 0.0008688271045684814, "memory(GiB)": 38.09, "reward": 0.4190192222595215, "reward_std": 0.05299074202775955, "rewards/VisualizationJSONCombinedORM/mean": 0.4190192222595215, "rewards/VisualizationJSONCombinedORM/std": 0.07715815305709839, "step": 3936, "train_speed(iter/s)": 0.329453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 278.1875, "completions/min_length": 224.0, "epoch": 3.2564102564102564, "grad_norm": 0.1704120635986328, "kl": 0.0721435546875, "learning_rate": 3.269414714612534e-06, "loss": 0.000721195712685585, "memory(GiB)": 38.09, "reward": 0.43344828486442566, "reward_std": 0.0980769544839859, "rewards/VisualizationJSONCombinedORM/mean": 0.43344828486442566, "rewards/VisualizationJSONCombinedORM/std": 0.13031266629695892, "step": 3937, "train_speed(iter/s)": 0.328718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 300.625, "completions/min_length": 251.0, "epoch": 3.257237386269644, "grad_norm": 0.20225489139556885, "kl": 0.077880859375, "learning_rate": 3.2667059821946557e-06, "loss": 0.000778689980506897, "memory(GiB)": 38.09, "reward": 0.655150294303894, "reward_std": 0.09442444890737534, "rewards/VisualizationJSONCombinedORM/mean": 0.655150294303894, "rewards/VisualizationJSONCombinedORM/std": 0.1283566951751709, "step": 3938, "train_speed(iter/s)": 0.328239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 306.6875, "completions/min_length": 236.0, "epoch": 3.258064516129032, "grad_norm": 0.1692354828119278, "kl": 0.030731201171875, "learning_rate": 3.263997827838441e-06, "loss": 0.00030765682458877563, "memory(GiB)": 38.09, "reward": 0.753251850605011, "reward_std": 0.0581209696829319, "rewards/VisualizationJSONCombinedORM/mean": 0.753251850605011, "rewards/VisualizationJSONCombinedORM/std": 0.1372632086277008, "step": 3939, "train_speed(iter/s)": 0.327778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 331.875, "completions/min_length": 281.0, "epoch": 3.2588916459884203, "grad_norm": 0.16530071198940277, "kl": 0.0843505859375, "learning_rate": 3.2612902524470803e-06, "loss": 0.0008443072438240051, "memory(GiB)": 38.09, "reward": 0.6529019474983215, "reward_std": 0.04859240725636482, "rewards/VisualizationJSONCombinedORM/mean": 0.6529019474983215, "rewards/VisualizationJSONCombinedORM/std": 0.19671401381492615, "step": 3940, "train_speed(iter/s)": 0.32708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 342.75, "completions/min_length": 270.0, "epoch": 3.259718775847808, "grad_norm": 0.22575290501117706, "kl": 0.04901123046875, "learning_rate": 3.2585832569235576e-06, "loss": 0.0004909615963697433, "memory(GiB)": 38.09, "reward": 0.4654794931411743, "reward_std": 0.07892154902219772, "rewards/VisualizationJSONCombinedORM/mean": 0.4654794931411743, "rewards/VisualizationJSONCombinedORM/std": 0.19080553948879242, "step": 3941, "train_speed(iter/s)": 0.326573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 300.3125, "completions/min_length": 249.0, "epoch": 3.260545905707196, "grad_norm": 0.1787170171737671, "kl": 0.0572509765625, "learning_rate": 3.255876842170671e-06, "loss": 0.0005733668804168701, "memory(GiB)": 38.09, "reward": 0.30074572563171387, "reward_std": 0.03141094371676445, "rewards/VisualizationJSONCombinedORM/mean": 0.30074572563171387, "rewards/VisualizationJSONCombinedORM/std": 0.119420126080513, "step": 3942, "train_speed(iter/s)": 0.325986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 282.0, "completions/min_length": 224.0, "epoch": 3.261373035566584, "grad_norm": 0.20739172399044037, "kl": 0.16455078125, "learning_rate": 3.2531710090910207e-06, "loss": 0.0016418732702732086, "memory(GiB)": 38.09, "reward": 0.3651556670665741, "reward_std": 0.0352802649140358, "rewards/VisualizationJSONCombinedORM/mean": 0.3651556670665741, "rewards/VisualizationJSONCombinedORM/std": 0.03453551605343819, "step": 3943, "train_speed(iter/s)": 0.325341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 286.0625, "completions/min_length": 226.0, "epoch": 3.262200165425972, "grad_norm": 0.17438194155693054, "kl": 0.04351806640625, "learning_rate": 3.250465758587017e-06, "loss": 0.00043485313653945923, "memory(GiB)": 38.09, "reward": 0.7174633741378784, "reward_std": 0.05763384327292442, "rewards/VisualizationJSONCombinedORM/mean": 0.7174633741378784, "rewards/VisualizationJSONCombinedORM/std": 0.06863421946763992, "step": 3944, "train_speed(iter/s)": 0.324677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 323.25, "completions/min_length": 279.0, "epoch": 3.26302729528536, "grad_norm": 0.1819310039281845, "kl": 0.07720947265625, "learning_rate": 3.2477610915608705e-06, "loss": 0.0007734298706054688, "memory(GiB)": 38.09, "reward": 0.5837696194648743, "reward_std": 0.06812523305416107, "rewards/VisualizationJSONCombinedORM/mean": 0.5837696194648743, "rewards/VisualizationJSONCombinedORM/std": 0.0692908838391304, "step": 3945, "train_speed(iter/s)": 0.324082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 304.1875, "completions/min_length": 227.0, "epoch": 3.2638544251447477, "grad_norm": 0.18155010044574738, "kl": 0.072021484375, "learning_rate": 3.245057008914605e-06, "loss": 0.0007190555334091187, "memory(GiB)": 38.09, "reward": 0.29562145471572876, "reward_std": 0.029763100668787956, "rewards/VisualizationJSONCombinedORM/mean": 0.29562145471572876, "rewards/VisualizationJSONCombinedORM/std": 0.09458355605602264, "step": 3946, "train_speed(iter/s)": 0.323415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 312.3125, "completions/min_length": 244.0, "epoch": 3.2646815550041355, "grad_norm": 0.23412899672985077, "kl": 0.075927734375, "learning_rate": 3.2423535115500387e-06, "loss": 0.0007562637329101562, "memory(GiB)": 38.09, "reward": 0.4218783676624298, "reward_std": 0.04151558503508568, "rewards/VisualizationJSONCombinedORM/mean": 0.4218783676624298, "rewards/VisualizationJSONCombinedORM/std": 0.1139732077717781, "step": 3947, "train_speed(iter/s)": 0.322887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 313.0625, "completions/min_length": 259.0, "epoch": 3.2655086848635237, "grad_norm": 0.17753347754478455, "kl": 0.04296875, "learning_rate": 3.2396506003688062e-06, "loss": 0.0004293471574783325, "memory(GiB)": 38.09, "reward": 0.44818928837776184, "reward_std": 0.043974678963422775, "rewards/VisualizationJSONCombinedORM/mean": 0.44818928837776184, "rewards/VisualizationJSONCombinedORM/std": 0.047113317996263504, "step": 3948, "train_speed(iter/s)": 0.322267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 272.625, "completions/min_length": 219.0, "epoch": 3.2663358147229116, "grad_norm": 0.19307167828083038, "kl": 0.0845947265625, "learning_rate": 3.236948276272337e-06, "loss": 0.0008476227521896362, "memory(GiB)": 38.09, "reward": 0.5230966806411743, "reward_std": 0.063083216547966, "rewards/VisualizationJSONCombinedORM/mean": 0.5230966806411743, "rewards/VisualizationJSONCombinedORM/std": 0.147319957613945, "step": 3949, "train_speed(iter/s)": 0.321812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 270.0625, "completions/min_length": 216.0, "epoch": 3.2671629445822994, "grad_norm": 0.16738297045230865, "kl": 0.05126953125, "learning_rate": 3.2342465401618715e-06, "loss": 0.0005125813186168671, "memory(GiB)": 38.09, "reward": 0.7145106792449951, "reward_std": 0.06571555137634277, "rewards/VisualizationJSONCombinedORM/mean": 0.7145106792449951, "rewards/VisualizationJSONCombinedORM/std": 0.06608670949935913, "step": 3950, "train_speed(iter/s)": 0.321295 }, { "epoch": 3.2671629445822994, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 365.5833333333333, "eval_completions/mean_length": 299.8125, "eval_completions/min_length": 248.75, "eval_kl": 0.068756103515625, "eval_loss": 0.0006934019620530307, "eval_reward": 0.4485280861457189, "eval_reward_std": 0.04753485166778167, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4485280861457189, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04753485228866339, "eval_runtime": 311.3297, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 3950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 288.25, "completions/min_length": 239.0, "epoch": 3.267990074441687, "grad_norm": 0.22839851677417755, "kl": 0.06146240234375, "learning_rate": 3.23154539293845e-06, "loss": 0.0006140526384115219, "memory(GiB)": 38.09, "reward": 0.6881741881370544, "reward_std": 0.06696444004774094, "rewards/VisualizationJSONCombinedORM/mean": 0.6881741881370544, "rewards/VisualizationJSONCombinedORM/std": 0.09806489199399948, "step": 3951, "train_speed(iter/s)": 0.312814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 288.375, "completions/min_length": 237.0, "epoch": 3.268817204301075, "grad_norm": 0.1934506744146347, "kl": 0.07830810546875, "learning_rate": 3.228844835502919e-06, "loss": 0.000784650444984436, "memory(GiB)": 38.09, "reward": 0.6687617301940918, "reward_std": 0.03741852566599846, "rewards/VisualizationJSONCombinedORM/mean": 0.6687617301940918, "rewards/VisualizationJSONCombinedORM/std": 0.04597092419862747, "step": 3952, "train_speed(iter/s)": 0.312224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 301.75, "completions/min_length": 263.0, "epoch": 3.2696443341604633, "grad_norm": 0.18902723491191864, "kl": 0.06671142578125, "learning_rate": 3.226144868755923e-06, "loss": 0.0006670765578746796, "memory(GiB)": 38.09, "reward": 0.5228325128555298, "reward_std": 0.06142743304371834, "rewards/VisualizationJSONCombinedORM/mean": 0.5228325128555298, "rewards/VisualizationJSONCombinedORM/std": 0.1926710456609726, "step": 3953, "train_speed(iter/s)": 0.311685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 303.75, "completions/min_length": 236.0, "epoch": 3.270471464019851, "grad_norm": 0.15586183965206146, "kl": 0.03765869140625, "learning_rate": 3.223445493597921e-06, "loss": 0.00037654489278793335, "memory(GiB)": 38.09, "reward": 0.6221838593482971, "reward_std": 0.07791689783334732, "rewards/VisualizationJSONCombinedORM/mean": 0.6221838593482971, "rewards/VisualizationJSONCombinedORM/std": 0.15452787280082703, "step": 3954, "train_speed(iter/s)": 0.311065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 289.25, "completions/min_length": 220.0, "epoch": 3.271298593879239, "grad_norm": 0.28535744547843933, "kl": 0.07830810546875, "learning_rate": 3.220746710929159e-06, "loss": 0.0007843077182769775, "memory(GiB)": 38.09, "reward": 0.6043870449066162, "reward_std": 0.0930270105600357, "rewards/VisualizationJSONCombinedORM/mean": 0.6043870449066162, "rewards/VisualizationJSONCombinedORM/std": 0.11585914343595505, "step": 3955, "train_speed(iter/s)": 0.310603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 307.6875, "completions/min_length": 239.0, "epoch": 3.272125723738627, "grad_norm": 0.208998441696167, "kl": 0.0496826171875, "learning_rate": 3.2180485216496998e-06, "loss": 0.0004969649016857147, "memory(GiB)": 38.09, "reward": 0.6969153881072998, "reward_std": 0.08462276309728622, "rewards/VisualizationJSONCombinedORM/mean": 0.6969153881072998, "rewards/VisualizationJSONCombinedORM/std": 0.08330565690994263, "step": 3956, "train_speed(iter/s)": 0.310034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 301.0625, "completions/min_length": 248.0, "epoch": 3.272952853598015, "grad_norm": 0.18444274365901947, "kl": 0.0511474609375, "learning_rate": 3.2153509266593984e-06, "loss": 0.0005114562809467316, "memory(GiB)": 38.09, "reward": 0.5316445827484131, "reward_std": 0.06078292429447174, "rewards/VisualizationJSONCombinedORM/mean": 0.5316445827484131, "rewards/VisualizationJSONCombinedORM/std": 0.12706567347049713, "step": 3957, "train_speed(iter/s)": 0.309383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 286.1875, "completions/min_length": 235.0, "epoch": 3.273779983457403, "grad_norm": 0.22462202608585358, "kl": 0.07232666015625, "learning_rate": 3.2126539268579187e-06, "loss": 0.0007223095744848251, "memory(GiB)": 38.09, "reward": 0.4980952739715576, "reward_std": 0.045826688408851624, "rewards/VisualizationJSONCombinedORM/mean": 0.4980952739715576, "rewards/VisualizationJSONCombinedORM/std": 0.05783432349562645, "step": 3958, "train_speed(iter/s)": 0.308872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 293.4375, "completions/min_length": 234.0, "epoch": 3.2746071133167907, "grad_norm": 0.18828602135181427, "kl": 0.0496826171875, "learning_rate": 3.20995752314472e-06, "loss": 0.0004982389509677887, "memory(GiB)": 38.09, "reward": 0.6779161691665649, "reward_std": 0.050028666853904724, "rewards/VisualizationJSONCombinedORM/mean": 0.6779161691665649, "rewards/VisualizationJSONCombinedORM/std": 0.08142656832933426, "step": 3959, "train_speed(iter/s)": 0.308312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 275.875, "completions/min_length": 226.0, "epoch": 3.2754342431761785, "grad_norm": 0.1865963339805603, "kl": 0.04791259765625, "learning_rate": 3.207261716419067e-06, "loss": 0.00047842785716056824, "memory(GiB)": 38.09, "reward": 0.4639478921890259, "reward_std": 0.04664437472820282, "rewards/VisualizationJSONCombinedORM/mean": 0.4639478921890259, "rewards/VisualizationJSONCombinedORM/std": 0.06644345074892044, "step": 3960, "train_speed(iter/s)": 0.307809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 292.6875, "completions/min_length": 236.0, "epoch": 3.2762613730355667, "grad_norm": 0.25028595328330994, "kl": 0.07684326171875, "learning_rate": 3.204566507580021e-06, "loss": 0.0007690936326980591, "memory(GiB)": 38.09, "reward": 0.5425657033920288, "reward_std": 0.06245972961187363, "rewards/VisualizationJSONCombinedORM/mean": 0.5425657033920288, "rewards/VisualizationJSONCombinedORM/std": 0.0930333286523819, "step": 3961, "train_speed(iter/s)": 0.307273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 284.25, "completions/min_length": 237.0, "epoch": 3.2770885028949546, "grad_norm": 0.18818743526935577, "kl": 0.07464599609375, "learning_rate": 3.2018718975264524e-06, "loss": 0.0007502101361751556, "memory(GiB)": 38.09, "reward": 0.6758415699005127, "reward_std": 0.05267978459596634, "rewards/VisualizationJSONCombinedORM/mean": 0.6758415699005127, "rewards/VisualizationJSONCombinedORM/std": 0.06146103888750076, "step": 3962, "train_speed(iter/s)": 0.30674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 329.375, "completions/min_length": 217.0, "epoch": 3.2779156327543424, "grad_norm": 0.20766186714172363, "kl": 0.076904296875, "learning_rate": 3.19917788715702e-06, "loss": 0.0007700119167566299, "memory(GiB)": 38.09, "reward": 0.3859761357307434, "reward_std": 0.051077257841825485, "rewards/VisualizationJSONCombinedORM/mean": 0.3859761357307434, "rewards/VisualizationJSONCombinedORM/std": 0.18635974824428558, "step": 3963, "train_speed(iter/s)": 0.306028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 281.5625, "completions/min_length": 238.0, "epoch": 3.27874276261373, "grad_norm": 0.21180526912212372, "kl": 0.04638671875, "learning_rate": 3.1964844773701938e-06, "loss": 0.0004637688398361206, "memory(GiB)": 38.09, "reward": 0.4813334345817566, "reward_std": 0.02826252393424511, "rewards/VisualizationJSONCombinedORM/mean": 0.4813334345817566, "rewards/VisualizationJSONCombinedORM/std": 0.3410809636116028, "step": 3964, "train_speed(iter/s)": 0.305583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 292.375, "completions/min_length": 236.0, "epoch": 3.279569892473118, "grad_norm": 0.16957108676433563, "kl": 0.041015625, "learning_rate": 3.1937916690642356e-06, "loss": 0.0004094913601875305, "memory(GiB)": 38.09, "reward": 0.49387192726135254, "reward_std": 0.034309856593608856, "rewards/VisualizationJSONCombinedORM/mean": 0.49387192726135254, "rewards/VisualizationJSONCombinedORM/std": 0.0938865914940834, "step": 3965, "train_speed(iter/s)": 0.305068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 315.5625, "completions/min_length": 244.0, "epoch": 3.2803970223325063, "grad_norm": 0.2192777693271637, "kl": 0.114013671875, "learning_rate": 3.191099463137212e-06, "loss": 0.0011414159089326859, "memory(GiB)": 38.09, "reward": 0.402546763420105, "reward_std": 0.049940045922994614, "rewards/VisualizationJSONCombinedORM/mean": 0.402546763420105, "rewards/VisualizationJSONCombinedORM/std": 0.13994267582893372, "step": 3966, "train_speed(iter/s)": 0.304584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 293.5, "completions/min_length": 241.0, "epoch": 3.281224152191894, "grad_norm": 0.1843651533126831, "kl": 0.04937744140625, "learning_rate": 3.188407860486983e-06, "loss": 0.0004927739500999451, "memory(GiB)": 38.09, "reward": 0.7045295238494873, "reward_std": 0.03121776133775711, "rewards/VisualizationJSONCombinedORM/mean": 0.7045295238494873, "rewards/VisualizationJSONCombinedORM/std": 0.06668943166732788, "step": 3967, "train_speed(iter/s)": 0.303964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 316.125, "completions/min_length": 253.0, "epoch": 3.282051282051282, "grad_norm": 0.16366450488567352, "kl": 0.0645751953125, "learning_rate": 3.1857168620112145e-06, "loss": 0.000645703636109829, "memory(GiB)": 38.09, "reward": 0.6788628101348877, "reward_std": 0.0549992173910141, "rewards/VisualizationJSONCombinedORM/mean": 0.6788628101348877, "rewards/VisualizationJSONCombinedORM/std": 0.06058952212333679, "step": 3968, "train_speed(iter/s)": 0.303385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 306.3125, "completions/min_length": 236.0, "epoch": 3.28287841191067, "grad_norm": 0.18395952880382538, "kl": 0.0751953125, "learning_rate": 3.183026468607362e-06, "loss": 0.000751035287976265, "memory(GiB)": 38.09, "reward": 0.30031174421310425, "reward_std": 0.030155180022120476, "rewards/VisualizationJSONCombinedORM/mean": 0.30031174421310425, "rewards/VisualizationJSONCombinedORM/std": 0.07628590613603592, "step": 3969, "train_speed(iter/s)": 0.302847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 273.75, "completions/min_length": 223.0, "epoch": 3.283705541770058, "grad_norm": 0.17699159681797028, "kl": 0.059814453125, "learning_rate": 3.180336681172691e-06, "loss": 0.0005991309881210327, "memory(GiB)": 38.09, "reward": 0.37232479453086853, "reward_std": 0.03504911810159683, "rewards/VisualizationJSONCombinedORM/mean": 0.37232479453086853, "rewards/VisualizationJSONCombinedORM/std": 0.0342223159968853, "step": 3970, "train_speed(iter/s)": 0.302296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 312.3125, "completions/min_length": 235.0, "epoch": 3.284532671629446, "grad_norm": 0.17346230149269104, "kl": 0.073486328125, "learning_rate": 3.177647500604252e-06, "loss": 0.000733640044927597, "memory(GiB)": 38.09, "reward": 0.5589641332626343, "reward_std": 0.05206238478422165, "rewards/VisualizationJSONCombinedORM/mean": 0.5589641332626343, "rewards/VisualizationJSONCombinedORM/std": 0.1542573720216751, "step": 3971, "train_speed(iter/s)": 0.301757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 296.625, "completions/min_length": 223.0, "epoch": 3.2853598014888337, "grad_norm": 0.24880775809288025, "kl": 0.074462890625, "learning_rate": 3.1749589277989036e-06, "loss": 0.0007424727082252502, "memory(GiB)": 38.09, "reward": 0.343270868062973, "reward_std": 0.0511925145983696, "rewards/VisualizationJSONCombinedORM/mean": 0.343270868062973, "rewards/VisualizationJSONCombinedORM/std": 0.0900239646434784, "step": 3972, "train_speed(iter/s)": 0.301257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 307.0, "completions/min_length": 234.0, "epoch": 3.2861869313482215, "grad_norm": 0.21667933464050293, "kl": 0.0625, "learning_rate": 3.1722709636532944e-06, "loss": 0.0006264597177505493, "memory(GiB)": 38.09, "reward": 0.41398298740386963, "reward_std": 0.05061054974794388, "rewards/VisualizationJSONCombinedORM/mean": 0.41398298740386963, "rewards/VisualizationJSONCombinedORM/std": 0.08608372509479523, "step": 3973, "train_speed(iter/s)": 0.300637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 288.4375, "completions/min_length": 244.0, "epoch": 3.2870140612076097, "grad_norm": 0.22127652168273926, "kl": 0.0928955078125, "learning_rate": 3.169583609063876e-06, "loss": 0.0009283944964408875, "memory(GiB)": 38.09, "reward": 0.7391672730445862, "reward_std": 0.08444677293300629, "rewards/VisualizationJSONCombinedORM/mean": 0.7391672730445862, "rewards/VisualizationJSONCombinedORM/std": 0.08262253552675247, "step": 3974, "train_speed(iter/s)": 0.300158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 292.125, "completions/min_length": 236.0, "epoch": 3.2878411910669976, "grad_norm": 0.1822565644979477, "kl": 0.06402587890625, "learning_rate": 3.1668968649268905e-06, "loss": 0.0006425194442272186, "memory(GiB)": 38.09, "reward": 0.5751308798789978, "reward_std": 0.08253289014101028, "rewards/VisualizationJSONCombinedORM/mean": 0.5751308798789978, "rewards/VisualizationJSONCombinedORM/std": 0.1559898853302002, "step": 3975, "train_speed(iter/s)": 0.299539 }, { "epoch": 3.2878411910669976, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 344.6666666666667, "eval_completions/mean_length": 291.7552083333333, "eval_completions/min_length": 245.625, "eval_kl": 0.08478291829427083, "eval_loss": 0.0008420906960964203, "eval_reward": 0.4598672886689504, "eval_reward_std": 0.053521849564276636, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4598672886689504, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05352185022396346, "eval_runtime": 298.5008, "eval_samples_per_second": 0.08, "eval_steps_per_second": 0.01, "step": 3975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 288.875, "completions/min_length": 235.0, "epoch": 3.2886683209263854, "grad_norm": 0.26259666681289673, "kl": 0.0784912109375, "learning_rate": 3.164210732138383e-06, "loss": 0.0007858742028474808, "memory(GiB)": 38.09, "reward": 0.38829129934310913, "reward_std": 0.049275510013103485, "rewards/VisualizationJSONCombinedORM/mean": 0.38829129934310913, "rewards/VisualizationJSONCombinedORM/std": 0.051833752542734146, "step": 3976, "train_speed(iter/s)": 0.292468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 303.8125, "completions/min_length": 243.0, "epoch": 3.289495450785773, "grad_norm": 0.25321170687675476, "kl": 0.0615234375, "learning_rate": 3.161525211594187e-06, "loss": 0.0006160363554954529, "memory(GiB)": 38.09, "reward": 0.5599541664123535, "reward_std": 0.0725308358669281, "rewards/VisualizationJSONCombinedORM/mean": 0.5599541664123535, "rewards/VisualizationJSONCombinedORM/std": 0.09843015670776367, "step": 3977, "train_speed(iter/s)": 0.291999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 288.8125, "completions/min_length": 222.0, "epoch": 3.2903225806451615, "grad_norm": 0.203339084982872, "kl": 0.0877685546875, "learning_rate": 3.158840304189942e-06, "loss": 0.0008762180805206299, "memory(GiB)": 38.09, "reward": 0.5186691880226135, "reward_std": 0.06122313439846039, "rewards/VisualizationJSONCombinedORM/mean": 0.5186691880226135, "rewards/VisualizationJSONCombinedORM/std": 0.08658955991268158, "step": 3978, "train_speed(iter/s)": 0.291465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 324.8125, "completions/min_length": 268.0, "epoch": 3.2911497105045493, "grad_norm": 0.2072424441576004, "kl": 0.04644775390625, "learning_rate": 3.156156010821071e-06, "loss": 0.00046502798795700073, "memory(GiB)": 38.09, "reward": 0.643278956413269, "reward_std": 0.09749723225831985, "rewards/VisualizationJSONCombinedORM/mean": 0.643278956413269, "rewards/VisualizationJSONCombinedORM/std": 0.181808203458786, "step": 3979, "train_speed(iter/s)": 0.290995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 307.5625, "completions/min_length": 249.0, "epoch": 3.291976840363937, "grad_norm": 0.2017517387866974, "kl": 0.1138916015625, "learning_rate": 3.153472332382803e-06, "loss": 0.0011399239301681519, "memory(GiB)": 38.09, "reward": 0.5894772410392761, "reward_std": 0.057474445551633835, "rewards/VisualizationJSONCombinedORM/mean": 0.5894772410392761, "rewards/VisualizationJSONCombinedORM/std": 0.20131109654903412, "step": 3980, "train_speed(iter/s)": 0.290554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 275.6875, "completions/min_length": 231.0, "epoch": 3.292803970223325, "grad_norm": 0.18339575827121735, "kl": 0.0576171875, "learning_rate": 3.150789269770155e-06, "loss": 0.0005745962262153625, "memory(GiB)": 38.09, "reward": 0.7053423523902893, "reward_std": 0.05558011680841446, "rewards/VisualizationJSONCombinedORM/mean": 0.7053423523902893, "rewards/VisualizationJSONCombinedORM/std": 0.10854770243167877, "step": 3981, "train_speed(iter/s)": 0.290139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 310.125, "completions/min_length": 269.0, "epoch": 3.293631100082713, "grad_norm": 0.20076106488704681, "kl": 0.0421142578125, "learning_rate": 3.1481068238779423e-06, "loss": 0.00042037665843963623, "memory(GiB)": 38.09, "reward": 0.4740501344203949, "reward_std": 0.04207457974553108, "rewards/VisualizationJSONCombinedORM/mean": 0.4740501344203949, "rewards/VisualizationJSONCombinedORM/std": 0.05858359485864639, "step": 3982, "train_speed(iter/s)": 0.289725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 282.1875, "completions/min_length": 230.0, "epoch": 3.294458229942101, "grad_norm": 0.22148282825946808, "kl": 0.1407470703125, "learning_rate": 3.1454249956007722e-06, "loss": 0.0014073364436626434, "memory(GiB)": 38.09, "reward": 0.35738396644592285, "reward_std": 0.0538875088095665, "rewards/VisualizationJSONCombinedORM/mean": 0.35738396644592285, "rewards/VisualizationJSONCombinedORM/std": 0.11784439533948898, "step": 3983, "train_speed(iter/s)": 0.289307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 298.5, "completions/min_length": 244.0, "epoch": 3.295285359801489, "grad_norm": 0.1991662234067917, "kl": 0.0814208984375, "learning_rate": 3.142743785833048e-06, "loss": 0.0008164122700691223, "memory(GiB)": 38.09, "reward": 0.695579469203949, "reward_std": 0.07635213434696198, "rewards/VisualizationJSONCombinedORM/mean": 0.695579469203949, "rewards/VisualizationJSONCombinedORM/std": 0.11386014521121979, "step": 3984, "train_speed(iter/s)": 0.288751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 284.1875, "completions/min_length": 229.0, "epoch": 3.2961124896608767, "grad_norm": 0.24539661407470703, "kl": 0.08099365234375, "learning_rate": 3.1400631954689626e-06, "loss": 0.0008106082677841187, "memory(GiB)": 38.09, "reward": 0.29946818947792053, "reward_std": 0.026067513972520828, "rewards/VisualizationJSONCombinedORM/mean": 0.29946818947792053, "rewards/VisualizationJSONCombinedORM/std": 0.06408822536468506, "step": 3985, "train_speed(iter/s)": 0.288252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 275.3125, "completions/min_length": 228.0, "epoch": 3.2969396195202645, "grad_norm": 0.22711001336574554, "kl": 0.08819580078125, "learning_rate": 3.137383225402511e-06, "loss": 0.0008834153413772583, "memory(GiB)": 38.09, "reward": 0.387292742729187, "reward_std": 0.0745212510228157, "rewards/VisualizationJSONCombinedORM/mean": 0.387292742729187, "rewards/VisualizationJSONCombinedORM/std": 0.08224155753850937, "step": 3986, "train_speed(iter/s)": 0.287796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 304.9375, "completions/min_length": 247.0, "epoch": 3.2977667493796528, "grad_norm": 0.2965027987957001, "kl": 0.197021484375, "learning_rate": 3.1347038765274693e-06, "loss": 0.0019714906811714172, "memory(GiB)": 38.09, "reward": 0.5121723413467407, "reward_std": 0.04881317913532257, "rewards/VisualizationJSONCombinedORM/mean": 0.5121723413467407, "rewards/VisualizationJSONCombinedORM/std": 0.26123547554016113, "step": 3987, "train_speed(iter/s)": 0.287322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 276.4375, "completions/min_length": 215.0, "epoch": 3.2985938792390406, "grad_norm": 0.24835216999053955, "kl": 0.1339111328125, "learning_rate": 3.1320251497374187e-06, "loss": 0.0013371780514717102, "memory(GiB)": 38.09, "reward": 0.736103355884552, "reward_std": 0.06832930445671082, "rewards/VisualizationJSONCombinedORM/mean": 0.736103355884552, "rewards/VisualizationJSONCombinedORM/std": 0.0675593912601471, "step": 3988, "train_speed(iter/s)": 0.28678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 318.25, "completions/min_length": 237.0, "epoch": 3.2994210090984284, "grad_norm": 0.19466812908649445, "kl": 0.1153564453125, "learning_rate": 3.1293470459257237e-06, "loss": 0.00115291029214859, "memory(GiB)": 38.09, "reward": 0.3781820833683014, "reward_std": 0.03875163197517395, "rewards/VisualizationJSONCombinedORM/mean": 0.3781820833683014, "rewards/VisualizationJSONCombinedORM/std": 0.07156548649072647, "step": 3989, "train_speed(iter/s)": 0.286311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 297.8125, "completions/min_length": 242.0, "epoch": 3.300248138957816, "grad_norm": 0.17960967123508453, "kl": 0.0401611328125, "learning_rate": 3.1266695659855462e-06, "loss": 0.00040096044540405273, "memory(GiB)": 38.09, "reward": 0.45378804206848145, "reward_std": 0.05383164808154106, "rewards/VisualizationJSONCombinedORM/mean": 0.45378804206848145, "rewards/VisualizationJSONCombinedORM/std": 0.1815662831068039, "step": 3990, "train_speed(iter/s)": 0.285845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 314.9375, "completions/min_length": 248.0, "epoch": 3.3010752688172045, "grad_norm": 0.1868135780096054, "kl": 0.05078125, "learning_rate": 3.123992710809837e-06, "loss": 0.0005079619586467743, "memory(GiB)": 38.09, "reward": 0.6995381116867065, "reward_std": 0.09082794189453125, "rewards/VisualizationJSONCombinedORM/mean": 0.6995381116867065, "rewards/VisualizationJSONCombinedORM/std": 0.1339421421289444, "step": 3991, "train_speed(iter/s)": 0.285377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 294.75, "completions/min_length": 230.0, "epoch": 3.3019023986765923, "grad_norm": 0.16199390590190887, "kl": 0.06256103515625, "learning_rate": 3.12131648129134e-06, "loss": 0.0006272047758102417, "memory(GiB)": 38.09, "reward": 0.5166395902633667, "reward_std": 0.04426468908786774, "rewards/VisualizationJSONCombinedORM/mean": 0.5166395902633667, "rewards/VisualizationJSONCombinedORM/std": 0.04880698397755623, "step": 3992, "train_speed(iter/s)": 0.284886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 293.9375, "completions/min_length": 240.0, "epoch": 3.30272952853598, "grad_norm": 0.2038649469614029, "kl": 0.171142578125, "learning_rate": 3.118640878322589e-06, "loss": 0.0017105303704738617, "memory(GiB)": 38.09, "reward": 0.45634716749191284, "reward_std": 0.05698540061712265, "rewards/VisualizationJSONCombinedORM/mean": 0.45634716749191284, "rewards/VisualizationJSONCombinedORM/std": 0.16309598088264465, "step": 3993, "train_speed(iter/s)": 0.28435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 298.5625, "completions/min_length": 217.0, "epoch": 3.303556658395368, "grad_norm": 0.306900292634964, "kl": 0.074951171875, "learning_rate": 3.115965902795915e-06, "loss": 0.0007479563355445862, "memory(GiB)": 38.09, "reward": 0.42587268352508545, "reward_std": 0.06476490944623947, "rewards/VisualizationJSONCombinedORM/mean": 0.42587268352508545, "rewards/VisualizationJSONCombinedORM/std": 0.09080683439970016, "step": 3994, "train_speed(iter/s)": 0.283849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 270.4375, "completions/min_length": 220.0, "epoch": 3.304383788254756, "grad_norm": 0.16470450162887573, "kl": 0.083740234375, "learning_rate": 3.1132915556034283e-06, "loss": 0.0008384212851524353, "memory(GiB)": 38.09, "reward": 0.6069722175598145, "reward_std": 0.034323081374168396, "rewards/VisualizationJSONCombinedORM/mean": 0.6069722175598145, "rewards/VisualizationJSONCombinedORM/std": 0.16835683584213257, "step": 3995, "train_speed(iter/s)": 0.283526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 279.875, "completions/min_length": 233.0, "epoch": 3.305210918114144, "grad_norm": 0.1498551368713379, "kl": 0.1343994140625, "learning_rate": 3.1106178376370418e-06, "loss": 0.001340758055448532, "memory(GiB)": 38.09, "reward": 0.7727242708206177, "reward_std": 0.059518154710531235, "rewards/VisualizationJSONCombinedORM/mean": 0.7727242708206177, "rewards/VisualizationJSONCombinedORM/std": 0.05978852137923241, "step": 3996, "train_speed(iter/s)": 0.283123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 298.1875, "completions/min_length": 248.0, "epoch": 3.306038047973532, "grad_norm": 0.2978217601776123, "kl": 0.0823974609375, "learning_rate": 3.107944749788449e-06, "loss": 0.0008244514465332031, "memory(GiB)": 38.09, "reward": 0.526823103427887, "reward_std": 0.0752708837389946, "rewards/VisualizationJSONCombinedORM/mean": 0.526823103427887, "rewards/VisualizationJSONCombinedORM/std": 0.14576174318790436, "step": 3997, "train_speed(iter/s)": 0.282738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 297.9375, "completions/min_length": 250.0, "epoch": 3.3068651778329197, "grad_norm": 0.16873039305210114, "kl": 0.1175537109375, "learning_rate": 3.1052722929491402e-06, "loss": 0.001176290214061737, "memory(GiB)": 38.09, "reward": 0.42091405391693115, "reward_std": 0.04056989401578903, "rewards/VisualizationJSONCombinedORM/mean": 0.42091405391693115, "rewards/VisualizationJSONCombinedORM/std": 0.06256043910980225, "step": 3998, "train_speed(iter/s)": 0.28229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 282.6875, "completions/min_length": 237.0, "epoch": 3.3076923076923075, "grad_norm": 0.20841464400291443, "kl": 0.14404296875, "learning_rate": 3.10260046801039e-06, "loss": 0.0014407560229301453, "memory(GiB)": 38.09, "reward": 0.5861772298812866, "reward_std": 0.07885368168354034, "rewards/VisualizationJSONCombinedORM/mean": 0.5861772298812866, "rewards/VisualizationJSONCombinedORM/std": 0.13858865201473236, "step": 3999, "train_speed(iter/s)": 0.281869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 258.0, "completions/min_length": 199.0, "epoch": 3.3085194375516958, "grad_norm": 0.2676696479320526, "kl": 0.0469970703125, "learning_rate": 3.099929275863266e-06, "loss": 0.00047064945101737976, "memory(GiB)": 38.09, "reward": 0.47390255331993103, "reward_std": 0.07259955257177353, "rewards/VisualizationJSONCombinedORM/mean": 0.47390255331993103, "rewards/VisualizationJSONCombinedORM/std": 0.09985706210136414, "step": 4000, "train_speed(iter/s)": 0.281523 }, { "epoch": 3.3085194375516958, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 331.9166666666667, "eval_completions/mean_length": 280.8177083333333, "eval_completions/min_length": 238.70833333333334, "eval_kl": 0.09200541178385417, "eval_loss": 0.0009303788538090885, "eval_reward": 0.4755833347638448, "eval_reward_std": 0.056913680532791965, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4755833347638448, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05691368084323282, "eval_runtime": 290.5596, "eval_samples_per_second": 0.083, "eval_steps_per_second": 0.01, "step": 4000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 271.8125, "completions/min_length": 228.0, "epoch": 3.3093465674110836, "grad_norm": 0.19647328555583954, "kl": 0.2340087890625, "learning_rate": 3.0972587173986214e-06, "loss": 0.0023403018712997437, "memory(GiB)": 38.09, "reward": 0.38921332359313965, "reward_std": 0.04369144141674042, "rewards/VisualizationJSONCombinedORM/mean": 0.38921332359313965, "rewards/VisualizationJSONCombinedORM/std": 0.1208961084485054, "step": 4001, "train_speed(iter/s)": 0.275464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 303.0625, "completions/min_length": 252.0, "epoch": 3.3101736972704714, "grad_norm": 0.16559264063835144, "kl": 0.10430908203125, "learning_rate": 3.094588793507103e-06, "loss": 0.00104513019323349, "memory(GiB)": 38.09, "reward": 0.6241159439086914, "reward_std": 0.034466613084077835, "rewards/VisualizationJSONCombinedORM/mean": 0.6241159439086914, "rewards/VisualizationJSONCombinedORM/std": 0.2201050966978073, "step": 4002, "train_speed(iter/s)": 0.275065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 288.125, "completions/min_length": 220.0, "epoch": 3.311000827129859, "grad_norm": 0.1839503049850464, "kl": 0.05755615234375, "learning_rate": 3.091919505079139e-06, "loss": 0.0005754493176937103, "memory(GiB)": 38.09, "reward": 0.534697949886322, "reward_std": 0.0566951148211956, "rewards/VisualizationJSONCombinedORM/mean": 0.534697949886322, "rewards/VisualizationJSONCombinedORM/std": 0.05606257542967796, "step": 4003, "train_speed(iter/s)": 0.274607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 280.375, "completions/min_length": 225.0, "epoch": 3.3118279569892475, "grad_norm": 0.21451763808727264, "kl": 0.064910888671875, "learning_rate": 3.0892508530049536e-06, "loss": 0.0006480589509010315, "memory(GiB)": 38.09, "reward": 0.5230978727340698, "reward_std": 0.06538863480091095, "rewards/VisualizationJSONCombinedORM/mean": 0.5230978727340698, "rewards/VisualizationJSONCombinedORM/std": 0.16877606511116028, "step": 4004, "train_speed(iter/s)": 0.274224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 292.125, "completions/min_length": 233.0, "epoch": 3.3126550868486353, "grad_norm": 0.2175723910331726, "kl": 0.059814453125, "learning_rate": 3.0865828381745515e-06, "loss": 0.0005972906947135925, "memory(GiB)": 38.09, "reward": 0.312159925699234, "reward_std": 0.033438488841056824, "rewards/VisualizationJSONCombinedORM/mean": 0.312159925699234, "rewards/VisualizationJSONCombinedORM/std": 0.07173339277505875, "step": 4005, "train_speed(iter/s)": 0.273824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 309.625, "completions/min_length": 262.0, "epoch": 3.313482216708023, "grad_norm": 0.15636098384857178, "kl": 0.1190185546875, "learning_rate": 3.08391546147773e-06, "loss": 0.0011933352798223495, "memory(GiB)": 38.09, "reward": 0.6582210063934326, "reward_std": 0.05713111162185669, "rewards/VisualizationJSONCombinedORM/mean": 0.6582210063934326, "rewards/VisualizationJSONCombinedORM/std": 0.16060461103916168, "step": 4006, "train_speed(iter/s)": 0.273429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 260.0, "completions/min_length": 207.0, "epoch": 3.314309346567411, "grad_norm": 0.1806773692369461, "kl": 0.1058349609375, "learning_rate": 3.0812487238040723e-06, "loss": 0.0010558776557445526, "memory(GiB)": 38.09, "reward": 0.6767933368682861, "reward_std": 0.04915473610162735, "rewards/VisualizationJSONCombinedORM/mean": 0.6767933368682861, "rewards/VisualizationJSONCombinedORM/std": 0.14099082350730896, "step": 4007, "train_speed(iter/s)": 0.272912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 304.75, "completions/min_length": 209.0, "epoch": 3.315136476426799, "grad_norm": 0.17555338144302368, "kl": 0.1109619140625, "learning_rate": 3.078582626042944e-06, "loss": 0.0011100992560386658, "memory(GiB)": 38.09, "reward": 0.5540438890457153, "reward_std": 0.06683548539876938, "rewards/VisualizationJSONCombinedORM/mean": 0.5540438890457153, "rewards/VisualizationJSONCombinedORM/std": 0.0969911515712738, "step": 4008, "train_speed(iter/s)": 0.272448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 310.6875, "completions/min_length": 256.0, "epoch": 3.315963606286187, "grad_norm": 0.2704947590827942, "kl": 0.080322265625, "learning_rate": 3.075917169083508e-06, "loss": 0.000805538147687912, "memory(GiB)": 38.09, "reward": 0.62488853931427, "reward_std": 0.09525981545448303, "rewards/VisualizationJSONCombinedORM/mean": 0.62488853931427, "rewards/VisualizationJSONCombinedORM/std": 0.17629167437553406, "step": 4009, "train_speed(iter/s)": 0.272032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 286.5625, "completions/min_length": 216.0, "epoch": 3.316790736145575, "grad_norm": 0.17294469475746155, "kl": 0.029815673828125, "learning_rate": 3.0732523538146997e-06, "loss": 0.0002979077398777008, "memory(GiB)": 38.09, "reward": 0.44456928968429565, "reward_std": 0.020075254142284393, "rewards/VisualizationJSONCombinedORM/mean": 0.44456928968429565, "rewards/VisualizationJSONCombinedORM/std": 0.020312342792749405, "step": 4010, "train_speed(iter/s)": 0.271587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 293.5, "completions/min_length": 230.0, "epoch": 3.3176178660049627, "grad_norm": 0.1789271980524063, "kl": 0.0830078125, "learning_rate": 3.070588181125252e-06, "loss": 0.0008295439183712006, "memory(GiB)": 38.09, "reward": 0.5953022241592407, "reward_std": 0.05947883427143097, "rewards/VisualizationJSONCombinedORM/mean": 0.5953022241592407, "rewards/VisualizationJSONCombinedORM/std": 0.1610180139541626, "step": 4011, "train_speed(iter/s)": 0.27114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 281.1875, "completions/min_length": 231.0, "epoch": 3.3184449958643505, "grad_norm": 0.1812598556280136, "kl": 0.094970703125, "learning_rate": 3.0679246519036766e-06, "loss": 0.0009510032832622528, "memory(GiB)": 38.09, "reward": 0.5533812046051025, "reward_std": 0.06107168644666672, "rewards/VisualizationJSONCombinedORM/mean": 0.5533812046051025, "rewards/VisualizationJSONCombinedORM/std": 0.10956496000289917, "step": 4012, "train_speed(iter/s)": 0.270766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 275.1875, "completions/min_length": 224.0, "epoch": 3.3192721257237388, "grad_norm": 0.19643433392047882, "kl": 0.083251953125, "learning_rate": 3.0652617670382745e-06, "loss": 0.0008333548903465271, "memory(GiB)": 38.09, "reward": 0.5459100604057312, "reward_std": 0.07647325843572617, "rewards/VisualizationJSONCombinedORM/mean": 0.5459100604057312, "rewards/VisualizationJSONCombinedORM/std": 0.1556822657585144, "step": 4013, "train_speed(iter/s)": 0.27041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 299.1875, "completions/min_length": 259.0, "epoch": 3.3200992555831266, "grad_norm": 0.1821141093969345, "kl": 0.0987548828125, "learning_rate": 3.0625995274171284e-06, "loss": 0.0009870119392871857, "memory(GiB)": 38.09, "reward": 0.5041345357894897, "reward_std": 0.059489790350198746, "rewards/VisualizationJSONCombinedORM/mean": 0.5041345357894897, "rewards/VisualizationJSONCombinedORM/std": 0.09265117347240448, "step": 4014, "train_speed(iter/s)": 0.269992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 282.9375, "completions/min_length": 221.0, "epoch": 3.3209263854425144, "grad_norm": 0.16719314455986023, "kl": 0.028228759765625, "learning_rate": 3.05993793392811e-06, "loss": 0.00028257817029953003, "memory(GiB)": 38.09, "reward": 0.6114544868469238, "reward_std": 0.06169654056429863, "rewards/VisualizationJSONCombinedORM/mean": 0.6114544868469238, "rewards/VisualizationJSONCombinedORM/std": 0.06082776561379433, "step": 4015, "train_speed(iter/s)": 0.269539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 279.9375, "completions/min_length": 236.0, "epoch": 3.321753515301902, "grad_norm": 0.19474543631076813, "kl": 0.06939697265625, "learning_rate": 3.0572769874588704e-06, "loss": 0.0006938017904758453, "memory(GiB)": 38.09, "reward": 0.590755820274353, "reward_std": 0.08534915745258331, "rewards/VisualizationJSONCombinedORM/mean": 0.590755820274353, "rewards/VisualizationJSONCombinedORM/std": 0.12293103337287903, "step": 4016, "train_speed(iter/s)": 0.269191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 275.8125, "completions/min_length": 221.0, "epoch": 3.3225806451612905, "grad_norm": 0.17289352416992188, "kl": 0.03204345703125, "learning_rate": 3.054616688896852e-06, "loss": 0.00032095611095428467, "memory(GiB)": 38.09, "reward": 0.5983919501304626, "reward_std": 0.047699134796857834, "rewards/VisualizationJSONCombinedORM/mean": 0.5983919501304626, "rewards/VisualizationJSONCombinedORM/std": 0.24227376282215118, "step": 4017, "train_speed(iter/s)": 0.268805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 291.0, "completions/min_length": 238.0, "epoch": 3.3234077750206783, "grad_norm": 0.20732435584068298, "kl": 0.05450439453125, "learning_rate": 3.0519570391292715e-06, "loss": 0.0005449838936328888, "memory(GiB)": 38.09, "reward": 0.48859596252441406, "reward_std": 0.05492164194583893, "rewards/VisualizationJSONCombinedORM/mean": 0.48859596252441406, "rewards/VisualizationJSONCombinedORM/std": 0.05681965500116348, "step": 4018, "train_speed(iter/s)": 0.268388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 290.9375, "completions/min_length": 245.0, "epoch": 3.324234904880066, "grad_norm": 0.16686584055423737, "kl": 0.0465087890625, "learning_rate": 3.049298039043139e-06, "loss": 0.0004649609327316284, "memory(GiB)": 38.09, "reward": 0.6640278697013855, "reward_std": 0.05288020521402359, "rewards/VisualizationJSONCombinedORM/mean": 0.6640278697013855, "rewards/VisualizationJSONCombinedORM/std": 0.13575883209705353, "step": 4019, "train_speed(iter/s)": 0.268023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 288.6875, "completions/min_length": 242.0, "epoch": 3.325062034739454, "grad_norm": 0.18039704859256744, "kl": 0.0963134765625, "learning_rate": 3.0466396895252405e-06, "loss": 0.0009629391133785248, "memory(GiB)": 38.09, "reward": 0.5467590093612671, "reward_std": 0.02847887948155403, "rewards/VisualizationJSONCombinedORM/mean": 0.5467590093612671, "rewards/VisualizationJSONCombinedORM/std": 0.17396928369998932, "step": 4020, "train_speed(iter/s)": 0.267683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/mean_length": 249.0, "completions/min_length": 231.0, "epoch": 3.325889164598842, "grad_norm": 0.17057685554027557, "kl": 0.077392578125, "learning_rate": 3.04398199146215e-06, "loss": 0.0007755458354949951, "memory(GiB)": 38.09, "reward": 0.4643450975418091, "reward_std": 0.04779747128486633, "rewards/VisualizationJSONCombinedORM/mean": 0.4643450975418091, "rewards/VisualizationJSONCombinedORM/std": 0.23405252397060394, "step": 4021, "train_speed(iter/s)": 0.267323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 285.5, "completions/min_length": 249.0, "epoch": 3.32671629445823, "grad_norm": 0.19455625116825104, "kl": 0.091552734375, "learning_rate": 3.0413249457402206e-06, "loss": 0.0009158886969089508, "memory(GiB)": 38.09, "reward": 0.652021050453186, "reward_std": 0.06864041090011597, "rewards/VisualizationJSONCombinedORM/mean": 0.652021050453186, "rewards/VisualizationJSONCombinedORM/std": 0.10518299788236618, "step": 4022, "train_speed(iter/s)": 0.267022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 297.875, "completions/min_length": 252.0, "epoch": 3.327543424317618, "grad_norm": 0.2416379749774933, "kl": 0.05645751953125, "learning_rate": 3.0386685532455913e-06, "loss": 0.0005641058087348938, "memory(GiB)": 38.09, "reward": 0.5624234676361084, "reward_std": 0.07616981863975525, "rewards/VisualizationJSONCombinedORM/mean": 0.5624234676361084, "rewards/VisualizationJSONCombinedORM/std": 0.2369266003370285, "step": 4023, "train_speed(iter/s)": 0.266726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 304.9375, "completions/min_length": 244.0, "epoch": 3.3283705541770057, "grad_norm": 0.17269982397556305, "kl": 0.03631591796875, "learning_rate": 3.036012814864178e-06, "loss": 0.000361807644367218, "memory(GiB)": 38.09, "reward": 0.5086970329284668, "reward_std": 0.04534338414669037, "rewards/VisualizationJSONCombinedORM/mean": 0.5086970329284668, "rewards/VisualizationJSONCombinedORM/std": 0.05696683004498482, "step": 4024, "train_speed(iter/s)": 0.266273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 307.3125, "completions/min_length": 238.0, "epoch": 3.3291976840363935, "grad_norm": 0.19517816603183746, "kl": 0.089599609375, "learning_rate": 3.0333577314816883e-06, "loss": 0.0008967705070972443, "memory(GiB)": 38.09, "reward": 0.6944770812988281, "reward_std": 0.07462182641029358, "rewards/VisualizationJSONCombinedORM/mean": 0.6944770812988281, "rewards/VisualizationJSONCombinedORM/std": 0.10913510620594025, "step": 4025, "train_speed(iter/s)": 0.265906 }, { "epoch": 3.3291976840363935, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 352.75, "eval_completions/mean_length": 293.8802083333333, "eval_completions/min_length": 243.20833333333334, "eval_kl": 0.07100423177083333, "eval_loss": 0.0007188369636423886, "eval_reward": 0.4411891649166743, "eval_reward_std": 0.049621820488634207, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4411891649166743, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0496218199065576, "eval_runtime": 303.4462, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 4025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 303.6875, "completions/min_length": 251.0, "epoch": 3.3300248138957818, "grad_norm": 0.18045887351036072, "kl": 0.09326171875, "learning_rate": 3.030703303983597e-06, "loss": 0.000933976611122489, "memory(GiB)": 38.09, "reward": 0.21359992027282715, "reward_std": 0.007037741132080555, "rewards/VisualizationJSONCombinedORM/mean": 0.21359992027282715, "rewards/VisualizationJSONCombinedORM/std": 0.026910552754998207, "step": 4026, "train_speed(iter/s)": 0.26025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 288.75, "completions/min_length": 230.0, "epoch": 3.3308519437551696, "grad_norm": 0.24772870540618896, "kl": 0.0823974609375, "learning_rate": 3.0280495332551747e-06, "loss": 0.0008262470364570618, "memory(GiB)": 38.09, "reward": 0.45658132433891296, "reward_std": 0.08986666053533554, "rewards/VisualizationJSONCombinedORM/mean": 0.45658132433891296, "rewards/VisualizationJSONCombinedORM/std": 0.08785786479711533, "step": 4027, "train_speed(iter/s)": 0.259837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 321.0, "completions/min_length": 249.0, "epoch": 3.3316790736145574, "grad_norm": 0.2050844132900238, "kl": 0.0799560546875, "learning_rate": 3.0253964201814624e-06, "loss": 0.0007997788488864899, "memory(GiB)": 38.09, "reward": 0.6761458516120911, "reward_std": 0.08687393367290497, "rewards/VisualizationJSONCombinedORM/mean": 0.6761458516120911, "rewards/VisualizationJSONCombinedORM/std": 0.08799024671316147, "step": 4028, "train_speed(iter/s)": 0.259406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 292.75, "completions/min_length": 242.0, "epoch": 3.332506203473945, "grad_norm": 0.1742820292711258, "kl": 0.067138671875, "learning_rate": 3.0227439656472878e-06, "loss": 0.0006712377071380615, "memory(GiB)": 38.09, "reward": 0.5136979818344116, "reward_std": 0.0311189666390419, "rewards/VisualizationJSONCombinedORM/mean": 0.5136979818344116, "rewards/VisualizationJSONCombinedORM/std": 0.07600753754377365, "step": 4029, "train_speed(iter/s)": 0.25905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 294.125, "completions/min_length": 239.0, "epoch": 3.3333333333333335, "grad_norm": 0.20028752088546753, "kl": 0.1405029296875, "learning_rate": 3.0200921705372555e-06, "loss": 0.001407139003276825, "memory(GiB)": 38.09, "reward": 0.4945172667503357, "reward_std": 0.05224255472421646, "rewards/VisualizationJSONCombinedORM/mean": 0.4945172667503357, "rewards/VisualizationJSONCombinedORM/std": 0.13014152646064758, "step": 4030, "train_speed(iter/s)": 0.258614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 311.6875, "completions/min_length": 231.0, "epoch": 3.3341604631927213, "grad_norm": 0.2484455704689026, "kl": 0.06256103515625, "learning_rate": 3.017441035735753e-06, "loss": 0.0006260313093662262, "memory(GiB)": 38.09, "reward": 0.5553252696990967, "reward_std": 0.07024893164634705, "rewards/VisualizationJSONCombinedORM/mean": 0.5553252696990967, "rewards/VisualizationJSONCombinedORM/std": 0.20633742213249207, "step": 4031, "train_speed(iter/s)": 0.258235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 290.8125, "completions/min_length": 240.0, "epoch": 3.334987593052109, "grad_norm": 0.2305736094713211, "kl": 0.0693359375, "learning_rate": 3.0147905621269433e-06, "loss": 0.0006929486989974976, "memory(GiB)": 38.09, "reward": 0.597606897354126, "reward_std": 0.05875676870346069, "rewards/VisualizationJSONCombinedORM/mean": 0.597606897354126, "rewards/VisualizationJSONCombinedORM/std": 0.21139657497406006, "step": 4032, "train_speed(iter/s)": 0.257903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 286.4375, "completions/min_length": 242.0, "epoch": 3.335814722911497, "grad_norm": 0.16903407871723175, "kl": 0.08544921875, "learning_rate": 3.0121407505947774e-06, "loss": 0.0008539482951164246, "memory(GiB)": 38.09, "reward": 0.4178480803966522, "reward_std": 0.0335489958524704, "rewards/VisualizationJSONCombinedORM/mean": 0.4178480803966522, "rewards/VisualizationJSONCombinedORM/std": 0.2339021861553192, "step": 4033, "train_speed(iter/s)": 0.257469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 323.375, "completions/min_length": 241.0, "epoch": 3.336641852770885, "grad_norm": 0.16626979410648346, "kl": 0.03045654296875, "learning_rate": 3.009491602022973e-06, "loss": 0.0003047138452529907, "memory(GiB)": 38.09, "reward": 0.6832824349403381, "reward_std": 0.042378123849630356, "rewards/VisualizationJSONCombinedORM/mean": 0.6832824349403381, "rewards/VisualizationJSONCombinedORM/std": 0.13669230043888092, "step": 4034, "train_speed(iter/s)": 0.257075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 289.375, "completions/min_length": 215.0, "epoch": 3.337468982630273, "grad_norm": 0.21096952259540558, "kl": 0.1209716796875, "learning_rate": 3.0068431172950387e-06, "loss": 0.001209903508424759, "memory(GiB)": 38.09, "reward": 0.5090683102607727, "reward_std": 0.08488988876342773, "rewards/VisualizationJSONCombinedORM/mean": 0.5090683102607727, "rewards/VisualizationJSONCombinedORM/std": 0.09500442445278168, "step": 4035, "train_speed(iter/s)": 0.256625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 255.5625, "completions/min_length": 209.0, "epoch": 3.338296112489661, "grad_norm": 0.15549547970294952, "kl": 0.04803466796875, "learning_rate": 3.004195297294254e-06, "loss": 0.00048010796308517456, "memory(GiB)": 38.09, "reward": 0.44939255714416504, "reward_std": 0.04499819502234459, "rewards/VisualizationJSONCombinedORM/mean": 0.44939255714416504, "rewards/VisualizationJSONCombinedORM/std": 0.07578355818986893, "step": 4036, "train_speed(iter/s)": 0.256244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 327.625, "completions/min_length": 255.0, "epoch": 3.3391232423490487, "grad_norm": 0.1871822029352188, "kl": 0.04705810546875, "learning_rate": 3.0015481429036807e-06, "loss": 0.0004711523652076721, "memory(GiB)": 38.09, "reward": 0.47622501850128174, "reward_std": 0.04109333083033562, "rewards/VisualizationJSONCombinedORM/mean": 0.47622501850128174, "rewards/VisualizationJSONCombinedORM/std": 0.14258822798728943, "step": 4037, "train_speed(iter/s)": 0.255851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 300.6875, "completions/min_length": 243.0, "epoch": 3.3399503722084365, "grad_norm": 0.23517099022865295, "kl": 0.08746337890625, "learning_rate": 2.9989016550061558e-06, "loss": 0.0008750408887863159, "memory(GiB)": 38.09, "reward": 0.6188271045684814, "reward_std": 0.13905848562717438, "rewards/VisualizationJSONCombinedORM/mean": 0.6188271045684814, "rewards/VisualizationJSONCombinedORM/std": 0.21473610401153564, "step": 4038, "train_speed(iter/s)": 0.255417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 292.4375, "completions/min_length": 203.0, "epoch": 3.3407775020678248, "grad_norm": 0.13571174442768097, "kl": 0.042724609375, "learning_rate": 2.9962558344842963e-06, "loss": 0.0004277396947145462, "memory(GiB)": 38.09, "reward": 0.5385351181030273, "reward_std": 0.06107502430677414, "rewards/VisualizationJSONCombinedORM/mean": 0.5385351181030273, "rewards/VisualizationJSONCombinedORM/std": 0.16231447458267212, "step": 4039, "train_speed(iter/s)": 0.255074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 283.0, "completions/min_length": 210.0, "epoch": 3.3416046319272126, "grad_norm": 0.22284138202667236, "kl": 0.126953125, "learning_rate": 2.9936106822204937e-06, "loss": 0.001274164766073227, "memory(GiB)": 38.09, "reward": 0.5485175848007202, "reward_std": 0.05379968136548996, "rewards/VisualizationJSONCombinedORM/mean": 0.5485175848007202, "rewards/VisualizationJSONCombinedORM/std": 0.25025737285614014, "step": 4040, "train_speed(iter/s)": 0.254738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 267.25, "completions/min_length": 204.0, "epoch": 3.3424317617866004, "grad_norm": 0.18678925931453705, "kl": 0.034698486328125, "learning_rate": 2.990966199096924e-06, "loss": 0.00034765154123306274, "memory(GiB)": 38.09, "reward": 0.6200507879257202, "reward_std": 0.06587866693735123, "rewards/VisualizationJSONCombinedORM/mean": 0.6200507879257202, "rewards/VisualizationJSONCombinedORM/std": 0.06427492946386337, "step": 4041, "train_speed(iter/s)": 0.254418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 281.125, "completions/min_length": 229.0, "epoch": 3.3432588916459887, "grad_norm": 0.22574833035469055, "kl": 0.1044921875, "learning_rate": 2.988322385995527e-06, "loss": 0.0010440051555633545, "memory(GiB)": 38.09, "reward": 0.4831896424293518, "reward_std": 0.03597864881157875, "rewards/VisualizationJSONCombinedORM/mean": 0.4831896424293518, "rewards/VisualizationJSONCombinedORM/std": 0.1041211411356926, "step": 4042, "train_speed(iter/s)": 0.254021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 289.8125, "completions/min_length": 249.0, "epoch": 3.3440860215053765, "grad_norm": 0.21232768893241882, "kl": 0.065185546875, "learning_rate": 2.985679243798034e-06, "loss": 0.0006511565297842026, "memory(GiB)": 38.09, "reward": 0.4736400842666626, "reward_std": 0.06600308418273926, "rewards/VisualizationJSONCombinedORM/mean": 0.4736400842666626, "rewards/VisualizationJSONCombinedORM/std": 0.09524966031312943, "step": 4043, "train_speed(iter/s)": 0.253699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 283.0625, "completions/min_length": 227.0, "epoch": 3.3449131513647643, "grad_norm": 0.2021341770887375, "kl": 0.086669921875, "learning_rate": 2.983036773385941e-06, "loss": 0.0008678063750267029, "memory(GiB)": 38.09, "reward": 0.5183744430541992, "reward_std": 0.06629985570907593, "rewards/VisualizationJSONCombinedORM/mean": 0.5183744430541992, "rewards/VisualizationJSONCombinedORM/std": 0.12492217868566513, "step": 4044, "train_speed(iter/s)": 0.253315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 282.3125, "completions/min_length": 206.0, "epoch": 3.345740281224152, "grad_norm": 0.21348543465137482, "kl": 0.08209228515625, "learning_rate": 2.980394975640526e-06, "loss": 0.0008209655061364174, "memory(GiB)": 38.09, "reward": 0.42219221591949463, "reward_std": 0.05434364452958107, "rewards/VisualizationJSONCombinedORM/mean": 0.42219221591949463, "rewards/VisualizationJSONCombinedORM/std": 0.1628892868757248, "step": 4045, "train_speed(iter/s)": 0.252966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 288.1875, "completions/min_length": 219.0, "epoch": 3.34656741108354, "grad_norm": 0.18465878069400787, "kl": 0.0406494140625, "learning_rate": 2.9777538514428395e-06, "loss": 0.0004075765609741211, "memory(GiB)": 38.09, "reward": 0.290141761302948, "reward_std": 0.03800209239125252, "rewards/VisualizationJSONCombinedORM/mean": 0.290141761302948, "rewards/VisualizationJSONCombinedORM/std": 0.05756961554288864, "step": 4046, "train_speed(iter/s)": 0.252671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 306.5, "completions/min_length": 235.0, "epoch": 3.347394540942928, "grad_norm": 0.36280450224876404, "kl": 0.24169921875, "learning_rate": 2.9751134016737104e-06, "loss": 0.002419799566268921, "memory(GiB)": 38.09, "reward": 0.6986021995544434, "reward_std": 0.08987045288085938, "rewards/VisualizationJSONCombinedORM/mean": 0.6986021995544434, "rewards/VisualizationJSONCombinedORM/std": 0.08699700236320496, "step": 4047, "train_speed(iter/s)": 0.252342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 296.9375, "completions/min_length": 243.0, "epoch": 3.348221670802316, "grad_norm": 0.215397447347641, "kl": 0.0750732421875, "learning_rate": 2.972473627213739e-06, "loss": 0.0007490664720535278, "memory(GiB)": 38.09, "reward": 0.5579246282577515, "reward_std": 0.06684829294681549, "rewards/VisualizationJSONCombinedORM/mean": 0.5579246282577515, "rewards/VisualizationJSONCombinedORM/std": 0.09402978420257568, "step": 4048, "train_speed(iter/s)": 0.252038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 284.75, "completions/min_length": 223.0, "epoch": 3.349048800661704, "grad_norm": 0.2005082666873932, "kl": 0.06939697265625, "learning_rate": 2.969834528943306e-06, "loss": 0.0006927251815795898, "memory(GiB)": 38.09, "reward": 0.6073547601699829, "reward_std": 0.08153457194566727, "rewards/VisualizationJSONCombinedORM/mean": 0.6073547601699829, "rewards/VisualizationJSONCombinedORM/std": 0.08279947936534882, "step": 4049, "train_speed(iter/s)": 0.251723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 300.125, "completions/min_length": 221.0, "epoch": 3.3498759305210917, "grad_norm": 0.20209263265132904, "kl": 0.04620361328125, "learning_rate": 2.9671961077425583e-06, "loss": 0.00046164169907569885, "memory(GiB)": 38.09, "reward": 0.6400796175003052, "reward_std": 0.10165207087993622, "rewards/VisualizationJSONCombinedORM/mean": 0.6400796175003052, "rewards/VisualizationJSONCombinedORM/std": 0.09823762625455856, "step": 4050, "train_speed(iter/s)": 0.251473 }, { "epoch": 3.3498759305210917, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 354.0833333333333, "eval_completions/mean_length": 293.9895833333333, "eval_completions/min_length": 246.5, "eval_kl": 0.07535807291666667, "eval_loss": 0.0007561743259429932, "eval_reward": 0.4422802484283845, "eval_reward_std": 0.04826395304795975, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4422802484283845, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04826395343601083, "eval_runtime": 303.8992, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 4050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 285.625, "completions/min_length": 227.0, "epoch": 3.3507030603804795, "grad_norm": 0.1777259260416031, "kl": 0.06903076171875, "learning_rate": 2.9645583644914253e-06, "loss": 0.0006895847618579865, "memory(GiB)": 38.09, "reward": 0.6129703521728516, "reward_std": 0.035574331879615784, "rewards/VisualizationJSONCombinedORM/mean": 0.6129703521728516, "rewards/VisualizationJSONCombinedORM/std": 0.035430908203125, "step": 4051, "train_speed(iter/s)": 0.246554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 308.8125, "completions/min_length": 258.0, "epoch": 3.3515301902398678, "grad_norm": 0.16349002718925476, "kl": 0.0618896484375, "learning_rate": 2.9619213000696055e-06, "loss": 0.0006192214787006378, "memory(GiB)": 38.09, "reward": 0.4565226435661316, "reward_std": 0.0414741188287735, "rewards/VisualizationJSONCombinedORM/mean": 0.4565226435661316, "rewards/VisualizationJSONCombinedORM/std": 0.07031086832284927, "step": 4052, "train_speed(iter/s)": 0.246241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 266.4375, "completions/min_length": 217.0, "epoch": 3.3523573200992556, "grad_norm": 0.4564235508441925, "kl": 0.1192626953125, "learning_rate": 2.9592849153565727e-06, "loss": 0.0011917278170585632, "memory(GiB)": 38.09, "reward": 0.6416261196136475, "reward_std": 0.07558071613311768, "rewards/VisualizationJSONCombinedORM/mean": 0.6416261196136475, "rewards/VisualizationJSONCombinedORM/std": 0.13130579888820648, "step": 4053, "train_speed(iter/s)": 0.245984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 283.5, "completions/min_length": 236.0, "epoch": 3.3531844499586434, "grad_norm": 0.21669119596481323, "kl": 0.0771484375, "learning_rate": 2.9566492112315726e-06, "loss": 0.00077013298869133, "memory(GiB)": 38.09, "reward": 0.49184244871139526, "reward_std": 0.06257094442844391, "rewards/VisualizationJSONCombinedORM/mean": 0.49184244871139526, "rewards/VisualizationJSONCombinedORM/std": 0.09688646346330643, "step": 4054, "train_speed(iter/s)": 0.245613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 281.3125, "completions/min_length": 219.0, "epoch": 3.3540115798180317, "grad_norm": 0.17175103724002838, "kl": 0.094482421875, "learning_rate": 2.954014188573626e-06, "loss": 0.0009454824030399323, "memory(GiB)": 38.09, "reward": 0.37734660506248474, "reward_std": 0.03684999793767929, "rewards/VisualizationJSONCombinedORM/mean": 0.37734660506248474, "rewards/VisualizationJSONCombinedORM/std": 0.14677894115447998, "step": 4055, "train_speed(iter/s)": 0.24529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 281.1875, "completions/min_length": 224.0, "epoch": 3.3548387096774195, "grad_norm": 0.19900330901145935, "kl": 0.042236328125, "learning_rate": 2.951379848261523e-06, "loss": 0.00042191892862319946, "memory(GiB)": 38.09, "reward": 0.5465203523635864, "reward_std": 0.02897982858121395, "rewards/VisualizationJSONCombinedORM/mean": 0.5465203523635864, "rewards/VisualizationJSONCombinedORM/std": 0.29064255952835083, "step": 4056, "train_speed(iter/s)": 0.244958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 303.0625, "completions/min_length": 246.0, "epoch": 3.3556658395368073, "grad_norm": 0.1987733095884323, "kl": 0.10980224609375, "learning_rate": 2.9487461911738334e-06, "loss": 0.0011006258428096771, "memory(GiB)": 38.09, "reward": 0.5685681104660034, "reward_std": 0.06472411006689072, "rewards/VisualizationJSONCombinedORM/mean": 0.5685681104660034, "rewards/VisualizationJSONCombinedORM/std": 0.11181420087814331, "step": 4057, "train_speed(iter/s)": 0.244606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 293.3125, "completions/min_length": 247.0, "epoch": 3.356492969396195, "grad_norm": 0.16571824252605438, "kl": 0.043701171875, "learning_rate": 2.9461132181888874e-06, "loss": 0.0004375763237476349, "memory(GiB)": 38.09, "reward": 0.5825613141059875, "reward_std": 0.057919085025787354, "rewards/VisualizationJSONCombinedORM/mean": 0.5825613141059875, "rewards/VisualizationJSONCombinedORM/std": 0.21227383613586426, "step": 4058, "train_speed(iter/s)": 0.244329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 277.8125, "completions/min_length": 223.0, "epoch": 3.357320099255583, "grad_norm": 0.21520794928073883, "kl": 0.07904052734375, "learning_rate": 2.9434809301848e-06, "loss": 0.0007906109094619751, "memory(GiB)": 38.09, "reward": 0.6601061820983887, "reward_std": 0.052702076733112335, "rewards/VisualizationJSONCombinedORM/mean": 0.6601061820983887, "rewards/VisualizationJSONCombinedORM/std": 0.08753348141908646, "step": 4059, "train_speed(iter/s)": 0.243958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 287.0625, "completions/min_length": 242.0, "epoch": 3.358147229114971, "grad_norm": 0.18202200531959534, "kl": 0.03363037109375, "learning_rate": 2.940849328039447e-06, "loss": 0.0003372803330421448, "memory(GiB)": 38.09, "reward": 0.5485161542892456, "reward_std": 0.04597967863082886, "rewards/VisualizationJSONCombinedORM/mean": 0.5485161542892456, "rewards/VisualizationJSONCombinedORM/std": 0.162289097905159, "step": 4060, "train_speed(iter/s)": 0.243662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 283.4375, "completions/min_length": 235.0, "epoch": 3.358974358974359, "grad_norm": 0.1822165995836258, "kl": 0.0859375, "learning_rate": 2.9382184126304834e-06, "loss": 0.0008583143353462219, "memory(GiB)": 38.09, "reward": 0.5056780576705933, "reward_std": 0.038561273366212845, "rewards/VisualizationJSONCombinedORM/mean": 0.5056780576705933, "rewards/VisualizationJSONCombinedORM/std": 0.08645166456699371, "step": 4061, "train_speed(iter/s)": 0.243396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 270.5, "completions/min_length": 223.0, "epoch": 3.359801488833747, "grad_norm": 0.17530940473079681, "kl": 0.02447509765625, "learning_rate": 2.9355881848353297e-06, "loss": 0.0002451092004776001, "memory(GiB)": 38.09, "reward": 0.7214508056640625, "reward_std": 0.06935912370681763, "rewards/VisualizationJSONCombinedORM/mean": 0.7214508056640625, "rewards/VisualizationJSONCombinedORM/std": 0.15942150354385376, "step": 4062, "train_speed(iter/s)": 0.243102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 294.125, "completions/min_length": 237.0, "epoch": 3.3606286186931347, "grad_norm": 0.17705830931663513, "kl": 0.1558837890625, "learning_rate": 2.93295864553118e-06, "loss": 0.0015590786933898926, "memory(GiB)": 38.09, "reward": 0.4328686594963074, "reward_std": 0.04940269887447357, "rewards/VisualizationJSONCombinedORM/mean": 0.4328686594963074, "rewards/VisualizationJSONCombinedORM/std": 0.1317417472600937, "step": 4063, "train_speed(iter/s)": 0.242746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 299.9375, "completions/min_length": 222.0, "epoch": 3.361455748552523, "grad_norm": 0.23889222741127014, "kl": 0.03765869140625, "learning_rate": 2.930329795594996e-06, "loss": 0.00037646200507879257, "memory(GiB)": 38.09, "reward": 0.49139851331710815, "reward_std": 0.05042721703648567, "rewards/VisualizationJSONCombinedORM/mean": 0.49139851331710815, "rewards/VisualizationJSONCombinedORM/std": 0.06278369575738907, "step": 4064, "train_speed(iter/s)": 0.242467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 299.5, "completions/min_length": 244.0, "epoch": 3.3622828784119108, "grad_norm": 0.16239574551582336, "kl": 0.03173828125, "learning_rate": 2.9277016359035165e-06, "loss": 0.0003171211574226618, "memory(GiB)": 38.09, "reward": 0.6385433673858643, "reward_std": 0.03811227157711983, "rewards/VisualizationJSONCombinedORM/mean": 0.6385433673858643, "rewards/VisualizationJSONCombinedORM/std": 0.10402841866016388, "step": 4065, "train_speed(iter/s)": 0.242156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 272.625, "completions/min_length": 224.0, "epoch": 3.3631100082712986, "grad_norm": 0.22056151926517487, "kl": 0.06842041015625, "learning_rate": 2.925074167333238e-06, "loss": 0.0006825365126132965, "memory(GiB)": 38.09, "reward": 0.6749655604362488, "reward_std": 0.08141195774078369, "rewards/VisualizationJSONCombinedORM/mean": 0.6749655604362488, "rewards/VisualizationJSONCombinedORM/std": 0.10370922088623047, "step": 4066, "train_speed(iter/s)": 0.241865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 293.6875, "completions/min_length": 242.0, "epoch": 3.3639371381306864, "grad_norm": 0.2021556794643402, "kl": 0.105712890625, "learning_rate": 2.9224473907604407e-06, "loss": 0.0010558497160673141, "memory(GiB)": 38.09, "reward": 0.4950537085533142, "reward_std": 0.045471642166376114, "rewards/VisualizationJSONCombinedORM/mean": 0.4950537085533142, "rewards/VisualizationJSONCombinedORM/std": 0.047897376120090485, "step": 4067, "train_speed(iter/s)": 0.241487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 253.5, "completions/min_length": 211.0, "epoch": 3.3647642679900747, "grad_norm": 0.17157138884067535, "kl": 0.042327880859375, "learning_rate": 2.919821307061162e-06, "loss": 0.00042314082384109497, "memory(GiB)": 38.09, "reward": 0.6756385564804077, "reward_std": 0.07992519438266754, "rewards/VisualizationJSONCombinedORM/mean": 0.6756385564804077, "rewards/VisualizationJSONCombinedORM/std": 0.14040197432041168, "step": 4068, "train_speed(iter/s)": 0.241274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 272.125, "completions/min_length": 239.0, "epoch": 3.3655913978494625, "grad_norm": 0.19668494164943695, "kl": 0.031158447265625, "learning_rate": 2.917195917111215e-06, "loss": 0.00031070783734321594, "memory(GiB)": 38.09, "reward": 0.5535356998443604, "reward_std": 0.03433574363589287, "rewards/VisualizationJSONCombinedORM/mean": 0.5535356998443604, "rewards/VisualizationJSONCombinedORM/std": 0.26665565371513367, "step": 4069, "train_speed(iter/s)": 0.240998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 289.5, "completions/min_length": 225.0, "epoch": 3.3664185277088503, "grad_norm": 0.2809807360172272, "kl": 0.06170654296875, "learning_rate": 2.914571221786179e-06, "loss": 0.0006187222898006439, "memory(GiB)": 38.09, "reward": 0.4929749369621277, "reward_std": 0.07556940615177155, "rewards/VisualizationJSONCombinedORM/mean": 0.4929749369621277, "rewards/VisualizationJSONCombinedORM/std": 0.23439112305641174, "step": 4070, "train_speed(iter/s)": 0.240703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 284.6875, "completions/min_length": 217.0, "epoch": 3.367245657568238, "grad_norm": 0.20343327522277832, "kl": 0.11376953125, "learning_rate": 2.911947221961402e-06, "loss": 0.0011365339159965515, "memory(GiB)": 38.09, "reward": 0.49780380725860596, "reward_std": 0.048496995121240616, "rewards/VisualizationJSONCombinedORM/mean": 0.49780380725860596, "rewards/VisualizationJSONCombinedORM/std": 0.18015387654304504, "step": 4071, "train_speed(iter/s)": 0.240436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 284.8125, "completions/min_length": 249.0, "epoch": 3.368072787427626, "grad_norm": 0.1887066811323166, "kl": 0.06854248046875, "learning_rate": 2.909323918512001e-06, "loss": 0.0006842601578682661, "memory(GiB)": 38.09, "reward": 0.4490915536880493, "reward_std": 0.048500239849090576, "rewards/VisualizationJSONCombinedORM/mean": 0.4490915536880493, "rewards/VisualizationJSONCombinedORM/std": 0.04706931114196777, "step": 4072, "train_speed(iter/s)": 0.240076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 294.5, "completions/min_length": 241.0, "epoch": 3.368899917287014, "grad_norm": 0.22039374709129333, "kl": 0.0721435546875, "learning_rate": 2.906701312312861e-06, "loss": 0.0007202848792076111, "memory(GiB)": 38.09, "reward": 0.4183468520641327, "reward_std": 0.05005413293838501, "rewards/VisualizationJSONCombinedORM/mean": 0.4183468520641327, "rewards/VisualizationJSONCombinedORM/std": 0.12130106985569, "step": 4073, "train_speed(iter/s)": 0.239781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 266.0625, "completions/min_length": 209.0, "epoch": 3.369727047146402, "grad_norm": 0.18842101097106934, "kl": 0.096923828125, "learning_rate": 2.9040794042386305e-06, "loss": 0.0009687244892120361, "memory(GiB)": 38.09, "reward": 0.5760021209716797, "reward_std": 0.06692153960466385, "rewards/VisualizationJSONCombinedORM/mean": 0.5760021209716797, "rewards/VisualizationJSONCombinedORM/std": 0.17727673053741455, "step": 4074, "train_speed(iter/s)": 0.239429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 294.5, "completions/min_length": 247.0, "epoch": 3.37055417700579, "grad_norm": 0.18953271210193634, "kl": 0.1474609375, "learning_rate": 2.9014581951637295e-06, "loss": 0.0014738142490386963, "memory(GiB)": 38.09, "reward": 0.5941035151481628, "reward_std": 0.09865833818912506, "rewards/VisualizationJSONCombinedORM/mean": 0.5941035151481628, "rewards/VisualizationJSONCombinedORM/std": 0.14114634692668915, "step": 4075, "train_speed(iter/s)": 0.239114 }, { "epoch": 3.37055417700579, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 350.375, "eval_completions/mean_length": 294.8333333333333, "eval_completions/min_length": 248.29166666666666, "eval_kl": 0.095550537109375, "eval_loss": 0.0009543579071760178, "eval_reward": 0.46469008301695186, "eval_reward_std": 0.055417965864762664, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46469008301695186, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0554179679408359, "eval_runtime": 301.4803, "eval_samples_per_second": 0.08, "eval_steps_per_second": 0.01, "step": 4075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 272.6875, "completions/min_length": 204.0, "epoch": 3.3713813068651777, "grad_norm": 0.21892443299293518, "kl": 0.103515625, "learning_rate": 2.8988376859623437e-06, "loss": 0.0010345429182052612, "memory(GiB)": 38.09, "reward": 0.5173546075820923, "reward_std": 0.07722577452659607, "rewards/VisualizationJSONCombinedORM/mean": 0.5173546075820923, "rewards/VisualizationJSONCombinedORM/std": 0.08488039672374725, "step": 4076, "train_speed(iter/s)": 0.23467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 285.625, "completions/min_length": 231.0, "epoch": 3.372208436724566, "grad_norm": 0.21762919425964355, "kl": 0.1016845703125, "learning_rate": 2.8962178775084267e-06, "loss": 0.0010192245244979858, "memory(GiB)": 38.09, "reward": 0.49492165446281433, "reward_std": 0.03859303146600723, "rewards/VisualizationJSONCombinedORM/mean": 0.49492165446281433, "rewards/VisualizationJSONCombinedORM/std": 0.2528998553752899, "step": 4077, "train_speed(iter/s)": 0.234369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 311.4375, "completions/min_length": 243.0, "epoch": 3.3730355665839538, "grad_norm": 0.17748388648033142, "kl": 0.07208251953125, "learning_rate": 2.8935987706756914e-06, "loss": 0.0007216045632958412, "memory(GiB)": 38.09, "reward": 0.3826954960823059, "reward_std": 0.02099563367664814, "rewards/VisualizationJSONCombinedORM/mean": 0.3826954960823059, "rewards/VisualizationJSONCombinedORM/std": 0.09995788335800171, "step": 4078, "train_speed(iter/s)": 0.234127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 317.875, "completions/min_length": 264.0, "epoch": 3.3738626964433416, "grad_norm": 0.16662438213825226, "kl": 0.139892578125, "learning_rate": 2.89098036633763e-06, "loss": 0.001398107036948204, "memory(GiB)": 38.09, "reward": 0.3081433176994324, "reward_std": 0.022538121789693832, "rewards/VisualizationJSONCombinedORM/mean": 0.3081433176994324, "rewards/VisualizationJSONCombinedORM/std": 0.10607005655765533, "step": 4079, "train_speed(iter/s)": 0.233892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 279.9375, "completions/min_length": 227.0, "epoch": 3.3746898263027294, "grad_norm": 0.18732932209968567, "kl": 0.0689697265625, "learning_rate": 2.8883626653674867e-06, "loss": 0.0006900131702423096, "memory(GiB)": 38.09, "reward": 0.5325112342834473, "reward_std": 0.03539259731769562, "rewards/VisualizationJSONCombinedORM/mean": 0.5325112342834473, "rewards/VisualizationJSONCombinedORM/std": 0.12000075727701187, "step": 4080, "train_speed(iter/s)": 0.233561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 294.1875, "completions/min_length": 215.0, "epoch": 3.3755169561621177, "grad_norm": 0.9147846698760986, "kl": 0.8448486328125, "learning_rate": 2.8857456686382794e-06, "loss": 0.008476153016090393, "memory(GiB)": 38.09, "reward": 0.3803485333919525, "reward_std": 0.05007922276854515, "rewards/VisualizationJSONCombinedORM/mean": 0.3803485333919525, "rewards/VisualizationJSONCombinedORM/std": 0.07065477222204208, "step": 4081, "train_speed(iter/s)": 0.233354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 273.6875, "completions/min_length": 215.0, "epoch": 3.3763440860215055, "grad_norm": 0.2003759890794754, "kl": 0.0687255859375, "learning_rate": 2.8831293770227887e-06, "loss": 0.0006861314177513123, "memory(GiB)": 38.09, "reward": 0.5235087871551514, "reward_std": 0.08529657870531082, "rewards/VisualizationJSONCombinedORM/mean": 0.5235087871551514, "rewards/VisualizationJSONCombinedORM/std": 0.18100401759147644, "step": 4082, "train_speed(iter/s)": 0.233092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 302.125, "completions/min_length": 263.0, "epoch": 3.3771712158808933, "grad_norm": 0.18383142352104187, "kl": 0.0880126953125, "learning_rate": 2.880513791393561e-06, "loss": 0.0008816830813884735, "memory(GiB)": 38.09, "reward": 0.6490206718444824, "reward_std": 0.07116543501615524, "rewards/VisualizationJSONCombinedORM/mean": 0.6490206718444824, "rewards/VisualizationJSONCombinedORM/std": 0.13563640415668488, "step": 4083, "train_speed(iter/s)": 0.232788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 307.0, "completions/min_length": 257.0, "epoch": 3.377998345740281, "grad_norm": 0.26516956090927124, "kl": 0.095703125, "learning_rate": 2.877898912622901e-06, "loss": 0.0009559802711009979, "memory(GiB)": 38.09, "reward": 0.7464448809623718, "reward_std": 0.09693722426891327, "rewards/VisualizationJSONCombinedORM/mean": 0.7464448809623718, "rewards/VisualizationJSONCombinedORM/std": 0.09755726158618927, "step": 4084, "train_speed(iter/s)": 0.232524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 303.3125, "completions/min_length": 243.0, "epoch": 3.378825475599669, "grad_norm": 0.15133017301559448, "kl": 0.06390380859375, "learning_rate": 2.8752847415828923e-06, "loss": 0.0006391408387571573, "memory(GiB)": 38.09, "reward": 0.5622304677963257, "reward_std": 0.044896770268678665, "rewards/VisualizationJSONCombinedORM/mean": 0.5622304677963257, "rewards/VisualizationJSONCombinedORM/std": 0.08089541643857956, "step": 4085, "train_speed(iter/s)": 0.23227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 302.5625, "completions/min_length": 256.0, "epoch": 3.379652605459057, "grad_norm": 0.16432303190231323, "kl": 0.04864501953125, "learning_rate": 2.872671279145365e-06, "loss": 0.0004858262836933136, "memory(GiB)": 38.09, "reward": 0.49558281898498535, "reward_std": 0.040324024856090546, "rewards/VisualizationJSONCombinedORM/mean": 0.49558281898498535, "rewards/VisualizationJSONCombinedORM/std": 0.04044199362397194, "step": 4086, "train_speed(iter/s)": 0.232012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 286.8125, "completions/min_length": 249.0, "epoch": 3.380479735318445, "grad_norm": 0.21334557235240936, "kl": 0.1109619140625, "learning_rate": 2.870058526181924e-06, "loss": 0.0011101756244897842, "memory(GiB)": 38.09, "reward": 0.5422489047050476, "reward_std": 0.05762773007154465, "rewards/VisualizationJSONCombinedORM/mean": 0.5422489047050476, "rewards/VisualizationJSONCombinedORM/std": 0.07017433643341064, "step": 4087, "train_speed(iter/s)": 0.231691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 301.625, "completions/min_length": 262.0, "epoch": 3.381306865177833, "grad_norm": 0.215106800198555, "kl": 0.153076171875, "learning_rate": 2.8674464835639346e-06, "loss": 0.0015311390161514282, "memory(GiB)": 38.09, "reward": 0.38807713985443115, "reward_std": 0.11766281723976135, "rewards/VisualizationJSONCombinedORM/mean": 0.38807713985443115, "rewards/VisualizationJSONCombinedORM/std": 0.13125066459178925, "step": 4088, "train_speed(iter/s)": 0.231416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 317.375, "completions/min_length": 246.0, "epoch": 3.3821339950372207, "grad_norm": 0.16825242340564728, "kl": 0.104736328125, "learning_rate": 2.8648351521625284e-06, "loss": 0.0010458678007125854, "memory(GiB)": 38.09, "reward": 0.39945530891418457, "reward_std": 0.032421939074993134, "rewards/VisualizationJSONCombinedORM/mean": 0.39945530891418457, "rewards/VisualizationJSONCombinedORM/std": 0.05428437143564224, "step": 4089, "train_speed(iter/s)": 0.231141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 289.625, "completions/min_length": 232.0, "epoch": 3.382961124896609, "grad_norm": 0.21070799231529236, "kl": 0.12939453125, "learning_rate": 2.862224532848591e-06, "loss": 0.0012932177633047104, "memory(GiB)": 38.09, "reward": 0.65447998046875, "reward_std": 0.057700492441654205, "rewards/VisualizationJSONCombinedORM/mean": 0.65447998046875, "rewards/VisualizationJSONCombinedORM/std": 0.16679885983467102, "step": 4090, "train_speed(iter/s)": 0.230875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 300.5625, "completions/min_length": 235.0, "epoch": 3.3837882547559968, "grad_norm": 0.16497811675071716, "kl": 0.08941650390625, "learning_rate": 2.8596146264927767e-06, "loss": 0.0008926764130592346, "memory(GiB)": 38.09, "reward": 0.4015958905220032, "reward_std": 0.021263064816594124, "rewards/VisualizationJSONCombinedORM/mean": 0.4015958905220032, "rewards/VisualizationJSONCombinedORM/std": 0.09197547286748886, "step": 4091, "train_speed(iter/s)": 0.230566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 296.625, "completions/min_length": 230.0, "epoch": 3.3846153846153846, "grad_norm": 0.18097783625125885, "kl": 0.04644775390625, "learning_rate": 2.8570054339655075e-06, "loss": 0.00046750903129577637, "memory(GiB)": 38.09, "reward": 0.42599019408226013, "reward_std": 0.023241274058818817, "rewards/VisualizationJSONCombinedORM/mean": 0.42599019408226013, "rewards/VisualizationJSONCombinedORM/std": 0.0382172130048275, "step": 4092, "train_speed(iter/s)": 0.230308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 273.0625, "completions/min_length": 231.0, "epoch": 3.3854425144747724, "grad_norm": 0.19413252174854279, "kl": 0.13330078125, "learning_rate": 2.8543969561369556e-06, "loss": 0.0013339519500732422, "memory(GiB)": 38.09, "reward": 0.5999784469604492, "reward_std": 0.08435198664665222, "rewards/VisualizationJSONCombinedORM/mean": 0.5999784469604492, "rewards/VisualizationJSONCombinedORM/std": 0.14959684014320374, "step": 4093, "train_speed(iter/s)": 0.230057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 307.125, "completions/min_length": 255.0, "epoch": 3.3862696443341607, "grad_norm": 0.18995079398155212, "kl": 0.067626953125, "learning_rate": 2.851789193877062e-06, "loss": 0.0006775930523872375, "memory(GiB)": 38.09, "reward": 0.5799808502197266, "reward_std": 0.07469534873962402, "rewards/VisualizationJSONCombinedORM/mean": 0.5799808502197266, "rewards/VisualizationJSONCombinedORM/std": 0.15775109827518463, "step": 4094, "train_speed(iter/s)": 0.229768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 277.375, "completions/min_length": 222.0, "epoch": 3.3870967741935485, "grad_norm": 0.15786957740783691, "kl": 0.0562744140625, "learning_rate": 2.8491821480555283e-06, "loss": 0.0005619525909423828, "memory(GiB)": 38.09, "reward": 0.5503699779510498, "reward_std": 0.058504801243543625, "rewards/VisualizationJSONCombinedORM/mean": 0.5503699779510498, "rewards/VisualizationJSONCombinedORM/std": 0.08532878756523132, "step": 4095, "train_speed(iter/s)": 0.229519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 289.5, "completions/min_length": 231.0, "epoch": 3.3879239040529363, "grad_norm": 0.1858176440000534, "kl": 0.063232421875, "learning_rate": 2.8465758195418182e-06, "loss": 0.0006319545209407806, "memory(GiB)": 38.09, "reward": 0.7185182571411133, "reward_std": 0.08828899264335632, "rewards/VisualizationJSONCombinedORM/mean": 0.7185182571411133, "rewards/VisualizationJSONCombinedORM/std": 0.09663856029510498, "step": 4096, "train_speed(iter/s)": 0.229293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 294.25, "completions/min_length": 227.0, "epoch": 3.388751033912324, "grad_norm": 0.246400386095047, "kl": 0.04876708984375, "learning_rate": 2.8439702092051497e-06, "loss": 0.0004885047674179077, "memory(GiB)": 38.09, "reward": 0.579879879951477, "reward_std": 0.059065066277980804, "rewards/VisualizationJSONCombinedORM/mean": 0.579879879951477, "rewards/VisualizationJSONCombinedORM/std": 0.06755515187978745, "step": 4097, "train_speed(iter/s)": 0.22901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 304.75, "completions/min_length": 233.0, "epoch": 3.389578163771712, "grad_norm": 0.1858813613653183, "kl": 0.08489990234375, "learning_rate": 2.8413653179145096e-06, "loss": 0.0008458085358142853, "memory(GiB)": 38.09, "reward": 0.48115074634552, "reward_std": 0.05687557905912399, "rewards/VisualizationJSONCombinedORM/mean": 0.48115074634552, "rewards/VisualizationJSONCombinedORM/std": 0.09192048758268356, "step": 4098, "train_speed(iter/s)": 0.228759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 295.0625, "completions/min_length": 231.0, "epoch": 3.3904052936311, "grad_norm": 0.18427348136901855, "kl": 0.06256103515625, "learning_rate": 2.83876114653864e-06, "loss": 0.0006255935877561569, "memory(GiB)": 38.09, "reward": 0.4067169427871704, "reward_std": 0.034808166325092316, "rewards/VisualizationJSONCombinedORM/mean": 0.4067169427871704, "rewards/VisualizationJSONCombinedORM/std": 0.03369128704071045, "step": 4099, "train_speed(iter/s)": 0.228425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 279.6875, "completions/min_length": 212.0, "epoch": 3.391232423490488, "grad_norm": 0.18319232761859894, "kl": 0.053955078125, "learning_rate": 2.836157695946047e-06, "loss": 0.0005393140017986298, "memory(GiB)": 38.09, "reward": 0.5210472345352173, "reward_std": 0.06752892583608627, "rewards/VisualizationJSONCombinedORM/mean": 0.5210472345352173, "rewards/VisualizationJSONCombinedORM/std": 0.08318803459405899, "step": 4100, "train_speed(iter/s)": 0.228152 }, { "epoch": 3.391232423490488, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 359.9166666666667, "eval_completions/mean_length": 294.9583333333333, "eval_completions/min_length": 247.625, "eval_kl": 0.070068359375, "eval_loss": 0.0007017484749667346, "eval_reward": 0.4573229377468427, "eval_reward_std": 0.05089239380322397, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4573229377468427, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05089239617033551, "eval_runtime": 307.263, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 4100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 299.125, "completions/min_length": 235.0, "epoch": 3.392059553349876, "grad_norm": 0.21805961430072784, "kl": 0.093017578125, "learning_rate": 2.8335549670049866e-06, "loss": 0.0009292624890804291, "memory(GiB)": 38.09, "reward": 0.4795266091823578, "reward_std": 0.046992719173431396, "rewards/VisualizationJSONCombinedORM/mean": 0.4795266091823578, "rewards/VisualizationJSONCombinedORM/std": 0.2534400522708893, "step": 4101, "train_speed(iter/s)": 0.224118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 304.625, "completions/min_length": 233.0, "epoch": 3.3928866832092637, "grad_norm": 0.20400123298168182, "kl": 0.17987060546875, "learning_rate": 2.8309529605834906e-06, "loss": 0.0017987936735153198, "memory(GiB)": 38.09, "reward": 0.5757319331169128, "reward_std": 0.04980194568634033, "rewards/VisualizationJSONCombinedORM/mean": 0.5757319331169128, "rewards/VisualizationJSONCombinedORM/std": 0.2029181718826294, "step": 4102, "train_speed(iter/s)": 0.223877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 292.125, "completions/min_length": 239.0, "epoch": 3.393713813068652, "grad_norm": 0.21240432560443878, "kl": 0.086669921875, "learning_rate": 2.828351677549333e-06, "loss": 0.000868355855345726, "memory(GiB)": 38.09, "reward": 0.6067947149276733, "reward_std": 0.03538919985294342, "rewards/VisualizationJSONCombinedORM/mean": 0.6067947149276733, "rewards/VisualizationJSONCombinedORM/std": 0.1624906212091446, "step": 4103, "train_speed(iter/s)": 0.223622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 268.875, "completions/min_length": 221.0, "epoch": 3.3945409429280398, "grad_norm": 0.2751622200012207, "kl": 0.050048828125, "learning_rate": 2.8257511187700563e-06, "loss": 0.0004990771412849426, "memory(GiB)": 38.09, "reward": 0.4986341893672943, "reward_std": 0.03296186774969101, "rewards/VisualizationJSONCombinedORM/mean": 0.4986341893672943, "rewards/VisualizationJSONCombinedORM/std": 0.040164947509765625, "step": 4104, "train_speed(iter/s)": 0.223359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 282.875, "completions/min_length": 232.0, "epoch": 3.3953680727874276, "grad_norm": 0.20358139276504517, "kl": 0.06011962890625, "learning_rate": 2.8231512851129596e-06, "loss": 0.0006018113344907761, "memory(GiB)": 38.09, "reward": 0.6149166822433472, "reward_std": 0.04932226240634918, "rewards/VisualizationJSONCombinedORM/mean": 0.6149166822433472, "rewards/VisualizationJSONCombinedORM/std": 0.16309475898742676, "step": 4105, "train_speed(iter/s)": 0.223076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 298.75, "completions/min_length": 223.0, "epoch": 3.3961952026468154, "grad_norm": 0.18496966361999512, "kl": 0.072998046875, "learning_rate": 2.8205521774451008e-06, "loss": 0.0007325410842895508, "memory(GiB)": 38.09, "reward": 0.6040845513343811, "reward_std": 0.027519728988409042, "rewards/VisualizationJSONCombinedORM/mean": 0.6040845513343811, "rewards/VisualizationJSONCombinedORM/std": 0.18227899074554443, "step": 4106, "train_speed(iter/s)": 0.222802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 301.0625, "completions/min_length": 249.0, "epoch": 3.3970223325062037, "grad_norm": 0.21652643382549286, "kl": 0.0965576171875, "learning_rate": 2.817953796633289e-06, "loss": 0.0009667500853538513, "memory(GiB)": 38.09, "reward": 0.5469621419906616, "reward_std": 0.05439463630318642, "rewards/VisualizationJSONCombinedORM/mean": 0.5469621419906616, "rewards/VisualizationJSONCombinedORM/std": 0.06843754649162292, "step": 4107, "train_speed(iter/s)": 0.222501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 265.5, "completions/min_length": 208.0, "epoch": 3.3978494623655915, "grad_norm": 0.21139369904994965, "kl": 0.1240234375, "learning_rate": 2.8153561435441035e-06, "loss": 0.001239139586687088, "memory(GiB)": 38.09, "reward": 0.5453718304634094, "reward_std": 0.048605337738990784, "rewards/VisualizationJSONCombinedORM/mean": 0.5453718304634094, "rewards/VisualizationJSONCombinedORM/std": 0.24410681426525116, "step": 4108, "train_speed(iter/s)": 0.22228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 274.8125, "completions/min_length": 229.0, "epoch": 3.3986765922249793, "grad_norm": 0.19597598910331726, "kl": 0.0733642578125, "learning_rate": 2.812759219043869e-06, "loss": 0.000730863306671381, "memory(GiB)": 38.09, "reward": 0.4245668649673462, "reward_std": 0.05076087638735771, "rewards/VisualizationJSONCombinedORM/mean": 0.4245668649673462, "rewards/VisualizationJSONCombinedORM/std": 0.12699361145496368, "step": 4109, "train_speed(iter/s)": 0.222032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 313.0625, "completions/min_length": 246.0, "epoch": 3.399503722084367, "grad_norm": 0.19646801054477692, "kl": 0.05487060546875, "learning_rate": 2.810163023998673e-06, "loss": 0.0005470775067806244, "memory(GiB)": 38.09, "reward": 0.6933324933052063, "reward_std": 0.08932209014892578, "rewards/VisualizationJSONCombinedORM/mean": 0.6933324933052063, "rewards/VisualizationJSONCombinedORM/std": 0.09030237793922424, "step": 4110, "train_speed(iter/s)": 0.221795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 311.375, "completions/min_length": 191.0, "epoch": 3.400330851943755, "grad_norm": 0.1901445984840393, "kl": 0.0450439453125, "learning_rate": 2.807567559274359e-06, "loss": 0.0004510432481765747, "memory(GiB)": 38.09, "reward": 0.6202571988105774, "reward_std": 0.07032659649848938, "rewards/VisualizationJSONCombinedORM/mean": 0.6202571988105774, "rewards/VisualizationJSONCombinedORM/std": 0.16002008318901062, "step": 4111, "train_speed(iter/s)": 0.221563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 291.75, "completions/min_length": 232.0, "epoch": 3.401157981803143, "grad_norm": 0.2182789444923401, "kl": 0.0919189453125, "learning_rate": 2.8049728257365293e-06, "loss": 0.0009190738201141357, "memory(GiB)": 38.09, "reward": 0.4191405177116394, "reward_std": 0.03710421174764633, "rewards/VisualizationJSONCombinedORM/mean": 0.4191405177116394, "rewards/VisualizationJSONCombinedORM/std": 0.0640576034784317, "step": 4112, "train_speed(iter/s)": 0.221334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 268.1875, "completions/min_length": 218.0, "epoch": 3.401985111662531, "grad_norm": 0.2635818421840668, "kl": 0.04425048828125, "learning_rate": 2.8023788242505345e-06, "loss": 0.00044201314449310303, "memory(GiB)": 38.09, "reward": 0.5395708084106445, "reward_std": 0.10737672448158264, "rewards/VisualizationJSONCombinedORM/mean": 0.5395708084106445, "rewards/VisualizationJSONCombinedORM/std": 0.13632889091968536, "step": 4113, "train_speed(iter/s)": 0.221103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 292.625, "completions/min_length": 204.0, "epoch": 3.402812241521919, "grad_norm": 0.20824077725410461, "kl": 0.1109619140625, "learning_rate": 2.799785555681489e-06, "loss": 0.0011092647910118103, "memory(GiB)": 38.09, "reward": 0.5901480913162231, "reward_std": 0.06273295730352402, "rewards/VisualizationJSONCombinedORM/mean": 0.5901480913162231, "rewards/VisualizationJSONCombinedORM/std": 0.19735945761203766, "step": 4114, "train_speed(iter/s)": 0.220826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 289.3125, "completions/min_length": 237.0, "epoch": 3.4036393713813067, "grad_norm": 0.1630079299211502, "kl": 0.0894775390625, "learning_rate": 2.79719302089426e-06, "loss": 0.0008947402238845825, "memory(GiB)": 38.09, "reward": 0.5928954482078552, "reward_std": 0.055939435958862305, "rewards/VisualizationJSONCombinedORM/mean": 0.5928954482078552, "rewards/VisualizationJSONCombinedORM/std": 0.11756404489278793, "step": 4115, "train_speed(iter/s)": 0.220587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 259.6875, "completions/min_length": 209.0, "epoch": 3.404466501240695, "grad_norm": 0.1856449842453003, "kl": 0.055419921875, "learning_rate": 2.794601220753471e-06, "loss": 0.0005551725625991821, "memory(GiB)": 38.09, "reward": 0.4593361020088196, "reward_std": 0.03703323379158974, "rewards/VisualizationJSONCombinedORM/mean": 0.4593361020088196, "rewards/VisualizationJSONCombinedORM/std": 0.03866977617144585, "step": 4116, "train_speed(iter/s)": 0.22034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 281.1875, "completions/min_length": 221.0, "epoch": 3.4052936311000828, "grad_norm": 0.2072649896144867, "kl": 0.175048828125, "learning_rate": 2.7920101561234954e-06, "loss": 0.0017505809664726257, "memory(GiB)": 38.09, "reward": 0.36062926054000854, "reward_std": 0.030646802857518196, "rewards/VisualizationJSONCombinedORM/mean": 0.36062926054000854, "rewards/VisualizationJSONCombinedORM/std": 0.05311042442917824, "step": 4117, "train_speed(iter/s)": 0.220098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 304.4375, "completions/min_length": 223.0, "epoch": 3.4061207609594706, "grad_norm": 0.5198779702186584, "kl": 0.23004150390625, "learning_rate": 2.7894198278684713e-06, "loss": 0.002301078289747238, "memory(GiB)": 38.09, "reward": 0.6028090119361877, "reward_std": 0.08328947424888611, "rewards/VisualizationJSONCombinedORM/mean": 0.6028090119361877, "rewards/VisualizationJSONCombinedORM/std": 0.14562974870204926, "step": 4118, "train_speed(iter/s)": 0.219853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 301.4375, "completions/min_length": 205.0, "epoch": 3.4069478908188584, "grad_norm": 0.17919868230819702, "kl": 0.04791259765625, "learning_rate": 2.786830236852281e-06, "loss": 0.0004795994609594345, "memory(GiB)": 38.09, "reward": 0.6462222337722778, "reward_std": 0.062086742371320724, "rewards/VisualizationJSONCombinedORM/mean": 0.6462222337722778, "rewards/VisualizationJSONCombinedORM/std": 0.1254090666770935, "step": 4119, "train_speed(iter/s)": 0.219596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 295.4375, "completions/min_length": 235.0, "epoch": 3.4077750206782467, "grad_norm": 0.20316267013549805, "kl": 0.067626953125, "learning_rate": 2.784241383938566e-06, "loss": 0.0006745085120201111, "memory(GiB)": 38.09, "reward": 0.5222375392913818, "reward_std": 0.049859292805194855, "rewards/VisualizationJSONCombinedORM/mean": 0.5222375392913818, "rewards/VisualizationJSONCombinedORM/std": 0.21516840159893036, "step": 4120, "train_speed(iter/s)": 0.219291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 303.1875, "completions/min_length": 245.0, "epoch": 3.4086021505376345, "grad_norm": 0.2036665827035904, "kl": 0.1378173828125, "learning_rate": 2.7816532699907217e-06, "loss": 0.0013787485659122467, "memory(GiB)": 38.09, "reward": 0.5032958984375, "reward_std": 0.06777729094028473, "rewards/VisualizationJSONCombinedORM/mean": 0.5032958984375, "rewards/VisualizationJSONCombinedORM/std": 0.0888676717877388, "step": 4121, "train_speed(iter/s)": 0.219025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 302.4375, "completions/min_length": 206.0, "epoch": 3.4094292803970223, "grad_norm": 0.21572455763816833, "kl": 0.1226806640625, "learning_rate": 2.7790658958718974e-06, "loss": 0.0012265723198652267, "memory(GiB)": 38.09, "reward": 0.5537434816360474, "reward_std": 0.04494573175907135, "rewards/VisualizationJSONCombinedORM/mean": 0.5537434816360474, "rewards/VisualizationJSONCombinedORM/std": 0.15588650107383728, "step": 4122, "train_speed(iter/s)": 0.218763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 280.4375, "completions/min_length": 221.0, "epoch": 3.41025641025641, "grad_norm": 0.26040443778038025, "kl": 0.09521484375, "learning_rate": 2.77647926244499e-06, "loss": 0.0009500570595264435, "memory(GiB)": 38.09, "reward": 0.491677463054657, "reward_std": 0.08740760385990143, "rewards/VisualizationJSONCombinedORM/mean": 0.491677463054657, "rewards/VisualizationJSONCombinedORM/std": 0.0975240096449852, "step": 4123, "train_speed(iter/s)": 0.218541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 291.6875, "completions/min_length": 234.0, "epoch": 3.411083540115798, "grad_norm": 0.18926279246807098, "kl": 0.06634521484375, "learning_rate": 2.77389337057266e-06, "loss": 0.0006632134318351746, "memory(GiB)": 38.09, "reward": 0.6215234994888306, "reward_std": 0.08955536037683487, "rewards/VisualizationJSONCombinedORM/mean": 0.6215234994888306, "rewards/VisualizationJSONCombinedORM/std": 0.11943234503269196, "step": 4124, "train_speed(iter/s)": 0.218324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 304.9375, "completions/min_length": 251.0, "epoch": 3.411910669975186, "grad_norm": 0.17431724071502686, "kl": 0.1138916015625, "learning_rate": 2.771308221117309e-06, "loss": 0.0011395514011383057, "memory(GiB)": 38.09, "reward": 0.675644040107727, "reward_std": 0.07043356448411942, "rewards/VisualizationJSONCombinedORM/mean": 0.675644040107727, "rewards/VisualizationJSONCombinedORM/std": 0.16606996953487396, "step": 4125, "train_speed(iter/s)": 0.218096 }, { "epoch": 3.411910669975186, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 344.2083333333333, "eval_completions/mean_length": 286.4322916666667, "eval_completions/min_length": 242.08333333333334, "eval_kl": 0.06915791829427083, "eval_loss": 0.0006974649732001126, "eval_reward": 0.42275630434354144, "eval_reward_std": 0.043078171358502004, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.42275630434354144, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04307817026225772, "eval_runtime": 297.9761, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 4125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 281.25, "completions/min_length": 226.0, "epoch": 3.412737799834574, "grad_norm": 0.2460397183895111, "kl": 0.04827880859375, "learning_rate": 2.768723814941099e-06, "loss": 0.00048222392797470093, "memory(GiB)": 38.09, "reward": 0.562624454498291, "reward_std": 0.03561948239803314, "rewards/VisualizationJSONCombinedORM/mean": 0.562624454498291, "rewards/VisualizationJSONCombinedORM/std": 0.18882764875888824, "step": 4126, "train_speed(iter/s)": 0.214433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 315.625, "completions/min_length": 208.0, "epoch": 3.413564929693962, "grad_norm": 0.21231535077095032, "kl": 0.1534423828125, "learning_rate": 2.7661401529059416e-06, "loss": 0.0015349090099334717, "memory(GiB)": 38.09, "reward": 0.44091206789016724, "reward_std": 0.047295331954956055, "rewards/VisualizationJSONCombinedORM/mean": 0.44091206789016724, "rewards/VisualizationJSONCombinedORM/std": 0.24756313860416412, "step": 4127, "train_speed(iter/s)": 0.214191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 285.8125, "completions/min_length": 233.0, "epoch": 3.41439205955335, "grad_norm": 0.17588716745376587, "kl": 0.030731201171875, "learning_rate": 2.763557235873502e-06, "loss": 0.0003079846501350403, "memory(GiB)": 38.09, "reward": 0.7972022891044617, "reward_std": 0.06997493654489517, "rewards/VisualizationJSONCombinedORM/mean": 0.7972022891044617, "rewards/VisualizationJSONCombinedORM/std": 0.06938590109348297, "step": 4128, "train_speed(iter/s)": 0.213943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 299.8125, "completions/min_length": 249.0, "epoch": 3.415219189412738, "grad_norm": 0.1643611043691635, "kl": 0.1148681640625, "learning_rate": 2.76097506470519e-06, "loss": 0.0011491477489471436, "memory(GiB)": 38.09, "reward": 0.39005735516548157, "reward_std": 0.03768865019083023, "rewards/VisualizationJSONCombinedORM/mean": 0.39005735516548157, "rewards/VisualizationJSONCombinedORM/std": 0.06558652222156525, "step": 4129, "train_speed(iter/s)": 0.21366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 289.4375, "completions/min_length": 232.0, "epoch": 3.4160463192721258, "grad_norm": 0.2516886293888092, "kl": 0.034454345703125, "learning_rate": 2.7583936402621753e-06, "loss": 0.000344693660736084, "memory(GiB)": 38.09, "reward": 0.5413697957992554, "reward_std": 0.06840691715478897, "rewards/VisualizationJSONCombinedORM/mean": 0.5413697957992554, "rewards/VisualizationJSONCombinedORM/std": 0.08262304961681366, "step": 4130, "train_speed(iter/s)": 0.213443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 245.1875, "completions/min_length": 211.0, "epoch": 3.4168734491315136, "grad_norm": 0.1798514425754547, "kl": 0.05511474609375, "learning_rate": 2.7558129634053744e-06, "loss": 0.0005512107163667679, "memory(GiB)": 38.09, "reward": 0.4589065909385681, "reward_std": 0.09736159443855286, "rewards/VisualizationJSONCombinedORM/mean": 0.4589065909385681, "rewards/VisualizationJSONCombinedORM/std": 0.12955200672149658, "step": 4131, "train_speed(iter/s)": 0.213262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 286.9375, "completions/min_length": 227.0, "epoch": 3.4177005789909014, "grad_norm": 0.398395836353302, "kl": 0.1923828125, "learning_rate": 2.753233034995457e-06, "loss": 0.0019244663417339325, "memory(GiB)": 38.09, "reward": 0.6820774674415588, "reward_std": 0.11217956244945526, "rewards/VisualizationJSONCombinedORM/mean": 0.6820774674415588, "rewards/VisualizationJSONCombinedORM/std": 0.11672859638929367, "step": 4132, "train_speed(iter/s)": 0.213031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 256.25, "completions/min_length": 220.0, "epoch": 3.4185277088502897, "grad_norm": 0.2896411120891571, "kl": 0.05810546875, "learning_rate": 2.750653855892836e-06, "loss": 0.0005815550684928894, "memory(GiB)": 38.09, "reward": 0.48755142092704773, "reward_std": 0.0605006143450737, "rewards/VisualizationJSONCombinedORM/mean": 0.48755142092704773, "rewards/VisualizationJSONCombinedORM/std": 0.17308375239372253, "step": 4133, "train_speed(iter/s)": 0.212883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 308.5, "completions/min_length": 254.0, "epoch": 3.4193548387096775, "grad_norm": 0.2250979244709015, "kl": 0.054107666015625, "learning_rate": 2.7480754269576872e-06, "loss": 0.0005408450961112976, "memory(GiB)": 38.09, "reward": 0.5772157311439514, "reward_std": 0.08427292853593826, "rewards/VisualizationJSONCombinedORM/mean": 0.5772157311439514, "rewards/VisualizationJSONCombinedORM/std": 0.1961052417755127, "step": 4134, "train_speed(iter/s)": 0.212691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 296.4375, "completions/min_length": 230.0, "epoch": 3.4201819685690653, "grad_norm": 0.20583853125572205, "kl": 0.05010986328125, "learning_rate": 2.745497749049922e-06, "loss": 0.0005012247711420059, "memory(GiB)": 38.09, "reward": 0.6603900790214539, "reward_std": 0.06087901070713997, "rewards/VisualizationJSONCombinedORM/mean": 0.6603900790214539, "rewards/VisualizationJSONCombinedORM/std": 0.08181507885456085, "step": 4135, "train_speed(iter/s)": 0.212441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 278.875, "completions/min_length": 221.0, "epoch": 3.421009098428453, "grad_norm": 0.1735556721687317, "kl": 0.10302734375, "learning_rate": 2.7429208230292115e-06, "loss": 0.0010266155004501343, "memory(GiB)": 38.09, "reward": 0.6184618473052979, "reward_std": 0.05862165987491608, "rewards/VisualizationJSONCombinedORM/mean": 0.6184618473052979, "rewards/VisualizationJSONCombinedORM/std": 0.06296629458665848, "step": 4136, "train_speed(iter/s)": 0.212226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 305.3125, "completions/min_length": 251.0, "epoch": 3.421836228287841, "grad_norm": 0.18133725225925446, "kl": 0.10992431640625, "learning_rate": 2.7403446497549714e-06, "loss": 0.0010957308113574982, "memory(GiB)": 38.09, "reward": 0.5563052892684937, "reward_std": 0.041488468647003174, "rewards/VisualizationJSONCombinedORM/mean": 0.5563052892684937, "rewards/VisualizationJSONCombinedORM/std": 0.33700358867645264, "step": 4137, "train_speed(iter/s)": 0.212004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 303.8125, "completions/min_length": 230.0, "epoch": 3.422663358147229, "grad_norm": 0.15665288269519806, "kl": 0.03179931640625, "learning_rate": 2.73776923008637e-06, "loss": 0.0003181546926498413, "memory(GiB)": 38.09, "reward": 0.32353729009628296, "reward_std": 0.016870100051164627, "rewards/VisualizationJSONCombinedORM/mean": 0.32353729009628296, "rewards/VisualizationJSONCombinedORM/std": 0.09848503768444061, "step": 4138, "train_speed(iter/s)": 0.21178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 298.9375, "completions/min_length": 253.0, "epoch": 3.423490488006617, "grad_norm": 0.21103894710540771, "kl": 0.150390625, "learning_rate": 2.735194564882315e-06, "loss": 0.0015007033944129944, "memory(GiB)": 38.09, "reward": 0.41358745098114014, "reward_std": 0.04238496348261833, "rewards/VisualizationJSONCombinedORM/mean": 0.41358745098114014, "rewards/VisualizationJSONCombinedORM/std": 0.1446673572063446, "step": 4139, "train_speed(iter/s)": 0.211526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 297.9375, "completions/min_length": 228.0, "epoch": 3.424317617866005, "grad_norm": 0.23405945301055908, "kl": 0.05792236328125, "learning_rate": 2.7326206550014793e-06, "loss": 0.0005795285105705261, "memory(GiB)": 38.09, "reward": 0.41182035207748413, "reward_std": 0.0728154331445694, "rewards/VisualizationJSONCombinedORM/mean": 0.41182035207748413, "rewards/VisualizationJSONCombinedORM/std": 0.1857784390449524, "step": 4140, "train_speed(iter/s)": 0.211332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 307.625, "completions/min_length": 239.0, "epoch": 3.425144747725393, "grad_norm": 0.24994242191314697, "kl": 0.0535888671875, "learning_rate": 2.7300475013022666e-06, "loss": 0.0005357861518859863, "memory(GiB)": 38.09, "reward": 0.3878524601459503, "reward_std": 0.043296489864587784, "rewards/VisualizationJSONCombinedORM/mean": 0.3878524601459503, "rewards/VisualizationJSONCombinedORM/std": 0.11737916618585587, "step": 4141, "train_speed(iter/s)": 0.21113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 272.5625, "completions/min_length": 222.0, "epoch": 3.425971877584781, "grad_norm": 0.2693951725959778, "kl": 0.04052734375, "learning_rate": 2.727475104642838e-06, "loss": 0.00040511786937713623, "memory(GiB)": 38.09, "reward": 0.719623327255249, "reward_std": 0.10566242039203644, "rewards/VisualizationJSONCombinedORM/mean": 0.719623327255249, "rewards/VisualizationJSONCombinedORM/std": 0.10418104380369186, "step": 4142, "train_speed(iter/s)": 0.210932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 292.125, "completions/min_length": 240.0, "epoch": 3.4267990074441688, "grad_norm": 0.221663698554039, "kl": 0.1434326171875, "learning_rate": 2.724903465881099e-06, "loss": 0.0014350488781929016, "memory(GiB)": 38.09, "reward": 0.35894083976745605, "reward_std": 0.035038694739341736, "rewards/VisualizationJSONCombinedORM/mean": 0.35894083976745605, "rewards/VisualizationJSONCombinedORM/std": 0.11080422252416611, "step": 4143, "train_speed(iter/s)": 0.210713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 280.625, "completions/min_length": 252.0, "epoch": 3.4276261373035566, "grad_norm": 0.25313255190849304, "kl": 0.0404052734375, "learning_rate": 2.7223325858747056e-06, "loss": 0.00040409713983535767, "memory(GiB)": 38.09, "reward": 0.4943086504936218, "reward_std": 0.06726153194904327, "rewards/VisualizationJSONCombinedORM/mean": 0.4943086504936218, "rewards/VisualizationJSONCombinedORM/std": 0.17972852289676666, "step": 4144, "train_speed(iter/s)": 0.210518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 294.5, "completions/min_length": 223.0, "epoch": 3.4284532671629444, "grad_norm": 0.19526326656341553, "kl": 0.06634521484375, "learning_rate": 2.719762465481055e-06, "loss": 0.000666547566652298, "memory(GiB)": 38.09, "reward": 0.37548935413360596, "reward_std": 0.013831054791808128, "rewards/VisualizationJSONCombinedORM/mean": 0.37548935413360596, "rewards/VisualizationJSONCombinedORM/std": 0.11451797932386398, "step": 4145, "train_speed(iter/s)": 0.210261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 277.125, "completions/min_length": 227.0, "epoch": 3.4292803970223327, "grad_norm": 0.22051985561847687, "kl": 0.2080078125, "learning_rate": 2.717193105557296e-06, "loss": 0.0020834840834140778, "memory(GiB)": 38.09, "reward": 0.5934668779373169, "reward_std": 0.09753164649009705, "rewards/VisualizationJSONCombinedORM/mean": 0.5934668779373169, "rewards/VisualizationJSONCombinedORM/std": 0.09435073286294937, "step": 4146, "train_speed(iter/s)": 0.210007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 302.8125, "completions/min_length": 228.0, "epoch": 3.4301075268817205, "grad_norm": 0.1813305765390396, "kl": 0.02984619140625, "learning_rate": 2.7146245069603215e-06, "loss": 0.0002984441816806793, "memory(GiB)": 38.09, "reward": 0.45113056898117065, "reward_std": 0.050008878111839294, "rewards/VisualizationJSONCombinedORM/mean": 0.45113056898117065, "rewards/VisualizationJSONCombinedORM/std": 0.26789411902427673, "step": 4147, "train_speed(iter/s)": 0.209793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 278.125, "completions/min_length": 221.0, "epoch": 3.4309346567411083, "grad_norm": 0.16557128727436066, "kl": 0.03118896484375, "learning_rate": 2.712056670546774e-06, "loss": 0.0003113783895969391, "memory(GiB)": 38.09, "reward": 0.5455259680747986, "reward_std": 0.019306965172290802, "rewards/VisualizationJSONCombinedORM/mean": 0.5455259680747986, "rewards/VisualizationJSONCombinedORM/std": 0.286414235830307, "step": 4148, "train_speed(iter/s)": 0.209578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 307.5625, "completions/min_length": 218.0, "epoch": 3.431761786600496, "grad_norm": 0.16068468987941742, "kl": 0.033203125, "learning_rate": 2.7094895971730326e-06, "loss": 0.0003315731883049011, "memory(GiB)": 38.09, "reward": 0.5594198703765869, "reward_std": 0.03563404455780983, "rewards/VisualizationJSONCombinedORM/mean": 0.5594198703765869, "rewards/VisualizationJSONCombinedORM/std": 0.19063545763492584, "step": 4149, "train_speed(iter/s)": 0.20934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 264.6875, "completions/min_length": 228.0, "epoch": 3.432588916459884, "grad_norm": 0.16338033974170685, "kl": 0.09283447265625, "learning_rate": 2.7069232876952368e-06, "loss": 0.0009276820346713066, "memory(GiB)": 38.09, "reward": 0.49881866574287415, "reward_std": 0.04383430629968643, "rewards/VisualizationJSONCombinedORM/mean": 0.49881866574287415, "rewards/VisualizationJSONCombinedORM/std": 0.23373854160308838, "step": 4150, "train_speed(iter/s)": 0.209118 }, { "epoch": 3.432588916459884, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 354.4583333333333, "eval_completions/mean_length": 293.8958333333333, "eval_completions/min_length": 247.875, "eval_kl": 0.0582122802734375, "eval_loss": 0.0005835779011249542, "eval_reward": 0.4208799923459689, "eval_reward_std": 0.046513517561834306, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4208799923459689, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04651351888120795, "eval_runtime": 303.9832, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 4150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 292.9375, "completions/min_length": 245.0, "epoch": 3.433416046319272, "grad_norm": 0.2100696861743927, "kl": 0.07476806640625, "learning_rate": 2.7043577429692557e-06, "loss": 0.000748608261346817, "memory(GiB)": 38.09, "reward": 0.6047326326370239, "reward_std": 0.0636192262172699, "rewards/VisualizationJSONCombinedORM/mean": 0.6047326326370239, "rewards/VisualizationJSONCombinedORM/std": 0.22511355578899384, "step": 4151, "train_speed(iter/s)": 0.205769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 302.75, "completions/min_length": 231.0, "epoch": 3.43424317617866, "grad_norm": 0.2231404036283493, "kl": 0.0574951171875, "learning_rate": 2.7017929638507134e-06, "loss": 0.0005742087960243225, "memory(GiB)": 38.09, "reward": 0.4664640426635742, "reward_std": 0.0721331238746643, "rewards/VisualizationJSONCombinedORM/mean": 0.4664640426635742, "rewards/VisualizationJSONCombinedORM/std": 0.16526499390602112, "step": 4152, "train_speed(iter/s)": 0.205551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 328.6875, "completions/min_length": 252.0, "epoch": 3.435070306038048, "grad_norm": 0.17550989985466003, "kl": 0.06646728515625, "learning_rate": 2.699228951194976e-06, "loss": 0.0006646886467933655, "memory(GiB)": 38.09, "reward": 0.4095395803451538, "reward_std": 0.036301229149103165, "rewards/VisualizationJSONCombinedORM/mean": 0.4095395803451538, "rewards/VisualizationJSONCombinedORM/std": 0.07719191163778305, "step": 4153, "train_speed(iter/s)": 0.205306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 297.9375, "completions/min_length": 242.0, "epoch": 3.435897435897436, "grad_norm": 0.17590878903865814, "kl": 0.134521484375, "learning_rate": 2.6966657058571545e-06, "loss": 0.0013466626405715942, "memory(GiB)": 38.09, "reward": 0.4870845079421997, "reward_std": 0.020803287625312805, "rewards/VisualizationJSONCombinedORM/mean": 0.4870845079421997, "rewards/VisualizationJSONCombinedORM/std": 0.21370667219161987, "step": 4154, "train_speed(iter/s)": 0.205086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 278.9375, "completions/min_length": 230.0, "epoch": 3.436724565756824, "grad_norm": 0.2170068323612213, "kl": 0.05120849609375, "learning_rate": 2.694103228692099e-06, "loss": 0.0005125626921653748, "memory(GiB)": 38.09, "reward": 0.45304206013679504, "reward_std": 0.07593169063329697, "rewards/VisualizationJSONCombinedORM/mean": 0.45304206013679504, "rewards/VisualizationJSONCombinedORM/std": 0.16881021857261658, "step": 4155, "train_speed(iter/s)": 0.204915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 284.125, "completions/min_length": 236.0, "epoch": 3.4375516956162118, "grad_norm": 0.17380830645561218, "kl": 0.03033447265625, "learning_rate": 2.6915415205544134e-06, "loss": 0.00030345097184181213, "memory(GiB)": 38.09, "reward": 0.5891804099082947, "reward_std": 0.039647988975048065, "rewards/VisualizationJSONCombinedORM/mean": 0.5891804099082947, "rewards/VisualizationJSONCombinedORM/std": 0.2592310607433319, "step": 4156, "train_speed(iter/s)": 0.204724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 264.1875, "completions/min_length": 222.0, "epoch": 3.4383788254755996, "grad_norm": 0.17362377047538757, "kl": 0.075439453125, "learning_rate": 2.6889805822984348e-06, "loss": 0.0007535926997661591, "memory(GiB)": 38.09, "reward": 0.7245970964431763, "reward_std": 0.09036874771118164, "rewards/VisualizationJSONCombinedORM/mean": 0.7245970964431763, "rewards/VisualizationJSONCombinedORM/std": 0.1065661683678627, "step": 4157, "train_speed(iter/s)": 0.204478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 277.3125, "completions/min_length": 229.0, "epoch": 3.4392059553349874, "grad_norm": 0.17468500137329102, "kl": 0.07769775390625, "learning_rate": 2.68642041477825e-06, "loss": 0.0007785763591527939, "memory(GiB)": 38.09, "reward": 0.3150958716869354, "reward_std": 0.01433698832988739, "rewards/VisualizationJSONCombinedORM/mean": 0.3150958716869354, "rewards/VisualizationJSONCombinedORM/std": 0.13729895651340485, "step": 4158, "train_speed(iter/s)": 0.204237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 297.1875, "completions/min_length": 245.0, "epoch": 3.4400330851943757, "grad_norm": 0.16735999286174774, "kl": 0.0863037109375, "learning_rate": 2.6838610188476865e-06, "loss": 0.0008623376488685608, "memory(GiB)": 38.09, "reward": 0.3368517756462097, "reward_std": 0.0345921590924263, "rewards/VisualizationJSONCombinedORM/mean": 0.3368517756462097, "rewards/VisualizationJSONCombinedORM/std": 0.06498683989048004, "step": 4159, "train_speed(iter/s)": 0.204063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 307.3125, "completions/min_length": 215.0, "epoch": 3.4408602150537635, "grad_norm": 0.3885791599750519, "kl": 0.0474853515625, "learning_rate": 2.6813023953603168e-06, "loss": 0.00047557055950164795, "memory(GiB)": 38.09, "reward": 0.500423789024353, "reward_std": 0.07208722084760666, "rewards/VisualizationJSONCombinedORM/mean": 0.500423789024353, "rewards/VisualizationJSONCombinedORM/std": 0.09640786796808243, "step": 4160, "train_speed(iter/s)": 0.203887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 267.75, "completions/min_length": 217.0, "epoch": 3.4416873449131513, "grad_norm": 0.26708170771598816, "kl": 0.0869140625, "learning_rate": 2.6787445451694506e-06, "loss": 0.0008682049810886383, "memory(GiB)": 38.09, "reward": 0.5264036059379578, "reward_std": 0.0704878643155098, "rewards/VisualizationJSONCombinedORM/mean": 0.5264036059379578, "rewards/VisualizationJSONCombinedORM/std": 0.07049978524446487, "step": 4161, "train_speed(iter/s)": 0.203636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 294.75, "completions/min_length": 218.0, "epoch": 3.442514474772539, "grad_norm": 0.2150736302137375, "kl": 0.066650390625, "learning_rate": 2.6761874691281443e-06, "loss": 0.0006670271977782249, "memory(GiB)": 38.09, "reward": 0.5010268688201904, "reward_std": 0.0528927743434906, "rewards/VisualizationJSONCombinedORM/mean": 0.5010268688201904, "rewards/VisualizationJSONCombinedORM/std": 0.13409070670604706, "step": 4162, "train_speed(iter/s)": 0.203426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 285.4375, "completions/min_length": 222.0, "epoch": 3.4433416046319274, "grad_norm": 0.1718721091747284, "kl": 0.0379638671875, "learning_rate": 2.6736311680891948e-06, "loss": 0.00038010627031326294, "memory(GiB)": 38.09, "reward": 0.4763672947883606, "reward_std": 0.06957598775625229, "rewards/VisualizationJSONCombinedORM/mean": 0.4763672947883606, "rewards/VisualizationJSONCombinedORM/std": 0.09849622845649719, "step": 4163, "train_speed(iter/s)": 0.203228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 271.0625, "completions/min_length": 236.0, "epoch": 3.444168734491315, "grad_norm": 0.25742217898368835, "kl": 0.07415771484375, "learning_rate": 2.6710756429051416e-06, "loss": 0.000740591436624527, "memory(GiB)": 38.09, "reward": 0.4962133467197418, "reward_std": 0.05572225898504257, "rewards/VisualizationJSONCombinedORM/mean": 0.4962133467197418, "rewards/VisualizationJSONCombinedORM/std": 0.25226742029190063, "step": 4164, "train_speed(iter/s)": 0.20306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 348.5, "completions/min_length": 291.0, "epoch": 3.444995864350703, "grad_norm": 0.15907491743564606, "kl": 0.08087158203125, "learning_rate": 2.668520894428259e-06, "loss": 0.0008094124495983124, "memory(GiB)": 38.09, "reward": 0.5520169734954834, "reward_std": 0.0387941412627697, "rewards/VisualizationJSONCombinedORM/mean": 0.5520169734954834, "rewards/VisualizationJSONCombinedORM/std": 0.04071497544646263, "step": 4165, "train_speed(iter/s)": 0.202792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 300.4375, "completions/min_length": 238.0, "epoch": 3.445822994210091, "grad_norm": 0.1894502490758896, "kl": 0.04376220703125, "learning_rate": 2.6659669235105766e-06, "loss": 0.00043748319149017334, "memory(GiB)": 38.09, "reward": 0.5652251243591309, "reward_std": 0.051365915685892105, "rewards/VisualizationJSONCombinedORM/mean": 0.5652251243591309, "rewards/VisualizationJSONCombinedORM/std": 0.11114463210105896, "step": 4166, "train_speed(iter/s)": 0.202595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 287.4375, "completions/min_length": 231.0, "epoch": 3.446650124069479, "grad_norm": 0.20968323945999146, "kl": 0.06475830078125, "learning_rate": 2.6634137310038486e-06, "loss": 0.0006481409072875977, "memory(GiB)": 38.09, "reward": 0.3528105914592743, "reward_std": 0.04261820763349533, "rewards/VisualizationJSONCombinedORM/mean": 0.3528105914592743, "rewards/VisualizationJSONCombinedORM/std": 0.07419679313898087, "step": 4167, "train_speed(iter/s)": 0.20241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 282.375, "completions/min_length": 242.0, "epoch": 3.447477253928867, "grad_norm": 0.19888420403003693, "kl": 0.03704833984375, "learning_rate": 2.660861317759579e-06, "loss": 0.00036995112895965576, "memory(GiB)": 38.09, "reward": 0.4234139919281006, "reward_std": 0.037363942712545395, "rewards/VisualizationJSONCombinedORM/mean": 0.4234139919281006, "rewards/VisualizationJSONCombinedORM/std": 0.03874831646680832, "step": 4168, "train_speed(iter/s)": 0.202206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 282.625, "completions/min_length": 235.0, "epoch": 3.4483043837882548, "grad_norm": 0.19096671044826508, "kl": 0.0465087890625, "learning_rate": 2.658309684629009e-06, "loss": 0.00046584755182266235, "memory(GiB)": 38.09, "reward": 0.5404446721076965, "reward_std": 0.05248985067009926, "rewards/VisualizationJSONCombinedORM/mean": 0.5404446721076965, "rewards/VisualizationJSONCombinedORM/std": 0.07966087013483047, "step": 4169, "train_speed(iter/s)": 0.202004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 299.4375, "completions/min_length": 244.0, "epoch": 3.4491315136476426, "grad_norm": 0.22211956977844238, "kl": 0.04547119140625, "learning_rate": 2.6557588324631223e-06, "loss": 0.00045369938015937805, "memory(GiB)": 38.09, "reward": 0.5967233777046204, "reward_std": 0.04425203800201416, "rewards/VisualizationJSONCombinedORM/mean": 0.5967233777046204, "rewards/VisualizationJSONCombinedORM/std": 0.07342690974473953, "step": 4170, "train_speed(iter/s)": 0.201804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 295.375, "completions/min_length": 239.0, "epoch": 3.4499586435070304, "grad_norm": 0.21865437924861908, "kl": 0.085693359375, "learning_rate": 2.6532087621126347e-06, "loss": 0.0008580461144447327, "memory(GiB)": 38.09, "reward": 0.46451979875564575, "reward_std": 0.05510823056101799, "rewards/VisualizationJSONCombinedORM/mean": 0.46451979875564575, "rewards/VisualizationJSONCombinedORM/std": 0.07080048322677612, "step": 4171, "train_speed(iter/s)": 0.2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 299.25, "completions/min_length": 237.0, "epoch": 3.4507857733664187, "grad_norm": 0.2420654594898224, "kl": 0.05145263671875, "learning_rate": 2.6506594744280147e-06, "loss": 0.0005147196352481842, "memory(GiB)": 38.09, "reward": 0.6246212720870972, "reward_std": 0.05643921345472336, "rewards/VisualizationJSONCombinedORM/mean": 0.6246212720870972, "rewards/VisualizationJSONCombinedORM/std": 0.05757756158709526, "step": 4172, "train_speed(iter/s)": 0.201425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 282.5, "completions/min_length": 224.0, "epoch": 3.4516129032258065, "grad_norm": 0.5403317213058472, "kl": 0.533447265625, "learning_rate": 2.648110970259454e-06, "loss": 0.0053422823548316956, "memory(GiB)": 38.09, "reward": 0.5176699757575989, "reward_std": 0.08594957739114761, "rewards/VisualizationJSONCombinedORM/mean": 0.5176699757575989, "rewards/VisualizationJSONCombinedORM/std": 0.13557639718055725, "step": 4173, "train_speed(iter/s)": 0.201174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 270.5, "completions/min_length": 217.0, "epoch": 3.4524400330851943, "grad_norm": 0.20854245126247406, "kl": 0.03631591796875, "learning_rate": 2.645563250456894e-06, "loss": 0.0003629028797149658, "memory(GiB)": 38.09, "reward": 0.3623938262462616, "reward_std": 0.05108770728111267, "rewards/VisualizationJSONCombinedORM/mean": 0.3623938262462616, "rewards/VisualizationJSONCombinedORM/std": 0.15371073782444, "step": 4174, "train_speed(iter/s)": 0.200973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 308.5625, "completions/min_length": 238.0, "epoch": 3.453267162944582, "grad_norm": 0.174663245677948, "kl": 0.04803466796875, "learning_rate": 2.6430163158700116e-06, "loss": 0.00048087164759635925, "memory(GiB)": 38.09, "reward": 0.5059378147125244, "reward_std": 0.041347719728946686, "rewards/VisualizationJSONCombinedORM/mean": 0.5059378147125244, "rewards/VisualizationJSONCombinedORM/std": 0.2599357068538666, "step": 4175, "train_speed(iter/s)": 0.200757 }, { "epoch": 3.453267162944582, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 357.8333333333333, "eval_completions/mean_length": 297.1875, "eval_completions/min_length": 247.375, "eval_kl": 0.08428955078125, "eval_loss": 0.0008506865124218166, "eval_reward": 0.4478037152439356, "eval_reward_std": 0.04891444051948687, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4478037152439356, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04891444257615755, "eval_runtime": 305.9017, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 4175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 294.8125, "completions/min_length": 208.0, "epoch": 3.4540942928039704, "grad_norm": 0.19501900672912598, "kl": 0.0394287109375, "learning_rate": 2.640470167348221e-06, "loss": 0.0003953613340854645, "memory(GiB)": 38.09, "reward": 0.719179093837738, "reward_std": 0.04761030897498131, "rewards/VisualizationJSONCombinedORM/mean": 0.719179093837738, "rewards/VisualizationJSONCombinedORM/std": 0.12675441801548004, "step": 4176, "train_speed(iter/s)": 0.197662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 285.75, "completions/min_length": 232.0, "epoch": 3.454921422663358, "grad_norm": 0.2105015516281128, "kl": 0.0517578125, "learning_rate": 2.637924805740676e-06, "loss": 0.0005180239677429199, "memory(GiB)": 38.09, "reward": 0.5024601221084595, "reward_std": 0.03315648436546326, "rewards/VisualizationJSONCombinedORM/mean": 0.5024601221084595, "rewards/VisualizationJSONCombinedORM/std": 0.2585100829601288, "step": 4177, "train_speed(iter/s)": 0.197472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 300.0625, "completions/min_length": 242.0, "epoch": 3.455748552522746, "grad_norm": 0.24691320955753326, "kl": 0.05322265625, "learning_rate": 2.6353802318962614e-06, "loss": 0.0005339458584785461, "memory(GiB)": 38.09, "reward": 0.6811855435371399, "reward_std": 0.09231467545032501, "rewards/VisualizationJSONCombinedORM/mean": 0.6811855435371399, "rewards/VisualizationJSONCombinedORM/std": 0.0891970619559288, "step": 4178, "train_speed(iter/s)": 0.197289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 280.1875, "completions/min_length": 226.0, "epoch": 3.456575682382134, "grad_norm": 0.20014579594135284, "kl": 0.100341796875, "learning_rate": 2.632836446663612e-06, "loss": 0.0010059252381324768, "memory(GiB)": 38.09, "reward": 0.560466468334198, "reward_std": 0.06880948692560196, "rewards/VisualizationJSONCombinedORM/mean": 0.560466468334198, "rewards/VisualizationJSONCombinedORM/std": 0.20742779970169067, "step": 4179, "train_speed(iter/s)": 0.197095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 307.4375, "completions/min_length": 212.0, "epoch": 3.457402812241522, "grad_norm": 0.17867670953273773, "kl": 0.087890625, "learning_rate": 2.630293450891086e-06, "loss": 0.000881412997841835, "memory(GiB)": 38.09, "reward": 0.4301929175853729, "reward_std": 0.05718716233968735, "rewards/VisualizationJSONCombinedORM/mean": 0.4301929175853729, "rewards/VisualizationJSONCombinedORM/std": 0.12135382741689682, "step": 4180, "train_speed(iter/s)": 0.196905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 288.25, "completions/min_length": 242.0, "epoch": 3.45822994210091, "grad_norm": 0.2059485763311386, "kl": 0.07977294921875, "learning_rate": 2.6277512454267874e-06, "loss": 0.0007969029247760773, "memory(GiB)": 38.09, "reward": 0.4226952791213989, "reward_std": 0.051115792244672775, "rewards/VisualizationJSONCombinedORM/mean": 0.4226952791213989, "rewards/VisualizationJSONCombinedORM/std": 0.09121552109718323, "step": 4181, "train_speed(iter/s)": 0.196736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 306.3125, "completions/min_length": 254.0, "epoch": 3.4590570719602978, "grad_norm": 0.19327585399150848, "kl": 0.1063232421875, "learning_rate": 2.625209831118552e-06, "loss": 0.0010626502335071564, "memory(GiB)": 38.09, "reward": 0.43330591917037964, "reward_std": 0.06456582993268967, "rewards/VisualizationJSONCombinedORM/mean": 0.43330591917037964, "rewards/VisualizationJSONCombinedORM/std": 0.15932869911193848, "step": 4182, "train_speed(iter/s)": 0.196545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 307.0, "completions/min_length": 256.0, "epoch": 3.4598842018196856, "grad_norm": 0.16860507428646088, "kl": 0.07684326171875, "learning_rate": 2.6226692088139567e-06, "loss": 0.0007693599909543991, "memory(GiB)": 38.09, "reward": 0.4535074830055237, "reward_std": 0.040003471076488495, "rewards/VisualizationJSONCombinedORM/mean": 0.4535074830055237, "rewards/VisualizationJSONCombinedORM/std": 0.300632119178772, "step": 4183, "train_speed(iter/s)": 0.196347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 328.0, "completions/min_length": 272.0, "epoch": 3.4607113316790734, "grad_norm": 0.20055444538593292, "kl": 0.05267333984375, "learning_rate": 2.6201293793603067e-06, "loss": 0.0005271509289741516, "memory(GiB)": 38.09, "reward": 0.4451298117637634, "reward_std": 0.05469576269388199, "rewards/VisualizationJSONCombinedORM/mean": 0.4451298117637634, "rewards/VisualizationJSONCombinedORM/std": 0.18916581571102142, "step": 4184, "train_speed(iter/s)": 0.196127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 289.875, "completions/min_length": 226.0, "epoch": 3.4615384615384617, "grad_norm": 0.1917756050825119, "kl": 0.0849609375, "learning_rate": 2.617590343604648e-06, "loss": 0.0008486136794090271, "memory(GiB)": 38.09, "reward": 0.4079427719116211, "reward_std": 0.036011792719364166, "rewards/VisualizationJSONCombinedORM/mean": 0.4079427719116211, "rewards/VisualizationJSONCombinedORM/std": 0.046391911804676056, "step": 4185, "train_speed(iter/s)": 0.195906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 275.0, "completions/min_length": 213.0, "epoch": 3.4623655913978495, "grad_norm": 0.20408089458942413, "kl": 0.072021484375, "learning_rate": 2.6150521023937626e-06, "loss": 0.0007189065217971802, "memory(GiB)": 38.09, "reward": 0.5339884757995605, "reward_std": 0.05326554551720619, "rewards/VisualizationJSONCombinedORM/mean": 0.5339884757995605, "rewards/VisualizationJSONCombinedORM/std": 0.3355501890182495, "step": 4186, "train_speed(iter/s)": 0.195721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 304.0625, "completions/min_length": 222.0, "epoch": 3.4631927212572373, "grad_norm": 0.21804779767990112, "kl": 0.05682373046875, "learning_rate": 2.6125146565741666e-06, "loss": 0.0005685221403837204, "memory(GiB)": 38.09, "reward": 0.6678935885429382, "reward_std": 0.054832153022289276, "rewards/VisualizationJSONCombinedORM/mean": 0.6678935885429382, "rewards/VisualizationJSONCombinedORM/std": 0.07831697165966034, "step": 4187, "train_speed(iter/s)": 0.195517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 292.5625, "completions/min_length": 226.0, "epoch": 3.464019851116625, "grad_norm": 0.221616730093956, "kl": 0.06103515625, "learning_rate": 2.6099780069921043e-06, "loss": 0.0006117671728134155, "memory(GiB)": 38.09, "reward": 0.7532700896263123, "reward_std": 0.08617215603590012, "rewards/VisualizationJSONCombinedORM/mean": 0.7532700896263123, "rewards/VisualizationJSONCombinedORM/std": 0.08429432660341263, "step": 4188, "train_speed(iter/s)": 0.195314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 304.4375, "completions/min_length": 246.0, "epoch": 3.4648469809760134, "grad_norm": 0.22109192609786987, "kl": 0.07427978515625, "learning_rate": 2.607442154493568e-06, "loss": 0.000743180513381958, "memory(GiB)": 38.09, "reward": 0.5644168853759766, "reward_std": 0.05476253852248192, "rewards/VisualizationJSONCombinedORM/mean": 0.5644168853759766, "rewards/VisualizationJSONCombinedORM/std": 0.2694627642631531, "step": 4189, "train_speed(iter/s)": 0.195141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 296.0625, "completions/min_length": 249.0, "epoch": 3.4656741108354012, "grad_norm": 0.2010578066110611, "kl": 0.1114501953125, "learning_rate": 2.6049070999242708e-06, "loss": 0.001112561672925949, "memory(GiB)": 38.09, "reward": 0.6252983808517456, "reward_std": 0.06059949845075607, "rewards/VisualizationJSONCombinedORM/mean": 0.6252983808517456, "rewards/VisualizationJSONCombinedORM/std": 0.17147493362426758, "step": 4190, "train_speed(iter/s)": 0.19496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 272.625, "completions/min_length": 224.0, "epoch": 3.466501240694789, "grad_norm": 0.2335435301065445, "kl": 0.0819091796875, "learning_rate": 2.602372844129668e-06, "loss": 0.0008178651332855225, "memory(GiB)": 38.09, "reward": 0.5886400938034058, "reward_std": 0.08043565601110458, "rewards/VisualizationJSONCombinedORM/mean": 0.5886400938034058, "rewards/VisualizationJSONCombinedORM/std": 0.15862031280994415, "step": 4191, "train_speed(iter/s)": 0.194752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 301.6875, "completions/min_length": 243.0, "epoch": 3.467328370554177, "grad_norm": 0.22515694797039032, "kl": 0.0999755859375, "learning_rate": 2.5998393879549444e-06, "loss": 0.0009983628988265991, "memory(GiB)": 38.09, "reward": 0.3143980801105499, "reward_std": 0.027846071869134903, "rewards/VisualizationJSONCombinedORM/mean": 0.3143980801105499, "rewards/VisualizationJSONCombinedORM/std": 0.03706994280219078, "step": 4192, "train_speed(iter/s)": 0.194591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 287.625, "completions/min_length": 216.0, "epoch": 3.468155500413565, "grad_norm": 0.18075916171073914, "kl": 0.1058349609375, "learning_rate": 2.597306732245021e-06, "loss": 0.0010585663840174675, "memory(GiB)": 38.09, "reward": 0.46845903992652893, "reward_std": 0.036333777010440826, "rewards/VisualizationJSONCombinedORM/mean": 0.46845903992652893, "rewards/VisualizationJSONCombinedORM/std": 0.231139674782753, "step": 4193, "train_speed(iter/s)": 0.19442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 289.0625, "completions/min_length": 239.0, "epoch": 3.468982630272953, "grad_norm": 0.21475842595100403, "kl": 0.1688232421875, "learning_rate": 2.594774877844547e-06, "loss": 0.001684814691543579, "memory(GiB)": 38.09, "reward": 0.6688686609268188, "reward_std": 0.04617717117071152, "rewards/VisualizationJSONCombinedORM/mean": 0.6688686609268188, "rewards/VisualizationJSONCombinedORM/std": 0.07882925122976303, "step": 4194, "train_speed(iter/s)": 0.194291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 306.4375, "completions/min_length": 246.0, "epoch": 3.4698097601323408, "grad_norm": 0.19717486202716827, "kl": 0.08282470703125, "learning_rate": 2.5922438255979125e-06, "loss": 0.0008286647498607635, "memory(GiB)": 38.09, "reward": 0.6355406641960144, "reward_std": 0.07154978811740875, "rewards/VisualizationJSONCombinedORM/mean": 0.6355406641960144, "rewards/VisualizationJSONCombinedORM/std": 0.0911419540643692, "step": 4195, "train_speed(iter/s)": 0.194138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 326.6875, "completions/min_length": 252.0, "epoch": 3.4706368899917286, "grad_norm": 0.3805266320705414, "kl": 0.13653564453125, "learning_rate": 2.589713576349232e-06, "loss": 0.001363452523946762, "memory(GiB)": 38.09, "reward": 0.4012666940689087, "reward_std": 0.04973304271697998, "rewards/VisualizationJSONCombinedORM/mean": 0.4012666940689087, "rewards/VisualizationJSONCombinedORM/std": 0.04817528650164604, "step": 4196, "train_speed(iter/s)": 0.193945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 327.5625, "completions/min_length": 263.0, "epoch": 3.4714640198511164, "grad_norm": 0.19889423251152039, "kl": 0.0479736328125, "learning_rate": 2.5871841309423557e-06, "loss": 0.00048056989908218384, "memory(GiB)": 38.09, "reward": 0.5327118635177612, "reward_std": 0.0471244715154171, "rewards/VisualizationJSONCombinedORM/mean": 0.5327118635177612, "rewards/VisualizationJSONCombinedORM/std": 0.20288459956645966, "step": 4197, "train_speed(iter/s)": 0.19374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 315.8125, "completions/min_length": 247.0, "epoch": 3.4722911497105047, "grad_norm": 0.14786101877689362, "kl": 0.08306884765625, "learning_rate": 2.584655490220866e-06, "loss": 0.0008310899138450623, "memory(GiB)": 38.09, "reward": 0.7225133776664734, "reward_std": 0.16326357424259186, "rewards/VisualizationJSONCombinedORM/mean": 0.7225133776664734, "rewards/VisualizationJSONCombinedORM/std": 0.2066880762577057, "step": 4198, "train_speed(iter/s)": 0.193546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 296.375, "completions/min_length": 226.0, "epoch": 3.4731182795698925, "grad_norm": 0.2058391273021698, "kl": 0.098388671875, "learning_rate": 2.582127655028078e-06, "loss": 0.0009836405515670776, "memory(GiB)": 38.09, "reward": 0.6796143054962158, "reward_std": 0.08299045264720917, "rewards/VisualizationJSONCombinedORM/mean": 0.6796143054962158, "rewards/VisualizationJSONCombinedORM/std": 0.11718021333217621, "step": 4199, "train_speed(iter/s)": 0.193314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 262.8125, "completions/min_length": 212.0, "epoch": 3.4739454094292803, "grad_norm": 0.28766050934791565, "kl": 0.0794677734375, "learning_rate": 2.5796006262070337e-06, "loss": 0.00079319067299366, "memory(GiB)": 38.09, "reward": 0.6299541592597961, "reward_std": 0.05784228444099426, "rewards/VisualizationJSONCombinedORM/mean": 0.6299541592597961, "rewards/VisualizationJSONCombinedORM/std": 0.05625331401824951, "step": 4200, "train_speed(iter/s)": 0.193152 }, { "epoch": 3.4739454094292803, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 357.2083333333333, "eval_completions/mean_length": 300.1197916666667, "eval_completions/min_length": 255.0, "eval_kl": 0.105743408203125, "eval_loss": 0.0010502872755751014, "eval_reward": 0.48244528907040757, "eval_reward_std": 0.057462384924292564, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.48244528907040757, "eval_rewards/VisualizationJSONCombinedORM/std": 0.057462388571972646, "eval_runtime": 306.4673, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 4200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 325.6875, "completions/min_length": 255.0, "epoch": 3.474772539288668, "grad_norm": 0.29748278856277466, "kl": 0.04095458984375, "learning_rate": 2.577074404600511e-06, "loss": 0.0004104524850845337, "memory(GiB)": 38.09, "reward": 0.6390348076820374, "reward_std": 0.08076997846364975, "rewards/VisualizationJSONCombinedORM/mean": 0.6390348076820374, "rewards/VisualizationJSONCombinedORM/std": 0.09065680205821991, "step": 4201, "train_speed(iter/s)": 0.190293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 278.8125, "completions/min_length": 210.0, "epoch": 3.4755996691480564, "grad_norm": 0.29955044388771057, "kl": 0.0736083984375, "learning_rate": 2.574548991051015e-06, "loss": 0.0007363948971033096, "memory(GiB)": 38.09, "reward": 0.559008777141571, "reward_std": 0.08171606063842773, "rewards/VisualizationJSONCombinedORM/mean": 0.559008777141571, "rewards/VisualizationJSONCombinedORM/std": 0.26964250206947327, "step": 4202, "train_speed(iter/s)": 0.190127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 318.875, "completions/min_length": 233.0, "epoch": 3.4764267990074442, "grad_norm": 0.2052912414073944, "kl": 0.1353759765625, "learning_rate": 2.5720243864007866e-06, "loss": 0.001356232911348343, "memory(GiB)": 38.09, "reward": 0.4442986845970154, "reward_std": 0.06228846311569214, "rewards/VisualizationJSONCombinedORM/mean": 0.4442986845970154, "rewards/VisualizationJSONCombinedORM/std": 0.08992493897676468, "step": 4203, "train_speed(iter/s)": 0.189904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 331.6875, "completions/min_length": 254.0, "epoch": 3.477253928866832, "grad_norm": 0.25737282633781433, "kl": 0.07305908203125, "learning_rate": 2.569500591491786e-06, "loss": 0.0007305890321731567, "memory(GiB)": 38.09, "reward": 0.6004287004470825, "reward_std": 0.0779537484049797, "rewards/VisualizationJSONCombinedORM/mean": 0.6004287004470825, "rewards/VisualizationJSONCombinedORM/std": 0.11008778214454651, "step": 4204, "train_speed(iter/s)": 0.189731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 294.4375, "completions/min_length": 241.0, "epoch": 3.47808105872622, "grad_norm": 0.23457184433937073, "kl": 0.052734375, "learning_rate": 2.5669776071657194e-06, "loss": 0.0005284268409013748, "memory(GiB)": 38.09, "reward": 0.4811391234397888, "reward_std": 0.023321466520428658, "rewards/VisualizationJSONCombinedORM/mean": 0.4811391234397888, "rewards/VisualizationJSONCombinedORM/std": 0.1935514509677887, "step": 4205, "train_speed(iter/s)": 0.189563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 302.8125, "completions/min_length": 226.0, "epoch": 3.478908188585608, "grad_norm": 0.18136052787303925, "kl": 0.08642578125, "learning_rate": 2.5644554342640084e-06, "loss": 0.000863589346408844, "memory(GiB)": 38.09, "reward": 0.6698033213615417, "reward_std": 0.07413087785243988, "rewards/VisualizationJSONCombinedORM/mean": 0.6698033213615417, "rewards/VisualizationJSONCombinedORM/std": 0.08358728885650635, "step": 4206, "train_speed(iter/s)": 0.189398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 301.8125, "completions/min_length": 225.0, "epoch": 3.479735318444996, "grad_norm": 0.17952942848205566, "kl": 0.114990234375, "learning_rate": 2.5619340736278096e-06, "loss": 0.001149304211139679, "memory(GiB)": 38.09, "reward": 0.5176770687103271, "reward_std": 0.05867888033390045, "rewards/VisualizationJSONCombinedORM/mean": 0.5176770687103271, "rewards/VisualizationJSONCombinedORM/std": 0.24917902052402496, "step": 4207, "train_speed(iter/s)": 0.189199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 282.25, "completions/min_length": 207.0, "epoch": 3.4805624483043838, "grad_norm": 0.22204791009426117, "kl": 0.1160888671875, "learning_rate": 2.5594135260980108e-06, "loss": 0.0011630803346633911, "memory(GiB)": 38.09, "reward": 0.223908931016922, "reward_std": 0.020519882440567017, "rewards/VisualizationJSONCombinedORM/mean": 0.223908931016922, "rewards/VisualizationJSONCombinedORM/std": 0.021329384297132492, "step": 4208, "train_speed(iter/s)": 0.188998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 309.1875, "completions/min_length": 249.0, "epoch": 3.4813895781637716, "grad_norm": 0.19971661269664764, "kl": 0.05242919921875, "learning_rate": 2.5568937925152272e-06, "loss": 0.0005243569612503052, "memory(GiB)": 38.09, "reward": 0.30775436758995056, "reward_std": 0.030872922390699387, "rewards/VisualizationJSONCombinedORM/mean": 0.30775436758995056, "rewards/VisualizationJSONCombinedORM/std": 0.034829769283533096, "step": 4209, "train_speed(iter/s)": 0.188781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 308.8125, "completions/min_length": 236.0, "epoch": 3.4822167080231594, "grad_norm": 0.1804787665605545, "kl": 0.0723876953125, "learning_rate": 2.5543748737197953e-06, "loss": 0.0007267966866493225, "memory(GiB)": 38.09, "reward": 0.4624217450618744, "reward_std": 0.05899709090590477, "rewards/VisualizationJSONCombinedORM/mean": 0.4624217450618744, "rewards/VisualizationJSONCombinedORM/std": 0.08093159645795822, "step": 4210, "train_speed(iter/s)": 0.188625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 298.0625, "completions/min_length": 229.0, "epoch": 3.4830438378825477, "grad_norm": 0.1951235979795456, "kl": 0.051025390625, "learning_rate": 2.551856770551795e-06, "loss": 0.0005115307867527008, "memory(GiB)": 38.09, "reward": 0.4817120134830475, "reward_std": 0.0664631575345993, "rewards/VisualizationJSONCombinedORM/mean": 0.4817120134830475, "rewards/VisualizationJSONCombinedORM/std": 0.08634751290082932, "step": 4211, "train_speed(iter/s)": 0.188488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 278.0, "completions/min_length": 222.0, "epoch": 3.4838709677419355, "grad_norm": 0.23054176568984985, "kl": 0.09619140625, "learning_rate": 2.54933948385102e-06, "loss": 0.0009635128080844879, "memory(GiB)": 38.09, "reward": 0.6679802536964417, "reward_std": 0.10769236832857132, "rewards/VisualizationJSONCombinedORM/mean": 0.6679802536964417, "rewards/VisualizationJSONCombinedORM/std": 0.10668819397687912, "step": 4212, "train_speed(iter/s)": 0.18829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 322.9375, "completions/min_length": 259.0, "epoch": 3.4846980976013233, "grad_norm": 0.17075547575950623, "kl": 0.07861328125, "learning_rate": 2.546823014456998e-06, "loss": 0.0007857903838157654, "memory(GiB)": 38.09, "reward": 0.5025858879089355, "reward_std": 0.06395958364009857, "rewards/VisualizationJSONCombinedORM/mean": 0.5025858879089355, "rewards/VisualizationJSONCombinedORM/std": 0.07292229682207108, "step": 4213, "train_speed(iter/s)": 0.188082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 321.375, "completions/min_length": 263.0, "epoch": 3.485525227460711, "grad_norm": 0.24600635468959808, "kl": 0.03814697265625, "learning_rate": 2.5443073632089843e-06, "loss": 0.00038063153624534607, "memory(GiB)": 38.09, "reward": 0.528019905090332, "reward_std": 0.04664665833115578, "rewards/VisualizationJSONCombinedORM/mean": 0.528019905090332, "rewards/VisualizationJSONCombinedORM/std": 0.06811846792697906, "step": 4214, "train_speed(iter/s)": 0.1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 314.0, "completions/min_length": 253.0, "epoch": 3.4863523573200994, "grad_norm": 0.166708841919899, "kl": 0.0440673828125, "learning_rate": 2.5417925309459623e-06, "loss": 0.0004413686692714691, "memory(GiB)": 38.09, "reward": 0.5538742542266846, "reward_std": 0.04818776249885559, "rewards/VisualizationJSONCombinedORM/mean": 0.5538742542266846, "rewards/VisualizationJSONCombinedORM/std": 0.06898023188114166, "step": 4215, "train_speed(iter/s)": 0.18774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 305.4375, "completions/min_length": 219.0, "epoch": 3.4871794871794872, "grad_norm": 0.21438400447368622, "kl": 0.050537109375, "learning_rate": 2.5392785185066356e-06, "loss": 0.0005061253905296326, "memory(GiB)": 38.09, "reward": 0.6511267423629761, "reward_std": 0.03880536928772926, "rewards/VisualizationJSONCombinedORM/mean": 0.6511267423629761, "rewards/VisualizationJSONCombinedORM/std": 0.08090534061193466, "step": 4216, "train_speed(iter/s)": 0.187557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 305.25, "completions/min_length": 246.0, "epoch": 3.488006617038875, "grad_norm": 0.19471590220928192, "kl": 0.04046630859375, "learning_rate": 2.5367653267294413e-06, "loss": 0.0004048459231853485, "memory(GiB)": 38.09, "reward": 0.48746398091316223, "reward_std": 0.050837136805057526, "rewards/VisualizationJSONCombinedORM/mean": 0.48746398091316223, "rewards/VisualizationJSONCombinedORM/std": 0.0846569612622261, "step": 4217, "train_speed(iter/s)": 0.187393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 300.3125, "completions/min_length": 235.0, "epoch": 3.488833746898263, "grad_norm": 0.24498310685157776, "kl": 0.05401611328125, "learning_rate": 2.5342529564525414e-06, "loss": 0.0005397088825702667, "memory(GiB)": 38.09, "reward": 0.7567881345748901, "reward_std": 0.07581029832363129, "rewards/VisualizationJSONCombinedORM/mean": 0.7567881345748901, "rewards/VisualizationJSONCombinedORM/std": 0.08135467767715454, "step": 4218, "train_speed(iter/s)": 0.187213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 305.625, "completions/min_length": 247.0, "epoch": 3.489660876757651, "grad_norm": 0.19162532687187195, "kl": 0.133544921875, "learning_rate": 2.5317414085138243e-06, "loss": 0.001332007348537445, "memory(GiB)": 38.09, "reward": 0.5885622501373291, "reward_std": 0.05258028954267502, "rewards/VisualizationJSONCombinedORM/mean": 0.5885622501373291, "rewards/VisualizationJSONCombinedORM/std": 0.09714639186859131, "step": 4219, "train_speed(iter/s)": 0.187041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 307.9375, "completions/min_length": 269.0, "epoch": 3.490488006617039, "grad_norm": 0.18477173149585724, "kl": 0.08038330078125, "learning_rate": 2.529230683750897e-06, "loss": 0.0008040294051170349, "memory(GiB)": 38.09, "reward": 0.5625322461128235, "reward_std": 0.052503593266010284, "rewards/VisualizationJSONCombinedORM/mean": 0.5625322461128235, "rewards/VisualizationJSONCombinedORM/std": 0.06107346713542938, "step": 4220, "train_speed(iter/s)": 0.1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 281.25, "completions/min_length": 239.0, "epoch": 3.4913151364764268, "grad_norm": 0.1832769215106964, "kl": 0.0716552734375, "learning_rate": 2.526720783001107e-06, "loss": 0.0007175467908382416, "memory(GiB)": 38.09, "reward": 0.660101592540741, "reward_std": 0.03701743111014366, "rewards/VisualizationJSONCombinedORM/mean": 0.660101592540741, "rewards/VisualizationJSONCombinedORM/std": 0.18132975697517395, "step": 4221, "train_speed(iter/s)": 0.186703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 273.6875, "completions/min_length": 219.0, "epoch": 3.4921422663358146, "grad_norm": 0.24501419067382812, "kl": 0.089599609375, "learning_rate": 2.524211707101511e-06, "loss": 0.0008966661989688873, "memory(GiB)": 38.09, "reward": 0.6203962564468384, "reward_std": 0.05914079025387764, "rewards/VisualizationJSONCombinedORM/mean": 0.6203962564468384, "rewards/VisualizationJSONCombinedORM/std": 0.15977370738983154, "step": 4222, "train_speed(iter/s)": 0.186562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 293.375, "completions/min_length": 237.0, "epoch": 3.4929693961952024, "grad_norm": 0.1864316612482071, "kl": 0.080322265625, "learning_rate": 2.5217034568889e-06, "loss": 0.0008027777075767517, "memory(GiB)": 38.09, "reward": 0.5115736126899719, "reward_std": 0.05281049758195877, "rewards/VisualizationJSONCombinedORM/mean": 0.5115736126899719, "rewards/VisualizationJSONCombinedORM/std": 0.19446204602718353, "step": 4223, "train_speed(iter/s)": 0.186412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 286.1875, "completions/min_length": 247.0, "epoch": 3.4937965260545907, "grad_norm": 0.3194570541381836, "kl": 0.0738525390625, "learning_rate": 2.519196033199786e-06, "loss": 0.0007382482290267944, "memory(GiB)": 38.09, "reward": 0.4666372239589691, "reward_std": 0.07065387070178986, "rewards/VisualizationJSONCombinedORM/mean": 0.4666372239589691, "rewards/VisualizationJSONCombinedORM/std": 0.1530260443687439, "step": 4224, "train_speed(iter/s)": 0.186253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 321.5, "completions/min_length": 272.0, "epoch": 3.4946236559139785, "grad_norm": 0.18578346073627472, "kl": 0.0694580078125, "learning_rate": 2.51668943687041e-06, "loss": 0.0006961077451705933, "memory(GiB)": 38.09, "reward": 0.47786521911621094, "reward_std": 0.05222127214074135, "rewards/VisualizationJSONCombinedORM/mean": 0.47786521911621094, "rewards/VisualizationJSONCombinedORM/std": 0.18527686595916748, "step": 4225, "train_speed(iter/s)": 0.186126 }, { "epoch": 3.4946236559139785, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 357.7916666666667, "eval_completions/mean_length": 299.4791666666667, "eval_completions/min_length": 252.04166666666666, "eval_kl": 0.056976318359375, "eval_loss": 0.0005704077775590122, "eval_reward": 0.42965220908323926, "eval_reward_std": 0.04402969951964527, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.42965220908323926, "eval_rewards/VisualizationJSONCombinedORM/std": 0.044029697918934595, "eval_runtime": 306.4794, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 4225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 286.5625, "completions/min_length": 231.0, "epoch": 3.4954507857733663, "grad_norm": 0.19502031803131104, "kl": 0.03485107421875, "learning_rate": 2.5141836687367273e-06, "loss": 0.00034864433109760284, "memory(GiB)": 38.09, "reward": 0.4425848722457886, "reward_std": 0.038161877542734146, "rewards/VisualizationJSONCombinedORM/mean": 0.4425848722457886, "rewards/VisualizationJSONCombinedORM/std": 0.17163677513599396, "step": 4226, "train_speed(iter/s)": 0.183481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 303.6875, "completions/min_length": 229.0, "epoch": 3.4962779156327546, "grad_norm": 0.48021411895751953, "kl": 0.31121826171875, "learning_rate": 2.5116787296344313e-06, "loss": 0.003109227865934372, "memory(GiB)": 38.09, "reward": 0.3972685933113098, "reward_std": 0.034791115671396255, "rewards/VisualizationJSONCombinedORM/mean": 0.3972685933113098, "rewards/VisualizationJSONCombinedORM/std": 0.057131364941596985, "step": 4227, "train_speed(iter/s)": 0.183286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 308.8125, "completions/min_length": 247.0, "epoch": 3.4971050454921424, "grad_norm": 0.1990402340888977, "kl": 0.115478515625, "learning_rate": 2.509174620398924e-06, "loss": 0.0011540856212377548, "memory(GiB)": 38.09, "reward": 0.41316092014312744, "reward_std": 0.03753391280770302, "rewards/VisualizationJSONCombinedORM/mean": 0.41316092014312744, "rewards/VisualizationJSONCombinedORM/std": 0.03884849324822426, "step": 4228, "train_speed(iter/s)": 0.183122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 295.9375, "completions/min_length": 219.0, "epoch": 3.4979321753515302, "grad_norm": 0.20986750721931458, "kl": 0.08740234375, "learning_rate": 2.506671341865341e-06, "loss": 0.0008734911680221558, "memory(GiB)": 38.09, "reward": 0.3270009160041809, "reward_std": 0.03180820122361183, "rewards/VisualizationJSONCombinedORM/mean": 0.3270009160041809, "rewards/VisualizationJSONCombinedORM/std": 0.06840692460536957, "step": 4229, "train_speed(iter/s)": 0.182968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 293.9375, "completions/min_length": 234.0, "epoch": 3.498759305210918, "grad_norm": 0.23307017982006073, "kl": 0.07855224609375, "learning_rate": 2.5041688948685367e-06, "loss": 0.0007852762937545776, "memory(GiB)": 38.09, "reward": 0.4188937842845917, "reward_std": 0.03571189194917679, "rewards/VisualizationJSONCombinedORM/mean": 0.4188937842845917, "rewards/VisualizationJSONCombinedORM/std": 0.0654272735118866, "step": 4230, "train_speed(iter/s)": 0.182831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 270.1875, "completions/min_length": 205.0, "epoch": 3.499586435070306, "grad_norm": 0.1780574470758438, "kl": 0.085205078125, "learning_rate": 2.5016672802430904e-06, "loss": 0.0008520632982254028, "memory(GiB)": 38.09, "reward": 0.4202004075050354, "reward_std": 0.056016724556684494, "rewards/VisualizationJSONCombinedORM/mean": 0.4202004075050354, "rewards/VisualizationJSONCombinedORM/std": 0.0572323203086853, "step": 4231, "train_speed(iter/s)": 0.182701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 302.25, "completions/min_length": 242.0, "epoch": 3.500413564929694, "grad_norm": 0.1736358106136322, "kl": 0.048095703125, "learning_rate": 2.499166498823299e-06, "loss": 0.0004808977246284485, "memory(GiB)": 38.09, "reward": 0.5809221863746643, "reward_std": 0.04317416250705719, "rewards/VisualizationJSONCombinedORM/mean": 0.5809221863746643, "rewards/VisualizationJSONCombinedORM/std": 0.0675077810883522, "step": 4232, "train_speed(iter/s)": 0.18251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 291.9375, "completions/min_length": 217.0, "epoch": 3.501240694789082, "grad_norm": 0.179901123046875, "kl": 0.04156494140625, "learning_rate": 2.4966665514431863e-06, "loss": 0.0004154294729232788, "memory(GiB)": 38.09, "reward": 0.30765464901924133, "reward_std": 0.030673891305923462, "rewards/VisualizationJSONCombinedORM/mean": 0.30765464901924133, "rewards/VisualizationJSONCombinedORM/std": 0.08987594395875931, "step": 4233, "train_speed(iter/s)": 0.182371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 302.375, "completions/min_length": 240.0, "epoch": 3.5020678246484698, "grad_norm": 0.200156569480896, "kl": 0.085693359375, "learning_rate": 2.4941674389364974e-06, "loss": 0.0008561722934246063, "memory(GiB)": 38.09, "reward": 0.30083227157592773, "reward_std": 0.03699976205825806, "rewards/VisualizationJSONCombinedORM/mean": 0.30083227157592773, "rewards/VisualizationJSONCombinedORM/std": 0.04127175360918045, "step": 4234, "train_speed(iter/s)": 0.182208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 295.0, "completions/min_length": 225.0, "epoch": 3.5028949545078576, "grad_norm": 0.16329911351203918, "kl": 0.0335693359375, "learning_rate": 2.4916691621366984e-06, "loss": 0.00033571571111679077, "memory(GiB)": 38.09, "reward": 0.6131401658058167, "reward_std": 0.06498757004737854, "rewards/VisualizationJSONCombinedORM/mean": 0.6131401658058167, "rewards/VisualizationJSONCombinedORM/std": 0.08151140809059143, "step": 4235, "train_speed(iter/s)": 0.182038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 293.875, "completions/min_length": 229.0, "epoch": 3.5037220843672454, "grad_norm": 0.18595172464847565, "kl": 0.0513916015625, "learning_rate": 2.4891717218769722e-06, "loss": 0.0005145072937011719, "memory(GiB)": 38.09, "reward": 0.3928857445716858, "reward_std": 0.03763257712125778, "rewards/VisualizationJSONCombinedORM/mean": 0.3928857445716858, "rewards/VisualizationJSONCombinedORM/std": 0.03681287169456482, "step": 4236, "train_speed(iter/s)": 0.181852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 307.5, "completions/min_length": 246.0, "epoch": 3.5045492142266337, "grad_norm": 0.18436893820762634, "kl": 0.031341552734375, "learning_rate": 2.486675118990233e-06, "loss": 0.00031285360455513, "memory(GiB)": 38.09, "reward": 0.448038786649704, "reward_std": 0.04473840445280075, "rewards/VisualizationJSONCombinedORM/mean": 0.448038786649704, "rewards/VisualizationJSONCombinedORM/std": 0.08291970938444138, "step": 4237, "train_speed(iter/s)": 0.181696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 276.75, "completions/min_length": 210.0, "epoch": 3.5053763440860215, "grad_norm": 0.19679522514343262, "kl": 0.0345458984375, "learning_rate": 2.4841793543091053e-06, "loss": 0.00034549832344055176, "memory(GiB)": 38.09, "reward": 0.524441123008728, "reward_std": 0.05480070039629936, "rewards/VisualizationJSONCombinedORM/mean": 0.524441123008728, "rewards/VisualizationJSONCombinedORM/std": 0.28504472970962524, "step": 4238, "train_speed(iter/s)": 0.181531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 303.625, "completions/min_length": 247.0, "epoch": 3.5062034739454093, "grad_norm": 0.18947754800319672, "kl": 0.05224609375, "learning_rate": 2.4816844286659387e-06, "loss": 0.0005232244729995728, "memory(GiB)": 38.09, "reward": 0.3450854420661926, "reward_std": 0.030578400939702988, "rewards/VisualizationJSONCombinedORM/mean": 0.3450854420661926, "rewards/VisualizationJSONCombinedORM/std": 0.03231433779001236, "step": 4239, "train_speed(iter/s)": 0.181363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 290.625, "completions/min_length": 241.0, "epoch": 3.5070306038047976, "grad_norm": 0.2279254049062729, "kl": 0.05078125, "learning_rate": 2.479190342892804e-06, "loss": 0.0005089715123176575, "memory(GiB)": 38.09, "reward": 0.4289594292640686, "reward_std": 0.01979324221611023, "rewards/VisualizationJSONCombinedORM/mean": 0.4289594292640686, "rewards/VisualizationJSONCombinedORM/std": 0.21069279313087463, "step": 4240, "train_speed(iter/s)": 0.181223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 318.4375, "completions/min_length": 254.0, "epoch": 3.5078577336641854, "grad_norm": 0.15711914002895355, "kl": 0.05572509765625, "learning_rate": 2.476697097821491e-06, "loss": 0.0005557052791118622, "memory(GiB)": 38.09, "reward": 0.4143257141113281, "reward_std": 0.026600584387779236, "rewards/VisualizationJSONCombinedORM/mean": 0.4143257141113281, "rewards/VisualizationJSONCombinedORM/std": 0.036347270011901855, "step": 4241, "train_speed(iter/s)": 0.181098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 287.0625, "completions/min_length": 224.0, "epoch": 3.5086848635235732, "grad_norm": 0.16642603278160095, "kl": 0.045166015625, "learning_rate": 2.4742046942835035e-06, "loss": 0.0004511997103691101, "memory(GiB)": 38.09, "reward": 0.5862447023391724, "reward_std": 0.06702196598052979, "rewards/VisualizationJSONCombinedORM/mean": 0.5862447023391724, "rewards/VisualizationJSONCombinedORM/std": 0.09626927226781845, "step": 4242, "train_speed(iter/s)": 0.180959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 311.5, "completions/min_length": 220.0, "epoch": 3.509511993382961, "grad_norm": 0.2007083147764206, "kl": 0.0518798828125, "learning_rate": 2.471713133110078e-06, "loss": 0.0005178675055503845, "memory(GiB)": 38.09, "reward": 0.6350228786468506, "reward_std": 0.09651261568069458, "rewards/VisualizationJSONCombinedORM/mean": 0.6350228786468506, "rewards/VisualizationJSONCombinedORM/std": 0.09564556181430817, "step": 4243, "train_speed(iter/s)": 0.180788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 283.8125, "completions/min_length": 203.0, "epoch": 3.510339123242349, "grad_norm": 0.2098742574453354, "kl": 0.0682373046875, "learning_rate": 2.4692224151321555e-06, "loss": 0.0006834641098976135, "memory(GiB)": 38.09, "reward": 0.5089876055717468, "reward_std": 0.06892332434654236, "rewards/VisualizationJSONCombinedORM/mean": 0.5089876055717468, "rewards/VisualizationJSONCombinedORM/std": 0.11312918365001678, "step": 4244, "train_speed(iter/s)": 0.180662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 313.6875, "completions/min_length": 234.0, "epoch": 3.511166253101737, "grad_norm": 0.19507557153701782, "kl": 0.08233642578125, "learning_rate": 2.466732541180404e-06, "loss": 0.0008235089480876923, "memory(GiB)": 38.09, "reward": 0.29197829961776733, "reward_std": 0.02992812730371952, "rewards/VisualizationJSONCombinedORM/mean": 0.29197829961776733, "rewards/VisualizationJSONCombinedORM/std": 0.0401490181684494, "step": 4245, "train_speed(iter/s)": 0.180495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 307.5, "completions/min_length": 247.0, "epoch": 3.511993382961125, "grad_norm": 0.16650155186653137, "kl": 0.036895751953125, "learning_rate": 2.4642435120852076e-06, "loss": 0.00036793947219848633, "memory(GiB)": 38.09, "reward": 0.5701613426208496, "reward_std": 0.023007744923233986, "rewards/VisualizationJSONCombinedORM/mean": 0.5701613426208496, "rewards/VisualizationJSONCombinedORM/std": 0.10375302284955978, "step": 4246, "train_speed(iter/s)": 0.180344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 307.8125, "completions/min_length": 228.0, "epoch": 3.5128205128205128, "grad_norm": 0.17683738470077515, "kl": 0.07080078125, "learning_rate": 2.461755328676672e-06, "loss": 0.0007080715149641037, "memory(GiB)": 38.09, "reward": 0.5254108905792236, "reward_std": 0.051482293754816055, "rewards/VisualizationJSONCombinedORM/mean": 0.5254108905792236, "rewards/VisualizationJSONCombinedORM/std": 0.17042477428913116, "step": 4247, "train_speed(iter/s)": 0.180195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 298.0, "completions/min_length": 234.0, "epoch": 3.5136476426799006, "grad_norm": 0.20253734290599823, "kl": 0.0579833984375, "learning_rate": 2.459267991784614e-06, "loss": 0.0005794186145067215, "memory(GiB)": 38.09, "reward": 0.44988417625427246, "reward_std": 0.036888495087623596, "rewards/VisualizationJSONCombinedORM/mean": 0.44988417625427246, "rewards/VisualizationJSONCombinedORM/std": 0.03946923092007637, "step": 4248, "train_speed(iter/s)": 0.180022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 305.9375, "completions/min_length": 255.0, "epoch": 3.5144747725392884, "grad_norm": 0.2076679915189743, "kl": 0.046142578125, "learning_rate": 2.456781502238574e-06, "loss": 0.00046241655945777893, "memory(GiB)": 38.09, "reward": 0.4066026210784912, "reward_std": 0.03997064754366875, "rewards/VisualizationJSONCombinedORM/mean": 0.4066026210784912, "rewards/VisualizationJSONCombinedORM/std": 0.06793498247861862, "step": 4249, "train_speed(iter/s)": 0.179887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 277.5625, "completions/min_length": 202.0, "epoch": 3.5153019023986767, "grad_norm": 0.17226941883563995, "kl": 0.095458984375, "learning_rate": 2.4542958608678075e-06, "loss": 0.0009560734033584595, "memory(GiB)": 38.09, "reward": 0.33772939443588257, "reward_std": 0.03752344846725464, "rewards/VisualizationJSONCombinedORM/mean": 0.33772939443588257, "rewards/VisualizationJSONCombinedORM/std": 0.08343689143657684, "step": 4250, "train_speed(iter/s)": 0.179708 }, { "epoch": 3.5153019023986767, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 357.9166666666667, "eval_completions/mean_length": 294.1354166666667, "eval_completions/min_length": 248.83333333333334, "eval_kl": 0.057342529296875, "eval_loss": 0.0005773628945462406, "eval_reward": 0.42998154896001023, "eval_reward_std": 0.04443747893674299, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.42998154896001023, "eval_rewards/VisualizationJSONCombinedORM/std": 0.044437480721777924, "eval_runtime": 306.546, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 4250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 287.6875, "completions/min_length": 251.0, "epoch": 3.5161290322580645, "grad_norm": 0.20617738366127014, "kl": 0.0587158203125, "learning_rate": 2.4518110685012897e-06, "loss": 0.0005874484777450562, "memory(GiB)": 38.09, "reward": 0.5533851385116577, "reward_std": 0.03391101956367493, "rewards/VisualizationJSONCombinedORM/mean": 0.5533851385116577, "rewards/VisualizationJSONCombinedORM/std": 0.2628258168697357, "step": 4251, "train_speed(iter/s)": 0.177256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 304.0625, "completions/min_length": 226.0, "epoch": 3.5169561621174523, "grad_norm": 0.17122066020965576, "kl": 0.0843505859375, "learning_rate": 2.4493271259677047e-06, "loss": 0.0008453764021396637, "memory(GiB)": 38.09, "reward": 0.625400960445404, "reward_std": 0.030610837042331696, "rewards/VisualizationJSONCombinedORM/mean": 0.625400960445404, "rewards/VisualizationJSONCombinedORM/std": 0.14860936999320984, "step": 4252, "train_speed(iter/s)": 0.177107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 274.8125, "completions/min_length": 206.0, "epoch": 3.5177832919768406, "grad_norm": 0.17887969315052032, "kl": 0.04046630859375, "learning_rate": 2.4468440340954664e-06, "loss": 0.00040484964847564697, "memory(GiB)": 38.09, "reward": 0.5735102891921997, "reward_std": 0.035880617797374725, "rewards/VisualizationJSONCombinedORM/mean": 0.5735102891921997, "rewards/VisualizationJSONCombinedORM/std": 0.17088720202445984, "step": 4253, "train_speed(iter/s)": 0.176975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 284.5, "completions/min_length": 232.0, "epoch": 3.5186104218362284, "grad_norm": 0.19718551635742188, "kl": 0.0948486328125, "learning_rate": 2.4443617937126923e-06, "loss": 0.000948932021856308, "memory(GiB)": 38.09, "reward": 0.545441210269928, "reward_std": 0.05171950161457062, "rewards/VisualizationJSONCombinedORM/mean": 0.545441210269928, "rewards/VisualizationJSONCombinedORM/std": 0.22004219889640808, "step": 4254, "train_speed(iter/s)": 0.176798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 295.9375, "completions/min_length": 232.0, "epoch": 3.5194375516956162, "grad_norm": 0.20099782943725586, "kl": 0.050048828125, "learning_rate": 2.4418804056472228e-06, "loss": 0.0005008988082408905, "memory(GiB)": 38.09, "reward": 0.49226897954940796, "reward_std": 0.05600123852491379, "rewards/VisualizationJSONCombinedORM/mean": 0.49226897954940796, "rewards/VisualizationJSONCombinedORM/std": 0.16124841570854187, "step": 4255, "train_speed(iter/s)": 0.176653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 296.375, "completions/min_length": 227.0, "epoch": 3.520264681555004, "grad_norm": 0.21071752905845642, "kl": 0.052978515625, "learning_rate": 2.4393998707266124e-06, "loss": 0.0005296152085065842, "memory(GiB)": 38.09, "reward": 0.7430809736251831, "reward_std": 0.06902673840522766, "rewards/VisualizationJSONCombinedORM/mean": 0.7430809736251831, "rewards/VisualizationJSONCombinedORM/std": 0.07099010050296783, "step": 4256, "train_speed(iter/s)": 0.176502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 279.0625, "completions/min_length": 230.0, "epoch": 3.521091811414392, "grad_norm": 0.2609984874725342, "kl": 0.05792236328125, "learning_rate": 2.436920189778134e-06, "loss": 0.0005794316530227661, "memory(GiB)": 38.09, "reward": 0.4327763319015503, "reward_std": 0.03845818340778351, "rewards/VisualizationJSONCombinedORM/mean": 0.4327763319015503, "rewards/VisualizationJSONCombinedORM/std": 0.07211165875196457, "step": 4257, "train_speed(iter/s)": 0.176356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 301.25, "completions/min_length": 217.0, "epoch": 3.52191894127378, "grad_norm": 0.2043454796075821, "kl": 0.0693359375, "learning_rate": 2.434441363628765e-06, "loss": 0.0006954371929168701, "memory(GiB)": 38.09, "reward": 0.5791391730308533, "reward_std": 0.059253837913274765, "rewards/VisualizationJSONCombinedORM/mean": 0.5791391730308533, "rewards/VisualizationJSONCombinedORM/std": 0.10941345244646072, "step": 4258, "train_speed(iter/s)": 0.176174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 306.0, "completions/min_length": 246.0, "epoch": 3.522746071133168, "grad_norm": 0.2317187637090683, "kl": 0.039794921875, "learning_rate": 2.431963393105215e-06, "loss": 0.00039762258529663086, "memory(GiB)": 38.09, "reward": 0.6369837522506714, "reward_std": 0.04219885170459747, "rewards/VisualizationJSONCombinedORM/mean": 0.6369837522506714, "rewards/VisualizationJSONCombinedORM/std": 0.18146707117557526, "step": 4259, "train_speed(iter/s)": 0.175998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 285.3125, "completions/min_length": 249.0, "epoch": 3.5235732009925558, "grad_norm": 0.19068855047225952, "kl": 0.106689453125, "learning_rate": 2.429486279033892e-06, "loss": 0.001064881682395935, "memory(GiB)": 38.09, "reward": 0.5636167526245117, "reward_std": 0.06275439262390137, "rewards/VisualizationJSONCombinedORM/mean": 0.5636167526245117, "rewards/VisualizationJSONCombinedORM/std": 0.10156773030757904, "step": 4260, "train_speed(iter/s)": 0.175859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 284.0, "completions/min_length": 226.0, "epoch": 3.5244003308519436, "grad_norm": 0.18635644018650055, "kl": 0.05377197265625, "learning_rate": 2.4270100222409275e-06, "loss": 0.000538308173418045, "memory(GiB)": 38.09, "reward": 0.7411434650421143, "reward_std": 0.0697043240070343, "rewards/VisualizationJSONCombinedORM/mean": 0.7411434650421143, "rewards/VisualizationJSONCombinedORM/std": 0.0673452615737915, "step": 4261, "train_speed(iter/s)": 0.175722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 277.9375, "completions/min_length": 226.0, "epoch": 3.5252274607113314, "grad_norm": 0.20915694534778595, "kl": 0.0634765625, "learning_rate": 2.424534623552165e-06, "loss": 0.0006338953971862793, "memory(GiB)": 38.09, "reward": 0.7600463032722473, "reward_std": 0.09858682751655579, "rewards/VisualizationJSONCombinedORM/mean": 0.7600463032722473, "rewards/VisualizationJSONCombinedORM/std": 0.10309374332427979, "step": 4262, "train_speed(iter/s)": 0.175554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 301.6875, "completions/min_length": 244.0, "epoch": 3.5260545905707197, "grad_norm": 0.19878549873828888, "kl": 0.0657958984375, "learning_rate": 2.4220600837931614e-06, "loss": 0.0006573908030986786, "memory(GiB)": 38.09, "reward": 0.6257449984550476, "reward_std": 0.04611315578222275, "rewards/VisualizationJSONCombinedORM/mean": 0.6257449984550476, "rewards/VisualizationJSONCombinedORM/std": 0.12626366317272186, "step": 4263, "train_speed(iter/s)": 0.175378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 294.25, "completions/min_length": 222.0, "epoch": 3.5268817204301075, "grad_norm": 0.17505933344364166, "kl": 0.04681396484375, "learning_rate": 2.4195864037891886e-06, "loss": 0.0004680454730987549, "memory(GiB)": 38.09, "reward": 0.3925623893737793, "reward_std": 0.019272126257419586, "rewards/VisualizationJSONCombinedORM/mean": 0.3925623893737793, "rewards/VisualizationJSONCombinedORM/std": 0.031648583710193634, "step": 4264, "train_speed(iter/s)": 0.175205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 284.8125, "completions/min_length": 223.0, "epoch": 3.5277088502894953, "grad_norm": 0.21102039515972137, "kl": 0.10211181640625, "learning_rate": 2.4171135843652256e-06, "loss": 0.001021258533000946, "memory(GiB)": 38.09, "reward": 0.6035839319229126, "reward_std": 0.08939904719591141, "rewards/VisualizationJSONCombinedORM/mean": 0.6035839319229126, "rewards/VisualizationJSONCombinedORM/std": 0.1519239842891693, "step": 4265, "train_speed(iter/s)": 0.175076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 291.5625, "completions/min_length": 237.0, "epoch": 3.5285359801488836, "grad_norm": 0.22552408277988434, "kl": 0.11181640625, "learning_rate": 2.4146416263459754e-06, "loss": 0.0011154748499393463, "memory(GiB)": 38.09, "reward": 0.2891335189342499, "reward_std": 0.03103475458920002, "rewards/VisualizationJSONCombinedORM/mean": 0.2891335189342499, "rewards/VisualizationJSONCombinedORM/std": 0.0624898336827755, "step": 4266, "train_speed(iter/s)": 0.174884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 331.9375, "completions/min_length": 257.0, "epoch": 3.5293631100082714, "grad_norm": 0.17029139399528503, "kl": 0.0616455078125, "learning_rate": 2.412170530555844e-06, "loss": 0.0006163343787193298, "memory(GiB)": 38.09, "reward": 0.6919851899147034, "reward_std": 0.06965179741382599, "rewards/VisualizationJSONCombinedORM/mean": 0.6919851899147034, "rewards/VisualizationJSONCombinedORM/std": 0.08882444351911545, "step": 4267, "train_speed(iter/s)": 0.17471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 286.25, "completions/min_length": 199.0, "epoch": 3.5301902398676592, "grad_norm": 0.21411873400211334, "kl": 0.112548828125, "learning_rate": 2.409700297818954e-06, "loss": 0.001129206269979477, "memory(GiB)": 38.09, "reward": 0.5173805356025696, "reward_std": 0.050458867102861404, "rewards/VisualizationJSONCombinedORM/mean": 0.5173805356025696, "rewards/VisualizationJSONCombinedORM/std": 0.14426952600479126, "step": 4268, "train_speed(iter/s)": 0.174575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 292.625, "completions/min_length": 225.0, "epoch": 3.531017369727047, "grad_norm": 0.17016349732875824, "kl": 0.0997314453125, "learning_rate": 2.4072309289591394e-06, "loss": 0.0009986460208892822, "memory(GiB)": 38.09, "reward": 0.4589177370071411, "reward_std": 0.04482909291982651, "rewards/VisualizationJSONCombinedORM/mean": 0.4589177370071411, "rewards/VisualizationJSONCombinedORM/std": 0.19076968729496002, "step": 4269, "train_speed(iter/s)": 0.174403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 313.375, "completions/min_length": 213.0, "epoch": 3.531844499586435, "grad_norm": 0.2556258738040924, "kl": 0.108154296875, "learning_rate": 2.4047624247999484e-06, "loss": 0.0010819341987371445, "memory(GiB)": 38.09, "reward": 0.35520312190055847, "reward_std": 0.05459915101528168, "rewards/VisualizationJSONCombinedORM/mean": 0.35520312190055847, "rewards/VisualizationJSONCombinedORM/std": 0.05719660967588425, "step": 4270, "train_speed(iter/s)": 0.174269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 306.625, "completions/min_length": 248.0, "epoch": 3.532671629445823, "grad_norm": 0.18041349947452545, "kl": 0.09149169921875, "learning_rate": 2.4022947861646355e-06, "loss": 0.0009194202721118927, "memory(GiB)": 38.09, "reward": 0.634520947933197, "reward_std": 0.07217025011777878, "rewards/VisualizationJSONCombinedORM/mean": 0.634520947933197, "rewards/VisualizationJSONCombinedORM/std": 0.12471578270196915, "step": 4271, "train_speed(iter/s)": 0.17414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 299.75, "completions/min_length": 250.0, "epoch": 3.533498759305211, "grad_norm": 0.17055825889110565, "kl": 0.04461669921875, "learning_rate": 2.3998280138761715e-06, "loss": 0.00044605880975723267, "memory(GiB)": 38.09, "reward": 0.7845324277877808, "reward_std": 0.02523614838719368, "rewards/VisualizationJSONCombinedORM/mean": 0.7845324277877808, "rewards/VisualizationJSONCombinedORM/std": 0.04093541204929352, "step": 4272, "train_speed(iter/s)": 0.17401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 255.375, "completions/min_length": 200.0, "epoch": 3.5343258891645988, "grad_norm": 0.3024757504463196, "kl": 0.386474609375, "learning_rate": 2.397362108757236e-06, "loss": 0.0038623958826065063, "memory(GiB)": 38.09, "reward": 0.6854052543640137, "reward_std": 0.06183813512325287, "rewards/VisualizationJSONCombinedORM/mean": 0.6854052543640137, "rewards/VisualizationJSONCombinedORM/std": 0.09380420297384262, "step": 4273, "train_speed(iter/s)": 0.173844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 294.5, "completions/min_length": 233.0, "epoch": 3.535153019023987, "grad_norm": 0.18814751505851746, "kl": 0.0731201171875, "learning_rate": 2.3948970716302215e-06, "loss": 0.0007325392216444016, "memory(GiB)": 38.09, "reward": 0.5171592235565186, "reward_std": 0.046522483229637146, "rewards/VisualizationJSONCombinedORM/mean": 0.5171592235565186, "rewards/VisualizationJSONCombinedORM/std": 0.13092444837093353, "step": 4274, "train_speed(iter/s)": 0.173697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 301.625, "completions/min_length": 233.0, "epoch": 3.5359801488833744, "grad_norm": 0.2916709780693054, "kl": 0.16741943359375, "learning_rate": 2.3924329033172246e-06, "loss": 0.0016729924827814102, "memory(GiB)": 38.09, "reward": 0.3798249363899231, "reward_std": 0.05500555783510208, "rewards/VisualizationJSONCombinedORM/mean": 0.3798249363899231, "rewards/VisualizationJSONCombinedORM/std": 0.08147624880075455, "step": 4275, "train_speed(iter/s)": 0.173563 }, { "epoch": 3.5359801488833744, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 351.875, "eval_completions/mean_length": 293.453125, "eval_completions/min_length": 246.29166666666666, "eval_kl": 0.07558186848958333, "eval_loss": 0.0007580903475172818, "eval_reward": 0.4799024562040965, "eval_reward_std": 0.049520094122271985, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4799024562040965, "eval_rewards/VisualizationJSONCombinedORM/std": 0.049520096431175865, "eval_runtime": 303.1884, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 4275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 306.875, "completions/min_length": 256.0, "epoch": 3.5368072787427627, "grad_norm": 0.29400157928466797, "kl": 0.06390380859375, "learning_rate": 2.3899696046400645e-06, "loss": 0.0006395578384399414, "memory(GiB)": 38.09, "reward": 0.3882555365562439, "reward_std": 0.05715419724583626, "rewards/VisualizationJSONCombinedORM/mean": 0.3882555365562439, "rewards/VisualizationJSONCombinedORM/std": 0.0806436762213707, "step": 4276, "train_speed(iter/s)": 0.171303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 276.1875, "completions/min_length": 204.0, "epoch": 3.5376344086021505, "grad_norm": 0.22974415123462677, "kl": 0.09637451171875, "learning_rate": 2.387507176420256e-06, "loss": 0.0009618513286113739, "memory(GiB)": 38.09, "reward": 0.593591034412384, "reward_std": 0.126286581158638, "rewards/VisualizationJSONCombinedORM/mean": 0.593591034412384, "rewards/VisualizationJSONCombinedORM/std": 0.17102129757404327, "step": 4277, "train_speed(iter/s)": 0.171195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 263.25, "completions/min_length": 223.0, "epoch": 3.5384615384615383, "grad_norm": 0.16945670545101166, "kl": 0.10791015625, "learning_rate": 2.385045619479034e-06, "loss": 0.0010815896093845367, "memory(GiB)": 38.09, "reward": 0.7223191261291504, "reward_std": 0.08658581972122192, "rewards/VisualizationJSONCombinedORM/mean": 0.7223191261291504, "rewards/VisualizationJSONCombinedORM/std": 0.08971447497606277, "step": 4278, "train_speed(iter/s)": 0.171057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 269.0625, "completions/min_length": 218.0, "epoch": 3.5392886683209266, "grad_norm": 0.3228899836540222, "kl": 0.04571533203125, "learning_rate": 2.382584934637338e-06, "loss": 0.00045653246343135834, "memory(GiB)": 38.09, "reward": 0.4686213433742523, "reward_std": 0.09433839470148087, "rewards/VisualizationJSONCombinedORM/mean": 0.4686213433742523, "rewards/VisualizationJSONCombinedORM/std": 0.12906032800674438, "step": 4279, "train_speed(iter/s)": 0.170919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 307.6875, "completions/min_length": 245.0, "epoch": 3.5401157981803144, "grad_norm": 0.2052321434020996, "kl": 0.0394287109375, "learning_rate": 2.38012512271582e-06, "loss": 0.0003957897424697876, "memory(GiB)": 38.09, "reward": 0.6308790445327759, "reward_std": 0.0540832094848156, "rewards/VisualizationJSONCombinedORM/mean": 0.6308790445327759, "rewards/VisualizationJSONCombinedORM/std": 0.2723303735256195, "step": 4280, "train_speed(iter/s)": 0.170787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 249.75, "completions/min_length": 219.0, "epoch": 3.5409429280397022, "grad_norm": 0.15970441699028015, "kl": 0.2330322265625, "learning_rate": 2.3776661845348342e-06, "loss": 0.0023320764303207397, "memory(GiB)": 38.09, "reward": 0.46379581093788147, "reward_std": 0.03585263341665268, "rewards/VisualizationJSONCombinedORM/mean": 0.46379581093788147, "rewards/VisualizationJSONCombinedORM/std": 0.2189009189605713, "step": 4281, "train_speed(iter/s)": 0.170674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 274.6875, "completions/min_length": 228.0, "epoch": 3.54177005789909, "grad_norm": 0.22652368247509003, "kl": 0.1112060546875, "learning_rate": 2.3752081209144538e-06, "loss": 0.0011117011308670044, "memory(GiB)": 38.09, "reward": 0.5439147353172302, "reward_std": 0.06123356148600578, "rewards/VisualizationJSONCombinedORM/mean": 0.5439147353172302, "rewards/VisualizationJSONCombinedORM/std": 0.10761557519435883, "step": 4282, "train_speed(iter/s)": 0.170567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 305.3125, "completions/min_length": 236.0, "epoch": 3.542597187758478, "grad_norm": 0.17916759848594666, "kl": 0.037139892578125, "learning_rate": 2.3727509326744503e-06, "loss": 0.0003712400794029236, "memory(GiB)": 38.09, "reward": 0.6597981452941895, "reward_std": 0.06832227855920792, "rewards/VisualizationJSONCombinedORM/mean": 0.6597981452941895, "rewards/VisualizationJSONCombinedORM/std": 0.15284156799316406, "step": 4283, "train_speed(iter/s)": 0.170427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 269.0, "completions/min_length": 217.0, "epoch": 3.543424317617866, "grad_norm": 0.2370639592409134, "kl": 0.06158447265625, "learning_rate": 2.3702946206343086e-06, "loss": 0.0006159208714962006, "memory(GiB)": 38.09, "reward": 0.48379069566726685, "reward_std": 0.07226429879665375, "rewards/VisualizationJSONCombinedORM/mean": 0.48379069566726685, "rewards/VisualizationJSONCombinedORM/std": 0.08065706491470337, "step": 4284, "train_speed(iter/s)": 0.170263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 324.1875, "completions/min_length": 269.0, "epoch": 3.544251447477254, "grad_norm": 0.18924632668495178, "kl": 0.042236328125, "learning_rate": 2.3678391856132203e-06, "loss": 0.0004215538501739502, "memory(GiB)": 38.09, "reward": 0.6464354991912842, "reward_std": 0.03456813842058182, "rewards/VisualizationJSONCombinedORM/mean": 0.6464354991912842, "rewards/VisualizationJSONCombinedORM/std": 0.06588078290224075, "step": 4285, "train_speed(iter/s)": 0.170108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 287.3125, "completions/min_length": 225.0, "epoch": 3.545078577336642, "grad_norm": 0.18866190314292908, "kl": 0.0701904296875, "learning_rate": 2.3653846284300873e-06, "loss": 0.0007014460861682892, "memory(GiB)": 38.09, "reward": 0.6852422952651978, "reward_std": 0.0738348662853241, "rewards/VisualizationJSONCombinedORM/mean": 0.6852422952651978, "rewards/VisualizationJSONCombinedORM/std": 0.08287432789802551, "step": 4286, "train_speed(iter/s)": 0.169984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 293.625, "completions/min_length": 229.0, "epoch": 3.54590570719603, "grad_norm": 0.1656835973262787, "kl": 0.051513671875, "learning_rate": 2.3629309499035108e-06, "loss": 0.0005146116018295288, "memory(GiB)": 38.09, "reward": 0.7293339371681213, "reward_std": 0.02408427931368351, "rewards/VisualizationJSONCombinedORM/mean": 0.7293339371681213, "rewards/VisualizationJSONCombinedORM/std": 0.07017561048269272, "step": 4287, "train_speed(iter/s)": 0.169844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 320.375, "completions/min_length": 246.0, "epoch": 3.546732837055418, "grad_norm": 0.17481334507465363, "kl": 0.048095703125, "learning_rate": 2.3604781508518067e-06, "loss": 0.00048055499792099, "memory(GiB)": 38.09, "reward": 0.7804312705993652, "reward_std": 0.0718809962272644, "rewards/VisualizationJSONCombinedORM/mean": 0.7804312705993652, "rewards/VisualizationJSONCombinedORM/std": 0.08810308575630188, "step": 4288, "train_speed(iter/s)": 0.169692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 288.4375, "completions/min_length": 241.0, "epoch": 3.5475599669148057, "grad_norm": 0.20240390300750732, "kl": 0.07916259765625, "learning_rate": 2.358026232092995e-06, "loss": 0.000791698694229126, "memory(GiB)": 38.09, "reward": 0.623748779296875, "reward_std": 0.09516257792711258, "rewards/VisualizationJSONCombinedORM/mean": 0.623748779296875, "rewards/VisualizationJSONCombinedORM/std": 0.16902291774749756, "step": 4289, "train_speed(iter/s)": 0.169567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 272.8125, "completions/min_length": 201.0, "epoch": 3.5483870967741935, "grad_norm": 0.19591711461544037, "kl": 0.08837890625, "learning_rate": 2.3555751944448036e-06, "loss": 0.0008852062746882439, "memory(GiB)": 38.09, "reward": 0.5596638917922974, "reward_std": 0.06350168585777283, "rewards/VisualizationJSONCombinedORM/mean": 0.5596638917922974, "rewards/VisualizationJSONCombinedORM/std": 0.24037079513072968, "step": 4290, "train_speed(iter/s)": 0.169402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 264.125, "completions/min_length": 208.0, "epoch": 3.5492142266335813, "grad_norm": 0.17863455414772034, "kl": 0.050048828125, "learning_rate": 2.353125038724659e-06, "loss": 0.000500156544148922, "memory(GiB)": 38.09, "reward": 0.6239070892333984, "reward_std": 0.05683823674917221, "rewards/VisualizationJSONCombinedORM/mean": 0.6239070892333984, "rewards/VisualizationJSONCombinedORM/std": 0.13006076216697693, "step": 4291, "train_speed(iter/s)": 0.169272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 302.6875, "completions/min_length": 257.0, "epoch": 3.5500413564929696, "grad_norm": 0.21167930960655212, "kl": 0.028411865234375, "learning_rate": 2.3506757657497087e-06, "loss": 0.0002845264971256256, "memory(GiB)": 38.09, "reward": 0.49590712785720825, "reward_std": 0.0440969243645668, "rewards/VisualizationJSONCombinedORM/mean": 0.49590712785720825, "rewards/VisualizationJSONCombinedORM/std": 0.08752204477787018, "step": 4292, "train_speed(iter/s)": 0.16913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 290.75, "completions/min_length": 225.0, "epoch": 3.5508684863523574, "grad_norm": 0.2150869369506836, "kl": 0.103271484375, "learning_rate": 2.348227376336789e-06, "loss": 0.0010333210229873657, "memory(GiB)": 38.09, "reward": 0.6646192073822021, "reward_std": 0.05203153192996979, "rewards/VisualizationJSONCombinedORM/mean": 0.6646192073822021, "rewards/VisualizationJSONCombinedORM/std": 0.1595892757177353, "step": 4293, "train_speed(iter/s)": 0.168984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 297.5625, "completions/min_length": 222.0, "epoch": 3.5516956162117452, "grad_norm": 0.30083906650543213, "kl": 0.0693359375, "learning_rate": 2.345779871302453e-06, "loss": 0.0006933286786079407, "memory(GiB)": 38.09, "reward": 0.7224664688110352, "reward_std": 0.11915124952793121, "rewards/VisualizationJSONCombinedORM/mean": 0.7224664688110352, "rewards/VisualizationJSONCombinedORM/std": 0.11978157609701157, "step": 4294, "train_speed(iter/s)": 0.168848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 303.4375, "completions/min_length": 265.0, "epoch": 3.552522746071133, "grad_norm": 0.17947934567928314, "kl": 0.074951171875, "learning_rate": 2.343333251462954e-06, "loss": 0.0007494911551475525, "memory(GiB)": 38.09, "reward": 0.6030522584915161, "reward_std": 0.05665407329797745, "rewards/VisualizationJSONCombinedORM/mean": 0.6030522584915161, "rewards/VisualizationJSONCombinedORM/std": 0.199170783162117, "step": 4295, "train_speed(iter/s)": 0.16871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 295.875, "completions/min_length": 235.0, "epoch": 3.553349875930521, "grad_norm": 0.1858747899532318, "kl": 0.04345703125, "learning_rate": 2.3408875176342534e-06, "loss": 0.00043396465480327606, "memory(GiB)": 38.09, "reward": 0.7092459797859192, "reward_std": 0.09099699556827545, "rewards/VisualizationJSONCombinedORM/mean": 0.7092459797859192, "rewards/VisualizationJSONCombinedORM/std": 0.11369583755731583, "step": 4296, "train_speed(iter/s)": 0.168576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 289.1875, "completions/min_length": 250.0, "epoch": 3.554177005789909, "grad_norm": 0.26323390007019043, "kl": 0.062255859375, "learning_rate": 2.338442670632009e-06, "loss": 0.000622086226940155, "memory(GiB)": 38.09, "reward": 0.3584115207195282, "reward_std": 0.03630867227911949, "rewards/VisualizationJSONCombinedORM/mean": 0.3584115207195282, "rewards/VisualizationJSONCombinedORM/std": 0.036688875406980515, "step": 4297, "train_speed(iter/s)": 0.168451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 323.6875, "completions/min_length": 243.0, "epoch": 3.555004135649297, "grad_norm": 0.23099325597286224, "kl": 0.06671142578125, "learning_rate": 2.3359987112715963e-06, "loss": 0.0006668791174888611, "memory(GiB)": 38.09, "reward": 0.34263402223587036, "reward_std": 0.048400092869997025, "rewards/VisualizationJSONCombinedORM/mean": 0.34263402223587036, "rewards/VisualizationJSONCombinedORM/std": 0.062086399644613266, "step": 4298, "train_speed(iter/s)": 0.168322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 282.625, "completions/min_length": 233.0, "epoch": 3.555831265508685, "grad_norm": 0.16313713788986206, "kl": 0.033935546875, "learning_rate": 2.333555640368082e-06, "loss": 0.000339411199092865, "memory(GiB)": 38.09, "reward": 0.560279130935669, "reward_std": 0.028166722506284714, "rewards/VisualizationJSONCombinedORM/mean": 0.560279130935669, "rewards/VisualizationJSONCombinedORM/std": 0.15275134146213531, "step": 4299, "train_speed(iter/s)": 0.168186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 301.9375, "completions/min_length": 219.0, "epoch": 3.556658395368073, "grad_norm": 0.2151825726032257, "kl": 0.06878662109375, "learning_rate": 2.3311134587362426e-06, "loss": 0.0006875842809677124, "memory(GiB)": 38.09, "reward": 0.6424001455307007, "reward_std": 0.07906933128833771, "rewards/VisualizationJSONCombinedORM/mean": 0.6424001455307007, "rewards/VisualizationJSONCombinedORM/std": 0.09591806679964066, "step": 4300, "train_speed(iter/s)": 0.168047 }, { "epoch": 3.556658395368073, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 358.6666666666667, "eval_completions/mean_length": 301.8958333333333, "eval_completions/min_length": 252.91666666666666, "eval_kl": 0.0780029296875, "eval_loss": 0.0007871935958974063, "eval_reward": 0.4618927538394928, "eval_reward_std": 0.05278558000766983, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4618927538394928, "eval_rewards/VisualizationJSONCombinedORM/std": 0.052785582141950727, "eval_runtime": 307.0345, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 4300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 286.5, "completions/min_length": 227.0, "epoch": 3.557485525227461, "grad_norm": 0.21751704812049866, "kl": 0.09130859375, "learning_rate": 2.328672167190558e-06, "loss": 0.000911954790353775, "memory(GiB)": 38.09, "reward": 0.586204469203949, "reward_std": 0.06696054339408875, "rewards/VisualizationJSONCombinedORM/mean": 0.586204469203949, "rewards/VisualizationJSONCombinedORM/std": 0.21778452396392822, "step": 4301, "train_speed(iter/s)": 0.165915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 311.9375, "completions/min_length": 237.0, "epoch": 3.5583126550868487, "grad_norm": 0.20133012533187866, "kl": 0.0909423828125, "learning_rate": 2.3262317665452123e-06, "loss": 0.0009094150736927986, "memory(GiB)": 38.09, "reward": 0.7197981476783752, "reward_std": 0.06608898937702179, "rewards/VisualizationJSONCombinedORM/mean": 0.7197981476783752, "rewards/VisualizationJSONCombinedORM/std": 0.07771197706460953, "step": 4302, "train_speed(iter/s)": 0.165793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 305.0625, "completions/min_length": 244.0, "epoch": 3.5591397849462365, "grad_norm": 0.19319581985473633, "kl": 0.0509033203125, "learning_rate": 2.323792257614086e-06, "loss": 0.0005077272653579712, "memory(GiB)": 38.09, "reward": 0.6018755435943604, "reward_std": 0.05545414239168167, "rewards/VisualizationJSONCombinedORM/mean": 0.6018755435943604, "rewards/VisualizationJSONCombinedORM/std": 0.07364480942487717, "step": 4303, "train_speed(iter/s)": 0.165667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 315.0625, "completions/min_length": 248.0, "epoch": 3.5599669148056243, "grad_norm": 0.2390500009059906, "kl": 0.04791259765625, "learning_rate": 2.321353641210769e-06, "loss": 0.00047871097922325134, "memory(GiB)": 38.09, "reward": 0.5610014796257019, "reward_std": 0.05755798518657684, "rewards/VisualizationJSONCombinedORM/mean": 0.5610014796257019, "rewards/VisualizationJSONCombinedORM/std": 0.18381376564502716, "step": 4304, "train_speed(iter/s)": 0.165545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 307.5, "completions/min_length": 224.0, "epoch": 3.5607940446650126, "grad_norm": 0.18998174369335175, "kl": 0.09234619140625, "learning_rate": 2.3189159181485517e-06, "loss": 0.0009222440421581268, "memory(GiB)": 38.09, "reward": 0.27207082509994507, "reward_std": 0.018644969910383224, "rewards/VisualizationJSONCombinedORM/mean": 0.27207082509994507, "rewards/VisualizationJSONCombinedORM/std": 0.10940732061862946, "step": 4305, "train_speed(iter/s)": 0.16541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 294.75, "completions/min_length": 199.0, "epoch": 3.5616211745244004, "grad_norm": 0.17916223406791687, "kl": 0.0347900390625, "learning_rate": 2.316479089240427e-06, "loss": 0.0003476850688457489, "memory(GiB)": 38.09, "reward": 0.6806787252426147, "reward_std": 0.17001895606517792, "rewards/VisualizationJSONCombinedORM/mean": 0.6806787252426147, "rewards/VisualizationJSONCombinedORM/std": 0.21097426116466522, "step": 4306, "train_speed(iter/s)": 0.165262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 310.125, "completions/min_length": 225.0, "epoch": 3.5624483043837882, "grad_norm": 0.29746443033218384, "kl": 0.05859375, "learning_rate": 2.3140431552990845e-06, "loss": 0.0005852328613400459, "memory(GiB)": 38.09, "reward": 0.47561579942703247, "reward_std": 0.05726056918501854, "rewards/VisualizationJSONCombinedORM/mean": 0.47561579942703247, "rewards/VisualizationJSONCombinedORM/std": 0.05757137015461922, "step": 4307, "train_speed(iter/s)": 0.165165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 261.3125, "completions/min_length": 221.0, "epoch": 3.563275434243176, "grad_norm": 0.2731650173664093, "kl": 0.1173095703125, "learning_rate": 2.311608117136926e-06, "loss": 0.0011732131242752075, "memory(GiB)": 38.09, "reward": 0.5021317005157471, "reward_std": 0.08124571293592453, "rewards/VisualizationJSONCombinedORM/mean": 0.5021317005157471, "rewards/VisualizationJSONCombinedORM/std": 0.21489426493644714, "step": 4308, "train_speed(iter/s)": 0.165052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 300.875, "completions/min_length": 221.0, "epoch": 3.564102564102564, "grad_norm": 0.1657591611146927, "kl": 0.043212890625, "learning_rate": 2.3091739755660425e-06, "loss": 0.00043310970067977905, "memory(GiB)": 38.09, "reward": 0.6654279828071594, "reward_std": 0.17401525378227234, "rewards/VisualizationJSONCombinedORM/mean": 0.6654279828071594, "rewards/VisualizationJSONCombinedORM/std": 0.1896139532327652, "step": 4309, "train_speed(iter/s)": 0.164915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 261.6875, "completions/min_length": 230.0, "epoch": 3.564929693961952, "grad_norm": 0.22054561972618103, "kl": 0.0872802734375, "learning_rate": 2.306740731398234e-06, "loss": 0.0008730441331863403, "memory(GiB)": 38.09, "reward": 0.5366008877754211, "reward_std": 0.06878066062927246, "rewards/VisualizationJSONCombinedORM/mean": 0.5366008877754211, "rewards/VisualizationJSONCombinedORM/std": 0.14516715705394745, "step": 4310, "train_speed(iter/s)": 0.164796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 319.25, "completions/min_length": 262.0, "epoch": 3.56575682382134, "grad_norm": 0.1813584566116333, "kl": 0.0445556640625, "learning_rate": 2.304308385444999e-06, "loss": 0.0004465058445930481, "memory(GiB)": 38.09, "reward": 0.7141643762588501, "reward_std": 0.060730863362550735, "rewards/VisualizationJSONCombinedORM/mean": 0.7141643762588501, "rewards/VisualizationJSONCombinedORM/std": 0.11456230282783508, "step": 4311, "train_speed(iter/s)": 0.164693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 258.9375, "completions/min_length": 221.0, "epoch": 3.566583953680728, "grad_norm": 0.14924484491348267, "kl": 0.05950927734375, "learning_rate": 2.3018769385175375e-06, "loss": 0.0005962029099464417, "memory(GiB)": 38.09, "reward": 0.7751761078834534, "reward_std": 0.028098370879888535, "rewards/VisualizationJSONCombinedORM/mean": 0.7751761078834534, "rewards/VisualizationJSONCombinedORM/std": 0.027195485308766365, "step": 4312, "train_speed(iter/s)": 0.16456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 299.75, "completions/min_length": 252.0, "epoch": 3.567411083540116, "grad_norm": 0.19737568497657776, "kl": 0.0697021484375, "learning_rate": 2.2994463914267435e-06, "loss": 0.0006971266120672226, "memory(GiB)": 38.09, "reward": 0.5811824798583984, "reward_std": 0.0703509971499443, "rewards/VisualizationJSONCombinedORM/mean": 0.5811824798583984, "rewards/VisualizationJSONCombinedORM/std": 0.17948126792907715, "step": 4313, "train_speed(iter/s)": 0.164432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 284.25, "completions/min_length": 227.0, "epoch": 3.568238213399504, "grad_norm": 0.17279383540153503, "kl": 0.038116455078125, "learning_rate": 2.297016744983222e-06, "loss": 0.00038164854049682617, "memory(GiB)": 38.09, "reward": 0.5760587453842163, "reward_std": 0.06107187271118164, "rewards/VisualizationJSONCombinedORM/mean": 0.5760587453842163, "rewards/VisualizationJSONCombinedORM/std": 0.0651281550526619, "step": 4314, "train_speed(iter/s)": 0.164321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 315.875, "completions/min_length": 270.0, "epoch": 3.5690653432588917, "grad_norm": 0.2970620393753052, "kl": 0.07928466796875, "learning_rate": 2.2945879999972676e-06, "loss": 0.0007930118590593338, "memory(GiB)": 38.09, "reward": 0.7369320392608643, "reward_std": 0.07160767912864685, "rewards/VisualizationJSONCombinedORM/mean": 0.7369320392608643, "rewards/VisualizationJSONCombinedORM/std": 0.07447343319654465, "step": 4315, "train_speed(iter/s)": 0.164192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 294.0, "completions/min_length": 224.0, "epoch": 3.5698924731182795, "grad_norm": 0.1755838841199875, "kl": 0.056396484375, "learning_rate": 2.292160157278879e-06, "loss": 0.0005639195442199707, "memory(GiB)": 38.09, "reward": 0.5665075778961182, "reward_std": 0.045047201216220856, "rewards/VisualizationJSONCombinedORM/mean": 0.5665075778961182, "rewards/VisualizationJSONCombinedORM/std": 0.280907541513443, "step": 4316, "train_speed(iter/s)": 0.164051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 281.5, "completions/min_length": 233.0, "epoch": 3.5707196029776673, "grad_norm": 0.19886460900306702, "kl": 0.1251220703125, "learning_rate": 2.289733217637753e-06, "loss": 0.001253381371498108, "memory(GiB)": 38.09, "reward": 0.3931150734424591, "reward_std": 0.06144627183675766, "rewards/VisualizationJSONCombinedORM/mean": 0.3931150734424591, "rewards/VisualizationJSONCombinedORM/std": 0.11999008059501648, "step": 4317, "train_speed(iter/s)": 0.163911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 295.6875, "completions/min_length": 244.0, "epoch": 3.5715467328370556, "grad_norm": 0.18455855548381805, "kl": 0.0869140625, "learning_rate": 2.2873071818832874e-06, "loss": 0.0008701421320438385, "memory(GiB)": 38.09, "reward": 0.27941346168518066, "reward_std": 0.03462512418627739, "rewards/VisualizationJSONCombinedORM/mean": 0.27941346168518066, "rewards/VisualizationJSONCombinedORM/std": 0.061917293816804886, "step": 4318, "train_speed(iter/s)": 0.163804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 314.3125, "completions/min_length": 262.0, "epoch": 3.5723738626964434, "grad_norm": 0.1913112998008728, "kl": 0.0577392578125, "learning_rate": 2.2848820508245733e-06, "loss": 0.0005765408277511597, "memory(GiB)": 38.09, "reward": 0.49757805466651917, "reward_std": 0.058127738535404205, "rewards/VisualizationJSONCombinedORM/mean": 0.49757805466651917, "rewards/VisualizationJSONCombinedORM/std": 0.1703481376171112, "step": 4319, "train_speed(iter/s)": 0.1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 288.8125, "completions/min_length": 241.0, "epoch": 3.5732009925558312, "grad_norm": 0.16815784573554993, "kl": 0.0416259765625, "learning_rate": 2.2824578252704042e-06, "loss": 0.00041794031858444214, "memory(GiB)": 38.09, "reward": 0.46513596177101135, "reward_std": 0.024211572483181953, "rewards/VisualizationJSONCombinedORM/mean": 0.46513596177101135, "rewards/VisualizationJSONCombinedORM/std": 0.03368213772773743, "step": 4320, "train_speed(iter/s)": 0.163594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 277.5, "completions/min_length": 200.0, "epoch": 3.574028122415219, "grad_norm": 0.17778056859970093, "kl": 0.08770751953125, "learning_rate": 2.2800345060292716e-06, "loss": 0.0008766166865825653, "memory(GiB)": 38.09, "reward": 0.6209350824356079, "reward_std": 0.08427615463733673, "rewards/VisualizationJSONCombinedORM/mean": 0.6209350824356079, "rewards/VisualizationJSONCombinedORM/std": 0.16299393773078918, "step": 4321, "train_speed(iter/s)": 0.163443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 270.1875, "completions/min_length": 209.0, "epoch": 3.574855252274607, "grad_norm": 0.19929777085781097, "kl": 0.06103515625, "learning_rate": 2.277612093909365e-06, "loss": 0.0006093010306358337, "memory(GiB)": 38.09, "reward": 0.5698957443237305, "reward_std": 0.06707363575696945, "rewards/VisualizationJSONCombinedORM/mean": 0.5698957443237305, "rewards/VisualizationJSONCombinedORM/std": 0.21418243646621704, "step": 4322, "train_speed(iter/s)": 0.163331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 293.0, "completions/min_length": 242.0, "epoch": 3.575682382133995, "grad_norm": 0.23323403298854828, "kl": 0.0672607421875, "learning_rate": 2.2751905897185646e-06, "loss": 0.0006710141897201538, "memory(GiB)": 38.09, "reward": 0.44337767362594604, "reward_std": 0.08950308710336685, "rewards/VisualizationJSONCombinedORM/mean": 0.44337767362594604, "rewards/VisualizationJSONCombinedORM/std": 0.17117787897586823, "step": 4323, "train_speed(iter/s)": 0.16323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 292.8125, "completions/min_length": 232.0, "epoch": 3.576509511993383, "grad_norm": 0.18897011876106262, "kl": 0.037933349609375, "learning_rate": 2.272769994264461e-06, "loss": 0.00038013607263565063, "memory(GiB)": 38.09, "reward": 0.7163127064704895, "reward_std": 0.0510636568069458, "rewards/VisualizationJSONCombinedORM/mean": 0.7163127064704895, "rewards/VisualizationJSONCombinedORM/std": 0.0645027682185173, "step": 4324, "train_speed(iter/s)": 0.1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 290.5, "completions/min_length": 223.0, "epoch": 3.577336641852771, "grad_norm": 0.1885707527399063, "kl": 0.049530029296875, "learning_rate": 2.2703503083543288e-06, "loss": 0.0004970021545886993, "memory(GiB)": 38.09, "reward": 0.7652237415313721, "reward_std": 0.07796170562505722, "rewards/VisualizationJSONCombinedORM/mean": 0.7652237415313721, "rewards/VisualizationJSONCombinedORM/std": 0.07605288177728653, "step": 4325, "train_speed(iter/s)": 0.162998 }, { "epoch": 3.577336641852771, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 360.0416666666667, "eval_completions/mean_length": 302.1979166666667, "eval_completions/min_length": 254.125, "eval_kl": 0.092742919921875, "eval_loss": 0.0009253136813640594, "eval_reward": 0.45671280225118, "eval_reward_std": 0.05363299980914841, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45671280225118, "eval_rewards/VisualizationJSONCombinedORM/std": 0.053633000468835235, "eval_runtime": 307.914, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 4325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 310.8125, "completions/min_length": 242.0, "epoch": 3.578163771712159, "grad_norm": 0.21847999095916748, "kl": 0.0677490234375, "learning_rate": 2.2679315327951458e-06, "loss": 0.0006785932928323746, "memory(GiB)": 38.09, "reward": 0.3729667067527771, "reward_std": 0.032992009073495865, "rewards/VisualizationJSONCombinedORM/mean": 0.3729667067527771, "rewards/VisualizationJSONCombinedORM/std": 0.08072920143604279, "step": 4326, "train_speed(iter/s)": 0.161022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 251.25, "completions/min_length": 196.0, "epoch": 3.578990901571547, "grad_norm": 0.2528632581233978, "kl": 0.0706787109375, "learning_rate": 2.265513668393586e-06, "loss": 0.0007103532552719116, "memory(GiB)": 38.09, "reward": 0.7298610210418701, "reward_std": 0.04989423602819443, "rewards/VisualizationJSONCombinedORM/mean": 0.7298610210418701, "rewards/VisualizationJSONCombinedORM/std": 0.08216512948274612, "step": 4327, "train_speed(iter/s)": 0.160922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 316.8125, "completions/min_length": 267.0, "epoch": 3.5798180314309347, "grad_norm": 0.169658824801445, "kl": 0.06390380859375, "learning_rate": 2.263096715956019e-06, "loss": 0.0006381403654813766, "memory(GiB)": 38.09, "reward": 0.43775391578674316, "reward_std": 0.030596517026424408, "rewards/VisualizationJSONCombinedORM/mean": 0.43775391578674316, "rewards/VisualizationJSONCombinedORM/std": 0.12942519783973694, "step": 4328, "train_speed(iter/s)": 0.160808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 294.9375, "completions/min_length": 229.0, "epoch": 3.5806451612903225, "grad_norm": 0.1596572995185852, "kl": 0.04248046875, "learning_rate": 2.2606806762885054e-06, "loss": 0.00042471030610613525, "memory(GiB)": 38.09, "reward": 0.46639978885650635, "reward_std": 0.04539312422275543, "rewards/VisualizationJSONCombinedORM/mean": 0.46639978885650635, "rewards/VisualizationJSONCombinedORM/std": 0.0656576007604599, "step": 4329, "train_speed(iter/s)": 0.160694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 306.3125, "completions/min_length": 259.0, "epoch": 3.5814722911497103, "grad_norm": 0.1622578501701355, "kl": 0.0433349609375, "learning_rate": 2.258265550196812e-06, "loss": 0.0004345253109931946, "memory(GiB)": 38.09, "reward": 0.3904818892478943, "reward_std": 0.02787908911705017, "rewards/VisualizationJSONCombinedORM/mean": 0.3904818892478943, "rewards/VisualizationJSONCombinedORM/std": 0.07386931031942368, "step": 4330, "train_speed(iter/s)": 0.160589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 283.625, "completions/min_length": 217.0, "epoch": 3.5822994210090986, "grad_norm": 0.20079456269741058, "kl": 0.0498046875, "learning_rate": 2.25585133848639e-06, "loss": 0.0004982054233551025, "memory(GiB)": 38.09, "reward": 0.6153436899185181, "reward_std": 0.08164794743061066, "rewards/VisualizationJSONCombinedORM/mean": 0.6153436899185181, "rewards/VisualizationJSONCombinedORM/std": 0.10106424242258072, "step": 4331, "train_speed(iter/s)": 0.160488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 311.5625, "completions/min_length": 227.0, "epoch": 3.5831265508684864, "grad_norm": 0.22197870910167694, "kl": 0.075439453125, "learning_rate": 2.2534380419623918e-06, "loss": 0.0007542446255683899, "memory(GiB)": 38.09, "reward": 0.5530123114585876, "reward_std": 0.055960677564144135, "rewards/VisualizationJSONCombinedORM/mean": 0.5530123114585876, "rewards/VisualizationJSONCombinedORM/std": 0.22546756267547607, "step": 4332, "train_speed(iter/s)": 0.160338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 282.375, "completions/min_length": 212.0, "epoch": 3.5839536807278742, "grad_norm": 0.2784064710140228, "kl": 0.295166015625, "learning_rate": 2.2510256614296638e-06, "loss": 0.0029436834156513214, "memory(GiB)": 38.09, "reward": 0.48835068941116333, "reward_std": 0.05517954379320145, "rewards/VisualizationJSONCombinedORM/mean": 0.48835068941116333, "rewards/VisualizationJSONCombinedORM/std": 0.12276040017604828, "step": 4333, "train_speed(iter/s)": 0.160248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 303.875, "completions/min_length": 247.0, "epoch": 3.584780810587262, "grad_norm": 0.20778152346611023, "kl": 0.09686279296875, "learning_rate": 2.248614197692747e-06, "loss": 0.0009690560400485992, "memory(GiB)": 38.09, "reward": 0.5783157348632812, "reward_std": 0.0589187815785408, "rewards/VisualizationJSONCombinedORM/mean": 0.5783157348632812, "rewards/VisualizationJSONCombinedORM/std": 0.07187098264694214, "step": 4334, "train_speed(iter/s)": 0.160123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 314.5625, "completions/min_length": 240.0, "epoch": 3.58560794044665, "grad_norm": 0.19091399013996124, "kl": 0.09564208984375, "learning_rate": 2.2462036515558726e-06, "loss": 0.0009569078683853149, "memory(GiB)": 38.09, "reward": 0.5634642839431763, "reward_std": 0.07260800153017044, "rewards/VisualizationJSONCombinedORM/mean": 0.5634642839431763, "rewards/VisualizationJSONCombinedORM/std": 0.17307981848716736, "step": 4335, "train_speed(iter/s)": 0.160002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 278.9375, "completions/min_length": 233.0, "epoch": 3.586435070306038, "grad_norm": 0.2176753431558609, "kl": 0.1214599609375, "learning_rate": 2.2437940238229705e-06, "loss": 0.0012159906327724457, "memory(GiB)": 38.09, "reward": 0.44733548164367676, "reward_std": 0.05630325525999069, "rewards/VisualizationJSONCombinedORM/mean": 0.44733548164367676, "rewards/VisualizationJSONCombinedORM/std": 0.14043079316616058, "step": 4336, "train_speed(iter/s)": 0.159901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 298.6875, "completions/min_length": 239.0, "epoch": 3.587262200165426, "grad_norm": 0.22045092284679413, "kl": 0.0498046875, "learning_rate": 2.241385315297664e-06, "loss": 0.0004977881908416748, "memory(GiB)": 38.09, "reward": 0.47429001331329346, "reward_std": 0.049645110964775085, "rewards/VisualizationJSONCombinedORM/mean": 0.47429001331329346, "rewards/VisualizationJSONCombinedORM/std": 0.13783389329910278, "step": 4337, "train_speed(iter/s)": 0.159802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 285.125, "completions/min_length": 213.0, "epoch": 3.588089330024814, "grad_norm": 0.22509677708148956, "kl": 0.07281494140625, "learning_rate": 2.2389775267832692e-06, "loss": 0.0007280372083187103, "memory(GiB)": 38.09, "reward": 0.6257654428482056, "reward_std": 0.0736386850476265, "rewards/VisualizationJSONCombinedORM/mean": 0.6257654428482056, "rewards/VisualizationJSONCombinedORM/std": 0.10987511277198792, "step": 4338, "train_speed(iter/s)": 0.159656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 309.875, "completions/min_length": 252.0, "epoch": 3.588916459884202, "grad_norm": 0.19759294390678406, "kl": 0.07379150390625, "learning_rate": 2.2365706590827897e-06, "loss": 0.000738929957151413, "memory(GiB)": 38.1, "reward": 0.5582168102264404, "reward_std": 0.06271808594465256, "rewards/VisualizationJSONCombinedORM/mean": 0.5582168102264404, "rewards/VisualizationJSONCombinedORM/std": 0.20707915723323822, "step": 4339, "train_speed(iter/s)": 0.159507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 327.125, "completions/min_length": 234.0, "epoch": 3.58974358974359, "grad_norm": 0.14207103848457336, "kl": 0.03753662109375, "learning_rate": 2.234164712998935e-06, "loss": 0.00037573277950286865, "memory(GiB)": 38.1, "reward": 0.3737400770187378, "reward_std": 0.033994078636169434, "rewards/VisualizationJSONCombinedORM/mean": 0.3737400770187378, "rewards/VisualizationJSONCombinedORM/std": 0.03955207020044327, "step": 4340, "train_speed(iter/s)": 0.159391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 298.9375, "completions/min_length": 248.0, "epoch": 3.5905707196029777, "grad_norm": 0.22470521926879883, "kl": 0.0992431640625, "learning_rate": 2.2317596893340924e-06, "loss": 0.0009933598339557648, "memory(GiB)": 38.1, "reward": 0.4283868670463562, "reward_std": 0.05150889977812767, "rewards/VisualizationJSONCombinedORM/mean": 0.4283868670463562, "rewards/VisualizationJSONCombinedORM/std": 0.2007589489221573, "step": 4341, "train_speed(iter/s)": 0.159279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 297.0625, "completions/min_length": 232.0, "epoch": 3.5913978494623655, "grad_norm": 0.18768860399723053, "kl": 0.14013671875, "learning_rate": 2.2293555888903524e-06, "loss": 0.001402437686920166, "memory(GiB)": 38.1, "reward": 0.7941322326660156, "reward_std": 0.08506178855895996, "rewards/VisualizationJSONCombinedORM/mean": 0.7941322326660156, "rewards/VisualizationJSONCombinedORM/std": 0.0936126708984375, "step": 4342, "train_speed(iter/s)": 0.159137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 306.1875, "completions/min_length": 229.0, "epoch": 3.5922249793217533, "grad_norm": 0.2135149985551834, "kl": 0.1663818359375, "learning_rate": 2.226952412469493e-06, "loss": 0.0016631875187158585, "memory(GiB)": 38.1, "reward": 0.5461787581443787, "reward_std": 0.055594079196453094, "rewards/VisualizationJSONCombinedORM/mean": 0.5461787581443787, "rewards/VisualizationJSONCombinedORM/std": 0.08836561441421509, "step": 4343, "train_speed(iter/s)": 0.159016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 279.375, "completions/min_length": 223.0, "epoch": 3.5930521091811416, "grad_norm": 0.25246521830558777, "kl": 0.13922119140625, "learning_rate": 2.224550160872986e-06, "loss": 0.0013964567333459854, "memory(GiB)": 38.1, "reward": 0.6175005435943604, "reward_std": 0.058721184730529785, "rewards/VisualizationJSONCombinedORM/mean": 0.6175005435943604, "rewards/VisualizationJSONCombinedORM/std": 0.22761976718902588, "step": 4344, "train_speed(iter/s)": 0.158905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 302.5625, "completions/min_length": 229.0, "epoch": 3.5938792390405294, "grad_norm": 0.22488121688365936, "kl": 0.08544921875, "learning_rate": 2.2221488349019903e-06, "loss": 0.0008536465466022491, "memory(GiB)": 38.1, "reward": 0.6263779401779175, "reward_std": 0.06747405976057053, "rewards/VisualizationJSONCombinedORM/mean": 0.6263779401779175, "rewards/VisualizationJSONCombinedORM/std": 0.2011108100414276, "step": 4345, "train_speed(iter/s)": 0.158779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 261.4375, "completions/min_length": 204.0, "epoch": 3.5947063688999172, "grad_norm": 0.2250993847846985, "kl": 0.0858154296875, "learning_rate": 2.2197484353573595e-06, "loss": 0.000858008861541748, "memory(GiB)": 38.1, "reward": 0.5854385495185852, "reward_std": 0.09024901688098907, "rewards/VisualizationJSONCombinedORM/mean": 0.5854385495185852, "rewards/VisualizationJSONCombinedORM/std": 0.16191859543323517, "step": 4346, "train_speed(iter/s)": 0.15868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 314.125, "completions/min_length": 220.0, "epoch": 3.595533498759305, "grad_norm": 0.18542709946632385, "kl": 0.1435546875, "learning_rate": 2.2173489630396435e-06, "loss": 0.0014325156807899475, "memory(GiB)": 38.1, "reward": 0.6758012771606445, "reward_std": 0.04093559831380844, "rewards/VisualizationJSONCombinedORM/mean": 0.6758012771606445, "rewards/VisualizationJSONCombinedORM/std": 0.0401846319437027, "step": 4347, "train_speed(iter/s)": 0.158552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 313.625, "completions/min_length": 254.0, "epoch": 3.596360628618693, "grad_norm": 0.24748264253139496, "kl": 0.06256103515625, "learning_rate": 2.2149504187490718e-06, "loss": 0.0006244070827960968, "memory(GiB)": 38.1, "reward": 0.4093714654445648, "reward_std": 0.060446687042713165, "rewards/VisualizationJSONCombinedORM/mean": 0.4093714654445648, "rewards/VisualizationJSONCombinedORM/std": 0.08942588418722153, "step": 4348, "train_speed(iter/s)": 0.158438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 283.5625, "completions/min_length": 240.0, "epoch": 3.597187758478081, "grad_norm": 0.18030409514904022, "kl": 0.0445556640625, "learning_rate": 2.2125528032855727e-06, "loss": 0.0004455931484699249, "memory(GiB)": 38.1, "reward": 0.6405482292175293, "reward_std": 0.04338742047548294, "rewards/VisualizationJSONCombinedORM/mean": 0.6405482292175293, "rewards/VisualizationJSONCombinedORM/std": 0.18040074408054352, "step": 4349, "train_speed(iter/s)": 0.158327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 292.1875, "completions/min_length": 238.0, "epoch": 3.598014888337469, "grad_norm": 0.2137364000082016, "kl": 0.09130859375, "learning_rate": 2.2101561174487606e-06, "loss": 0.0009140074253082275, "memory(GiB)": 38.1, "reward": 0.4789774417877197, "reward_std": 0.03328486904501915, "rewards/VisualizationJSONCombinedORM/mean": 0.4789774417877197, "rewards/VisualizationJSONCombinedORM/std": 0.28021925687789917, "step": 4350, "train_speed(iter/s)": 0.158208 }, { "epoch": 3.598014888337469, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 360.9583333333333, "eval_completions/mean_length": 302.7447916666667, "eval_completions/min_length": 255.16666666666666, "eval_kl": 0.12315877278645833, "eval_loss": 0.0012333790073171258, "eval_reward": 0.45773230120539665, "eval_reward_std": 0.05571993568446487, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45773230120539665, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05571993855604281, "eval_runtime": 308.7255, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 4350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 295.6875, "completions/min_length": 234.0, "epoch": 3.598842018196857, "grad_norm": 0.16792674362659454, "kl": 0.067626953125, "learning_rate": 2.2077603620379457e-06, "loss": 0.0006749182939529419, "memory(GiB)": 38.1, "reward": 0.48177143931388855, "reward_std": 0.04460631310939789, "rewards/VisualizationJSONCombinedORM/mean": 0.48177143931388855, "rewards/VisualizationJSONCombinedORM/std": 0.2022998332977295, "step": 4351, "train_speed(iter/s)": 0.156338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 289.375, "completions/min_length": 244.0, "epoch": 3.599669148056245, "grad_norm": 0.20423714816570282, "kl": 0.0697021484375, "learning_rate": 2.205365537852116e-06, "loss": 0.0006994716823101044, "memory(GiB)": 38.1, "reward": 0.6030652523040771, "reward_std": 0.05097276344895363, "rewards/VisualizationJSONCombinedORM/mean": 0.6030652523040771, "rewards/VisualizationJSONCombinedORM/std": 0.14194999635219574, "step": 4352, "train_speed(iter/s)": 0.156231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 285.125, "completions/min_length": 218.0, "epoch": 3.600496277915633, "grad_norm": 0.20588679611682892, "kl": 0.08544921875, "learning_rate": 2.2029716456899645e-06, "loss": 0.0008552372455596924, "memory(GiB)": 38.1, "reward": 0.7229832410812378, "reward_std": 0.04657192528247833, "rewards/VisualizationJSONCombinedORM/mean": 0.7229832410812378, "rewards/VisualizationJSONCombinedORM/std": 0.04799487441778183, "step": 4353, "train_speed(iter/s)": 0.15612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 342.1875, "completions/min_length": 254.0, "epoch": 3.6013234077750207, "grad_norm": 0.2185322791337967, "kl": 0.0550537109375, "learning_rate": 2.2005786863498607e-06, "loss": 0.0005507543683052063, "memory(GiB)": 38.1, "reward": 0.44709306955337524, "reward_std": 0.0867544636130333, "rewards/VisualizationJSONCombinedORM/mean": 0.44709306955337524, "rewards/VisualizationJSONCombinedORM/std": 0.2053765505552292, "step": 4354, "train_speed(iter/s)": 0.155991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 296.0625, "completions/min_length": 239.0, "epoch": 3.6021505376344085, "grad_norm": 0.2049688994884491, "kl": 0.08648681640625, "learning_rate": 2.1981866606298684e-06, "loss": 0.000866524875164032, "memory(GiB)": 38.1, "reward": 0.5856238007545471, "reward_std": 0.07179045677185059, "rewards/VisualizationJSONCombinedORM/mean": 0.5856238007545471, "rewards/VisualizationJSONCombinedORM/std": 0.18914712965488434, "step": 4355, "train_speed(iter/s)": 0.155889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 299.6875, "completions/min_length": 229.0, "epoch": 3.6029776674937963, "grad_norm": 0.16745002567768097, "kl": 0.0850830078125, "learning_rate": 2.1957955693277405e-06, "loss": 0.0008505135774612427, "memory(GiB)": 38.1, "reward": 0.4582139849662781, "reward_std": 0.05230609327554703, "rewards/VisualizationJSONCombinedORM/mean": 0.4582139849662781, "rewards/VisualizationJSONCombinedORM/std": 0.2044195979833603, "step": 4356, "train_speed(iter/s)": 0.155779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 310.625, "completions/min_length": 250.0, "epoch": 3.6038047973531846, "grad_norm": 0.20658452808856964, "kl": 0.05926513671875, "learning_rate": 2.1934054132409183e-06, "loss": 0.0005930140614509583, "memory(GiB)": 38.1, "reward": 0.5019242167472839, "reward_std": 0.045077819377183914, "rewards/VisualizationJSONCombinedORM/mean": 0.5019242167472839, "rewards/VisualizationJSONCombinedORM/std": 0.06491388380527496, "step": 4357, "train_speed(iter/s)": 0.155667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 297.0, "completions/min_length": 199.0, "epoch": 3.6046319272125724, "grad_norm": 0.1762295365333557, "kl": 0.045928955078125, "learning_rate": 2.1910161931665263e-06, "loss": 0.00045956671237945557, "memory(GiB)": 38.1, "reward": 0.5817322731018066, "reward_std": 0.05463561415672302, "rewards/VisualizationJSONCombinedORM/mean": 0.5817322731018066, "rewards/VisualizationJSONCombinedORM/std": 0.07324624806642532, "step": 4358, "train_speed(iter/s)": 0.15557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 292.5, "completions/min_length": 233.0, "epoch": 3.6054590570719602, "grad_norm": 0.183712437748909, "kl": 0.1177978515625, "learning_rate": 2.188627909901383e-06, "loss": 0.001176442950963974, "memory(GiB)": 38.1, "reward": 0.6410471200942993, "reward_std": 0.055047813802957535, "rewards/VisualizationJSONCombinedORM/mean": 0.6410471200942993, "rewards/VisualizationJSONCombinedORM/std": 0.08122678846120834, "step": 4359, "train_speed(iter/s)": 0.155457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 324.3125, "completions/min_length": 252.0, "epoch": 3.606286186931348, "grad_norm": 0.20960164070129395, "kl": 0.04730224609375, "learning_rate": 2.186240564241992e-06, "loss": 0.0004729628562927246, "memory(GiB)": 38.1, "reward": 0.614108681678772, "reward_std": 0.043601904064416885, "rewards/VisualizationJSONCombinedORM/mean": 0.614108681678772, "rewards/VisualizationJSONCombinedORM/std": 0.12221918255090714, "step": 4360, "train_speed(iter/s)": 0.15535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 289.25, "completions/min_length": 235.0, "epoch": 3.607113316790736, "grad_norm": 0.19902080297470093, "kl": 0.083740234375, "learning_rate": 2.183854156984545e-06, "loss": 0.0008342228829860687, "memory(GiB)": 38.1, "reward": 0.47141051292419434, "reward_std": 0.043479979038238525, "rewards/VisualizationJSONCombinedORM/mean": 0.47141051292419434, "rewards/VisualizationJSONCombinedORM/std": 0.19958220422267914, "step": 4361, "train_speed(iter/s)": 0.155266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 322.1875, "completions/min_length": 243.0, "epoch": 3.607940446650124, "grad_norm": 0.17712871730327606, "kl": 0.04364013671875, "learning_rate": 2.181468688924916e-06, "loss": 0.0004361681640148163, "memory(GiB)": 38.1, "reward": 0.6327286958694458, "reward_std": 0.043327219784259796, "rewards/VisualizationJSONCombinedORM/mean": 0.6327286958694458, "rewards/VisualizationJSONCombinedORM/std": 0.07954448461532593, "step": 4362, "train_speed(iter/s)": 0.155144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 304.8125, "completions/min_length": 262.0, "epoch": 3.608767576509512, "grad_norm": 0.2114996463060379, "kl": 0.113037109375, "learning_rate": 2.179084160858676e-06, "loss": 0.0011295732110738754, "memory(GiB)": 38.1, "reward": 0.4478745460510254, "reward_std": 0.040749821811914444, "rewards/VisualizationJSONCombinedORM/mean": 0.4478745460510254, "rewards/VisualizationJSONCombinedORM/std": 0.14970172941684723, "step": 4363, "train_speed(iter/s)": 0.155044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 310.0, "completions/min_length": 227.0, "epoch": 3.6095947063689, "grad_norm": 0.17972324788570404, "kl": 0.06805419921875, "learning_rate": 2.176700573581071e-06, "loss": 0.0006813034415245056, "memory(GiB)": 38.1, "reward": 0.2803072929382324, "reward_std": 0.031562838703393936, "rewards/VisualizationJSONCombinedORM/mean": 0.2803072929382324, "rewards/VisualizationJSONCombinedORM/std": 0.15168629586696625, "step": 4364, "train_speed(iter/s)": 0.154934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 331.9375, "completions/min_length": 259.0, "epoch": 3.610421836228288, "grad_norm": 0.160337895154953, "kl": 0.05419921875, "learning_rate": 2.174317927887041e-06, "loss": 0.0005420027300715446, "memory(GiB)": 38.1, "reward": 0.4575774669647217, "reward_std": 0.05080041289329529, "rewards/VisualizationJSONCombinedORM/mean": 0.4575774669647217, "rewards/VisualizationJSONCombinedORM/std": 0.20431335270404816, "step": 4365, "train_speed(iter/s)": 0.154789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 319.0625, "completions/min_length": 229.0, "epoch": 3.611248966087676, "grad_norm": 0.1929345577955246, "kl": 0.08233642578125, "learning_rate": 2.1719362245712085e-06, "loss": 0.0008228421211242676, "memory(GiB)": 38.1, "reward": 0.5751289129257202, "reward_std": 0.06423628330230713, "rewards/VisualizationJSONCombinedORM/mean": 0.5751289129257202, "rewards/VisualizationJSONCombinedORM/std": 0.06393162161111832, "step": 4366, "train_speed(iter/s)": 0.154691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 318.8125, "completions/min_length": 257.0, "epoch": 3.6120760959470637, "grad_norm": 0.20245692133903503, "kl": 0.0694580078125, "learning_rate": 2.169555464427885e-06, "loss": 0.0006935372948646545, "memory(GiB)": 38.1, "reward": 0.4769417643547058, "reward_std": 0.056671060621738434, "rewards/VisualizationJSONCombinedORM/mean": 0.4769417643547058, "rewards/VisualizationJSONCombinedORM/std": 0.14056424796581268, "step": 4367, "train_speed(iter/s)": 0.154567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 333.5625, "completions/min_length": 260.0, "epoch": 3.6129032258064515, "grad_norm": 0.19134421646595, "kl": 0.0567626953125, "learning_rate": 2.1671756482510597e-06, "loss": 0.0005679205060005188, "memory(GiB)": 38.1, "reward": 0.5844646692276001, "reward_std": 0.018605174496769905, "rewards/VisualizationJSONCombinedORM/mean": 0.5844646692276001, "rewards/VisualizationJSONCombinedORM/std": 0.026455506682395935, "step": 4368, "train_speed(iter/s)": 0.154425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 282.9375, "completions/min_length": 225.0, "epoch": 3.6137303556658393, "grad_norm": 0.22774887084960938, "kl": 0.14013671875, "learning_rate": 2.1647967768344204e-06, "loss": 0.0014029145240783691, "memory(GiB)": 38.1, "reward": 0.5399464964866638, "reward_std": 0.07826842367649078, "rewards/VisualizationJSONCombinedORM/mean": 0.5399464964866638, "rewards/VisualizationJSONCombinedORM/std": 0.11743434518575668, "step": 4369, "train_speed(iter/s)": 0.154287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 322.25, "completions/min_length": 257.0, "epoch": 3.6145574855252276, "grad_norm": 0.24665379524230957, "kl": 0.07232666015625, "learning_rate": 2.162418850971325e-06, "loss": 0.0007231086492538452, "memory(GiB)": 38.1, "reward": 0.248980313539505, "reward_std": 0.04051520302891731, "rewards/VisualizationJSONCombinedORM/mean": 0.248980313539505, "rewards/VisualizationJSONCombinedORM/std": 0.04588285833597183, "step": 4370, "train_speed(iter/s)": 0.154173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 316.75, "completions/min_length": 255.0, "epoch": 3.6153846153846154, "grad_norm": 0.2887614369392395, "kl": 0.0849609375, "learning_rate": 2.1600418714548254e-06, "loss": 0.0008496949449181557, "memory(GiB)": 38.1, "reward": 0.48603111505508423, "reward_std": 0.0633603185415268, "rewards/VisualizationJSONCombinedORM/mean": 0.48603111505508423, "rewards/VisualizationJSONCombinedORM/std": 0.14004433155059814, "step": 4371, "train_speed(iter/s)": 0.154046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 285.4375, "completions/min_length": 232.0, "epoch": 3.6162117452440032, "grad_norm": 0.2167021483182907, "kl": 0.12786865234375, "learning_rate": 2.157665839077656e-06, "loss": 0.001278504729270935, "memory(GiB)": 38.1, "reward": 0.40141987800598145, "reward_std": 0.03909852355718613, "rewards/VisualizationJSONCombinedORM/mean": 0.40141987800598145, "rewards/VisualizationJSONCombinedORM/std": 0.1504349410533905, "step": 4372, "train_speed(iter/s)": 0.153929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 306.6875, "completions/min_length": 269.0, "epoch": 3.6170388751033915, "grad_norm": 0.25393760204315186, "kl": 0.090576171875, "learning_rate": 2.1552907546322356e-06, "loss": 0.0009053610265254974, "memory(GiB)": 38.1, "reward": 0.4474848210811615, "reward_std": 0.053377456963062286, "rewards/VisualizationJSONCombinedORM/mean": 0.4474848210811615, "rewards/VisualizationJSONCombinedORM/std": 0.19590187072753906, "step": 4373, "train_speed(iter/s)": 0.15382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 317.6875, "completions/min_length": 245.0, "epoch": 3.617866004962779, "grad_norm": 0.17593272030353546, "kl": 0.09423828125, "learning_rate": 2.1529166189106626e-06, "loss": 0.0009413734078407288, "memory(GiB)": 38.1, "reward": 0.48272523283958435, "reward_std": 0.040117036551237106, "rewards/VisualizationJSONCombinedORM/mean": 0.48272523283958435, "rewards/VisualizationJSONCombinedORM/std": 0.15639998018741608, "step": 4374, "train_speed(iter/s)": 0.153739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 312.375, "completions/min_length": 243.0, "epoch": 3.618693134822167, "grad_norm": 0.27609118819236755, "kl": 0.232666015625, "learning_rate": 2.1505434327047246e-06, "loss": 0.0023246631026268005, "memory(GiB)": 38.1, "reward": 0.3614465892314911, "reward_std": 0.05535595118999481, "rewards/VisualizationJSONCombinedORM/mean": 0.3614465892314911, "rewards/VisualizationJSONCombinedORM/std": 0.12888677418231964, "step": 4375, "train_speed(iter/s)": 0.153631 }, { "epoch": 3.618693134822167, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 377.375, "eval_completions/mean_length": 308.78125, "eval_completions/min_length": 253.20833333333334, "eval_kl": 0.07155354817708333, "eval_loss": 0.0007244248990900815, "eval_reward": 0.4396257034192483, "eval_reward_std": 0.047761498911616705, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4396257034192483, "eval_rewards/VisualizationJSONCombinedORM/std": 0.047761501162312925, "eval_runtime": 317.7431, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 4375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 312.25, "completions/min_length": 261.0, "epoch": 3.619520264681555, "grad_norm": 0.19300667941570282, "kl": 0.11083984375, "learning_rate": 2.1481711968058897e-06, "loss": 0.0011079758405685425, "memory(GiB)": 38.1, "reward": 0.686652660369873, "reward_std": 0.07053529471158981, "rewards/VisualizationJSONCombinedORM/mean": 0.686652660369873, "rewards/VisualizationJSONCombinedORM/std": 0.07441480457782745, "step": 4376, "train_speed(iter/s)": 0.15184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/mean_length": 238.9375, "completions/min_length": 200.0, "epoch": 3.620347394540943, "grad_norm": 0.2367931604385376, "kl": 0.06201171875, "learning_rate": 2.1457999120053125e-06, "loss": 0.0006202459335327148, "memory(GiB)": 38.1, "reward": 0.3305528163909912, "reward_std": 0.04780307412147522, "rewards/VisualizationJSONCombinedORM/mean": 0.3305528163909912, "rewards/VisualizationJSONCombinedORM/std": 0.0745406374335289, "step": 4377, "train_speed(iter/s)": 0.151749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 332.6875, "completions/min_length": 262.0, "epoch": 3.621174524400331, "grad_norm": 0.1909850388765335, "kl": 0.04400634765625, "learning_rate": 2.1434295790938207e-06, "loss": 0.00044048577547073364, "memory(GiB)": 38.1, "reward": 0.302243709564209, "reward_std": 0.02742731012403965, "rewards/VisualizationJSONCombinedORM/mean": 0.302243709564209, "rewards/VisualizationJSONCombinedORM/std": 0.12540708482265472, "step": 4378, "train_speed(iter/s)": 0.151644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 298.125, "completions/min_length": 220.0, "epoch": 3.622001654259719, "grad_norm": 0.2016131579875946, "kl": 0.04168701171875, "learning_rate": 2.1410601988619394e-06, "loss": 0.00041566044092178345, "memory(GiB)": 38.1, "reward": 0.5353708863258362, "reward_std": 0.031813010573387146, "rewards/VisualizationJSONCombinedORM/mean": 0.5353708863258362, "rewards/VisualizationJSONCombinedORM/std": 0.2802218496799469, "step": 4379, "train_speed(iter/s)": 0.151553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 323.9375, "completions/min_length": 272.0, "epoch": 3.6228287841191067, "grad_norm": 0.1631011813879013, "kl": 0.29833984375, "learning_rate": 2.138691772099863e-06, "loss": 0.0029831156134605408, "memory(GiB)": 38.1, "reward": 0.38313940167427063, "reward_std": 0.05399502068758011, "rewards/VisualizationJSONCombinedORM/mean": 0.38313940167427063, "rewards/VisualizationJSONCombinedORM/std": 0.16812019050121307, "step": 4380, "train_speed(iter/s)": 0.151455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 316.4375, "completions/min_length": 234.0, "epoch": 3.6236559139784945, "grad_norm": 0.20670856535434723, "kl": 0.040771484375, "learning_rate": 2.136324299597474e-06, "loss": 0.00040822476148605347, "memory(GiB)": 38.1, "reward": 0.5780565738677979, "reward_std": 0.030040070414543152, "rewards/VisualizationJSONCombinedORM/mean": 0.5780565738677979, "rewards/VisualizationJSONCombinedORM/std": 0.1677747368812561, "step": 4381, "train_speed(iter/s)": 0.151355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 300.1875, "completions/min_length": 236.0, "epoch": 3.6244830438378823, "grad_norm": 0.23043714463710785, "kl": 0.0985107421875, "learning_rate": 2.1339577821443363e-06, "loss": 0.0009829439222812653, "memory(GiB)": 38.1, "reward": 0.4951396882534027, "reward_std": 0.09398147463798523, "rewards/VisualizationJSONCombinedORM/mean": 0.4951396882534027, "rewards/VisualizationJSONCombinedORM/std": 0.09586983174085617, "step": 4382, "train_speed(iter/s)": 0.151261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 289.8125, "completions/min_length": 245.0, "epoch": 3.6253101736972706, "grad_norm": 0.1941826194524765, "kl": 0.0443115234375, "learning_rate": 2.1315922205296957e-06, "loss": 0.00044338032603263855, "memory(GiB)": 38.1, "reward": 0.549445390701294, "reward_std": 0.07100844383239746, "rewards/VisualizationJSONCombinedORM/mean": 0.549445390701294, "rewards/VisualizationJSONCombinedORM/std": 0.07569462805986404, "step": 4383, "train_speed(iter/s)": 0.151177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 317.0625, "completions/min_length": 250.0, "epoch": 3.6261373035566584, "grad_norm": 0.19606523215770721, "kl": 0.1060791015625, "learning_rate": 2.1292276155424727e-06, "loss": 0.0010613389313220978, "memory(GiB)": 38.1, "reward": 0.5155699849128723, "reward_std": 0.05317128449678421, "rewards/VisualizationJSONCombinedORM/mean": 0.5155699849128723, "rewards/VisualizationJSONCombinedORM/std": 0.12663602828979492, "step": 4384, "train_speed(iter/s)": 0.15105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 317.3125, "completions/min_length": 259.0, "epoch": 3.6269644334160462, "grad_norm": 0.18770566582679749, "kl": 0.04669189453125, "learning_rate": 2.1268639679712814e-06, "loss": 0.00046656280755996704, "memory(GiB)": 38.1, "reward": 0.48463988304138184, "reward_std": 0.046320393681526184, "rewards/VisualizationJSONCombinedORM/mean": 0.48463988304138184, "rewards/VisualizationJSONCombinedORM/std": 0.09892549365758896, "step": 4385, "train_speed(iter/s)": 0.150924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 294.625, "completions/min_length": 201.0, "epoch": 3.6277915632754345, "grad_norm": 0.1861661821603775, "kl": 0.03271484375, "learning_rate": 2.1245012786044045e-06, "loss": 0.0003273710608482361, "memory(GiB)": 38.1, "reward": 0.5891519784927368, "reward_std": 0.0709955245256424, "rewards/VisualizationJSONCombinedORM/mean": 0.5891519784927368, "rewards/VisualizationJSONCombinedORM/std": 0.2607361972332001, "step": 4386, "train_speed(iter/s)": 0.150794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 274.4375, "completions/min_length": 219.0, "epoch": 3.6286186931348223, "grad_norm": 0.17872430384159088, "kl": 0.0496826171875, "learning_rate": 2.1221395482298113e-06, "loss": 0.0004970543086528778, "memory(GiB)": 38.1, "reward": 0.7379418611526489, "reward_std": 0.029233790934085846, "rewards/VisualizationJSONCombinedORM/mean": 0.7379418611526489, "rewards/VisualizationJSONCombinedORM/std": 0.12857405841350555, "step": 4387, "train_speed(iter/s)": 0.150677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 308.0625, "completions/min_length": 245.0, "epoch": 3.62944582299421, "grad_norm": 0.1758142113685608, "kl": 0.03564453125, "learning_rate": 2.1197787776351498e-06, "loss": 0.0003568492829799652, "memory(GiB)": 38.1, "reward": 0.6905816197395325, "reward_std": 0.050319522619247437, "rewards/VisualizationJSONCombinedORM/mean": 0.6905816197395325, "rewards/VisualizationJSONCombinedORM/std": 0.11816728115081787, "step": 4388, "train_speed(iter/s)": 0.150573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 293.75, "completions/min_length": 220.0, "epoch": 3.630272952853598, "grad_norm": 0.2083444446325302, "kl": 0.10546875, "learning_rate": 2.11741896760775e-06, "loss": 0.0010509192943572998, "memory(GiB)": 38.1, "reward": 0.357197642326355, "reward_std": 0.05085035413503647, "rewards/VisualizationJSONCombinedORM/mean": 0.357197642326355, "rewards/VisualizationJSONCombinedORM/std": 0.09177714586257935, "step": 4389, "train_speed(iter/s)": 0.150453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 300.4375, "completions/min_length": 243.0, "epoch": 3.631100082712986, "grad_norm": 0.18620087206363678, "kl": 0.067138671875, "learning_rate": 2.115060118934616e-06, "loss": 0.0006718039512634277, "memory(GiB)": 38.1, "reward": 0.46791332960128784, "reward_std": 0.025739002972841263, "rewards/VisualizationJSONCombinedORM/mean": 0.46791332960128784, "rewards/VisualizationJSONCombinedORM/std": 0.23092621564865112, "step": 4390, "train_speed(iter/s)": 0.150368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 321.5, "completions/min_length": 282.0, "epoch": 3.631927212572374, "grad_norm": 0.1913139969110489, "kl": 0.0728759765625, "learning_rate": 2.1127022324024365e-06, "loss": 0.0007308609783649445, "memory(GiB)": 38.1, "reward": 0.5281243920326233, "reward_std": 0.04988322779536247, "rewards/VisualizationJSONCombinedORM/mean": 0.5281243920326233, "rewards/VisualizationJSONCombinedORM/std": 0.20968101918697357, "step": 4391, "train_speed(iter/s)": 0.150264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 327.5, "completions/min_length": 252.0, "epoch": 3.632754342431762, "grad_norm": 0.1624196618795395, "kl": 0.03729248046875, "learning_rate": 2.1103453087975776e-06, "loss": 0.00037296488881111145, "memory(GiB)": 38.1, "reward": 0.5127363801002502, "reward_std": 0.03917286545038223, "rewards/VisualizationJSONCombinedORM/mean": 0.5127363801002502, "rewards/VisualizationJSONCombinedORM/std": 0.21324367821216583, "step": 4392, "train_speed(iter/s)": 0.150151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 307.8125, "completions/min_length": 233.0, "epoch": 3.6335814722911497, "grad_norm": 0.2142123430967331, "kl": 0.084716796875, "learning_rate": 2.107989348906087e-06, "loss": 0.0008480697870254517, "memory(GiB)": 38.1, "reward": 0.3193129897117615, "reward_std": 0.023515181615948677, "rewards/VisualizationJSONCombinedORM/mean": 0.3193129897117615, "rewards/VisualizationJSONCombinedORM/std": 0.13021574914455414, "step": 4393, "train_speed(iter/s)": 0.150056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 331.6875, "completions/min_length": 280.0, "epoch": 3.6344086021505375, "grad_norm": 0.19530370831489563, "kl": 0.0821533203125, "learning_rate": 2.105634353513682e-06, "loss": 0.0008224397897720337, "memory(GiB)": 38.1, "reward": 0.468764990568161, "reward_std": 0.04647260159254074, "rewards/VisualizationJSONCombinedORM/mean": 0.468764990568161, "rewards/VisualizationJSONCombinedORM/std": 0.2833581268787384, "step": 4394, "train_speed(iter/s)": 0.14996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 285.5625, "completions/min_length": 251.0, "epoch": 3.6352357320099253, "grad_norm": 0.16894075274467468, "kl": 0.06683349609375, "learning_rate": 2.1032803234057725e-06, "loss": 0.0006674341857433319, "memory(GiB)": 38.1, "reward": 0.38435861468315125, "reward_std": 0.04961084574460983, "rewards/VisualizationJSONCombinedORM/mean": 0.38435861468315125, "rewards/VisualizationJSONCombinedORM/std": 0.05419817566871643, "step": 4395, "train_speed(iter/s)": 0.149862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 324.75, "completions/min_length": 256.0, "epoch": 3.6360628618693136, "grad_norm": 0.19584910571575165, "kl": 0.06243896484375, "learning_rate": 2.1009272593674323e-06, "loss": 0.0006251558661460876, "memory(GiB)": 38.1, "reward": 0.7199757099151611, "reward_std": 0.11657536774873734, "rewards/VisualizationJSONCombinedORM/mean": 0.7199757099151611, "rewards/VisualizationJSONCombinedORM/std": 0.15561744570732117, "step": 4396, "train_speed(iter/s)": 0.149777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 304.4375, "completions/min_length": 224.0, "epoch": 3.6368899917287014, "grad_norm": 0.19129066169261932, "kl": 0.03564453125, "learning_rate": 2.098575162183422e-06, "loss": 0.00035685673356056213, "memory(GiB)": 38.1, "reward": 0.28607019782066345, "reward_std": 0.027455566450953484, "rewards/VisualizationJSONCombinedORM/mean": 0.28607019782066345, "rewards/VisualizationJSONCombinedORM/std": 0.11642378568649292, "step": 4397, "train_speed(iter/s)": 0.149677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 307.4375, "completions/min_length": 230.0, "epoch": 3.6377171215880892, "grad_norm": 0.22145044803619385, "kl": 0.0887451171875, "learning_rate": 2.096224032638176e-06, "loss": 0.0008868277072906494, "memory(GiB)": 38.1, "reward": 0.25378215312957764, "reward_std": 0.022990744560956955, "rewards/VisualizationJSONCombinedORM/mean": 0.25378215312957764, "rewards/VisualizationJSONCombinedORM/std": 0.06702114641666412, "step": 4398, "train_speed(iter/s)": 0.149558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 302.8125, "completions/min_length": 236.0, "epoch": 3.6385442514474775, "grad_norm": 0.15720486640930176, "kl": 0.03564453125, "learning_rate": 2.09387387151581e-06, "loss": 0.00035597383975982666, "memory(GiB)": 38.1, "reward": 0.7388414144515991, "reward_std": 0.07917056977748871, "rewards/VisualizationJSONCombinedORM/mean": 0.7388414144515991, "rewards/VisualizationJSONCombinedORM/std": 0.10014791041612625, "step": 4399, "train_speed(iter/s)": 0.149456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 312.8125, "completions/min_length": 235.0, "epoch": 3.6393713813068653, "grad_norm": 0.23130780458450317, "kl": 0.0533447265625, "learning_rate": 2.0915246796001077e-06, "loss": 0.0005336292088031769, "memory(GiB)": 38.1, "reward": 0.5761905908584595, "reward_std": 0.06858660280704498, "rewards/VisualizationJSONCombinedORM/mean": 0.5761905908584595, "rewards/VisualizationJSONCombinedORM/std": 0.09413714706897736, "step": 4400, "train_speed(iter/s)": 0.149355 }, { "epoch": 3.6393713813068653, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 361.5, "eval_completions/mean_length": 307.3072916666667, "eval_completions/min_length": 261.2916666666667, "eval_kl": 0.07187906901041667, "eval_loss": 0.0007210833136923611, "eval_reward": 0.44505611931284267, "eval_reward_std": 0.04712310107424855, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44505611931284267, "eval_rewards/VisualizationJSONCombinedORM/std": 0.047123100880223014, "eval_runtime": 308.7697, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 4400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 298.375, "completions/min_length": 257.0, "epoch": 3.640198511166253, "grad_norm": 0.2055889070034027, "kl": 0.0975341796875, "learning_rate": 2.0891764576745426e-06, "loss": 0.0009771287441253662, "memory(GiB)": 38.1, "reward": 0.3145136535167694, "reward_std": 0.032453443855047226, "rewards/VisualizationJSONCombinedORM/mean": 0.3145136535167694, "rewards/VisualizationJSONCombinedORM/std": 0.034271709620952606, "step": 4401, "train_speed(iter/s)": 0.147711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 298.8125, "completions/min_length": 230.0, "epoch": 3.641025641025641, "grad_norm": 0.2172260880470276, "kl": 0.039947509765625, "learning_rate": 2.086829206522253e-06, "loss": 0.0003988973330706358, "memory(GiB)": 38.1, "reward": 0.555956244468689, "reward_std": 0.08642025291919708, "rewards/VisualizationJSONCombinedORM/mean": 0.555956244468689, "rewards/VisualizationJSONCombinedORM/std": 0.0905647799372673, "step": 4402, "train_speed(iter/s)": 0.147632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 304.4375, "completions/min_length": 250.0, "epoch": 3.641852770885029, "grad_norm": 0.278748095035553, "kl": 0.04766845703125, "learning_rate": 2.0844829269260592e-06, "loss": 0.00047717243432998657, "memory(GiB)": 38.1, "reward": 0.47150176763534546, "reward_std": 0.0735175609588623, "rewards/VisualizationJSONCombinedORM/mean": 0.47150176763534546, "rewards/VisualizationJSONCombinedORM/std": 0.13153283298015594, "step": 4403, "train_speed(iter/s)": 0.14751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 315.0, "completions/min_length": 268.0, "epoch": 3.642679900744417, "grad_norm": 0.23093144595623016, "kl": 0.0902099609375, "learning_rate": 2.082137619668457e-06, "loss": 0.0009019598364830017, "memory(GiB)": 38.1, "reward": 0.4311881959438324, "reward_std": 0.03874148055911064, "rewards/VisualizationJSONCombinedORM/mean": 0.4311881959438324, "rewards/VisualizationJSONCombinedORM/std": 0.11241324245929718, "step": 4404, "train_speed(iter/s)": 0.147415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 335.0625, "completions/min_length": 265.0, "epoch": 3.643507030603805, "grad_norm": 0.19825686514377594, "kl": 0.05914306640625, "learning_rate": 2.0797932855316183e-06, "loss": 0.0005910247564315796, "memory(GiB)": 38.1, "reward": 0.5013685822486877, "reward_std": 0.03799856826663017, "rewards/VisualizationJSONCombinedORM/mean": 0.5013685822486877, "rewards/VisualizationJSONCombinedORM/std": 0.14875459671020508, "step": 4405, "train_speed(iter/s)": 0.147315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 323.4375, "completions/min_length": 272.0, "epoch": 3.6443341604631927, "grad_norm": 0.15263810753822327, "kl": 0.03570556640625, "learning_rate": 2.077449925297387e-06, "loss": 0.0003572516143321991, "memory(GiB)": 38.1, "reward": 0.7920595407485962, "reward_std": 0.06824815273284912, "rewards/VisualizationJSONCombinedORM/mean": 0.7920595407485962, "rewards/VisualizationJSONCombinedORM/std": 0.06811415404081345, "step": 4406, "train_speed(iter/s)": 0.147222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 325.875, "completions/min_length": 250.0, "epoch": 3.6451612903225805, "grad_norm": 0.20221249759197235, "kl": 0.05511474609375, "learning_rate": 2.0751075397472853e-06, "loss": 0.00054970383644104, "memory(GiB)": 38.1, "reward": 0.6464412808418274, "reward_std": 0.026332827284932137, "rewards/VisualizationJSONCombinedORM/mean": 0.6464412808418274, "rewards/VisualizationJSONCombinedORM/std": 0.19824090600013733, "step": 4407, "train_speed(iter/s)": 0.147138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 265.3125, "completions/min_length": 218.0, "epoch": 3.6459884201819683, "grad_norm": 0.19355067610740662, "kl": 0.0430908203125, "learning_rate": 2.0727661296625108e-06, "loss": 0.0004305299371480942, "memory(GiB)": 38.1, "reward": 0.5748094320297241, "reward_std": 0.0716063603758812, "rewards/VisualizationJSONCombinedORM/mean": 0.5748094320297241, "rewards/VisualizationJSONCombinedORM/std": 0.07248654961585999, "step": 4408, "train_speed(iter/s)": 0.147053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 300.4375, "completions/min_length": 247.0, "epoch": 3.6468155500413566, "grad_norm": 0.19623436033725739, "kl": 0.0853271484375, "learning_rate": 2.070425695823936e-06, "loss": 0.0008566156029701233, "memory(GiB)": 38.1, "reward": 0.6536940336227417, "reward_std": 0.07438383996486664, "rewards/VisualizationJSONCombinedORM/mean": 0.6536940336227417, "rewards/VisualizationJSONCombinedORM/std": 0.10673464089632034, "step": 4409, "train_speed(iter/s)": 0.146943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 310.6875, "completions/min_length": 249.0, "epoch": 3.6476426799007444, "grad_norm": 0.1614624410867691, "kl": 0.082275390625, "learning_rate": 2.0680862390121015e-06, "loss": 0.0008240491151809692, "memory(GiB)": 38.1, "reward": 0.5793967247009277, "reward_std": 0.034745749086141586, "rewards/VisualizationJSONCombinedORM/mean": 0.5793967247009277, "rewards/VisualizationJSONCombinedORM/std": 0.2631780505180359, "step": 4410, "train_speed(iter/s)": 0.146849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 295.375, "completions/min_length": 232.0, "epoch": 3.6484698097601322, "grad_norm": 0.22812293469905853, "kl": 0.066162109375, "learning_rate": 2.0657477600072345e-06, "loss": 0.0006608515977859497, "memory(GiB)": 38.1, "reward": 0.5740771293640137, "reward_std": 0.08527272939682007, "rewards/VisualizationJSONCombinedORM/mean": 0.5740771293640137, "rewards/VisualizationJSONCombinedORM/std": 0.13029853999614716, "step": 4411, "train_speed(iter/s)": 0.146733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 306.9375, "completions/min_length": 257.0, "epoch": 3.6492969396195205, "grad_norm": 0.18756543099880219, "kl": 0.05389404296875, "learning_rate": 2.0634102595892224e-06, "loss": 0.0005389340221881866, "memory(GiB)": 38.1, "reward": 0.47398871183395386, "reward_std": 0.04554261639714241, "rewards/VisualizationJSONCombinedORM/mean": 0.47398871183395386, "rewards/VisualizationJSONCombinedORM/std": 0.21128059923648834, "step": 4412, "train_speed(iter/s)": 0.146628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 323.4375, "completions/min_length": 245.0, "epoch": 3.6501240694789083, "grad_norm": 0.1847415268421173, "kl": 0.03619384765625, "learning_rate": 2.061073738537635e-06, "loss": 0.0003623254597187042, "memory(GiB)": 38.1, "reward": 0.4883124530315399, "reward_std": 0.05156569927930832, "rewards/VisualizationJSONCombinedORM/mean": 0.4883124530315399, "rewards/VisualizationJSONCombinedORM/std": 0.26213109493255615, "step": 4413, "train_speed(iter/s)": 0.146555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 321.5625, "completions/min_length": 241.0, "epoch": 3.650951199338296, "grad_norm": 0.22004994750022888, "kl": 0.0906982421875, "learning_rate": 2.0587381976317128e-06, "loss": 0.0009069740772247314, "memory(GiB)": 38.1, "reward": 0.4901461899280548, "reward_std": 0.1546129733324051, "rewards/VisualizationJSONCombinedORM/mean": 0.4901461899280548, "rewards/VisualizationJSONCombinedORM/std": 0.18583010137081146, "step": 4414, "train_speed(iter/s)": 0.146465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 294.3125, "completions/min_length": 217.0, "epoch": 3.651778329197684, "grad_norm": 0.16680367290973663, "kl": 0.05255126953125, "learning_rate": 2.056403637650371e-06, "loss": 0.000525839626789093, "memory(GiB)": 38.1, "reward": 0.5742433071136475, "reward_std": 0.04698086529970169, "rewards/VisualizationJSONCombinedORM/mean": 0.5742433071136475, "rewards/VisualizationJSONCombinedORM/std": 0.06795376539230347, "step": 4415, "train_speed(iter/s)": 0.14638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 322.25, "completions/min_length": 263.0, "epoch": 3.652605459057072, "grad_norm": 0.23056210577487946, "kl": 0.190185546875, "learning_rate": 2.0540700593721916e-06, "loss": 0.0019104927778244019, "memory(GiB)": 38.1, "reward": 0.5098655819892883, "reward_std": 0.05891647934913635, "rewards/VisualizationJSONCombinedORM/mean": 0.5098655819892883, "rewards/VisualizationJSONCombinedORM/std": 0.2581554353237152, "step": 4416, "train_speed(iter/s)": 0.146255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 319.875, "completions/min_length": 227.0, "epoch": 3.65343258891646, "grad_norm": 0.2546626925468445, "kl": 0.0423583984375, "learning_rate": 2.051737463575441e-06, "loss": 0.0004235580563545227, "memory(GiB)": 38.1, "reward": 0.6494572758674622, "reward_std": 0.0382988415658474, "rewards/VisualizationJSONCombinedORM/mean": 0.6494572758674622, "rewards/VisualizationJSONCombinedORM/std": 0.13949072360992432, "step": 4417, "train_speed(iter/s)": 0.146133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 301.1875, "completions/min_length": 221.0, "epoch": 3.654259718775848, "grad_norm": 0.19532504677772522, "kl": 0.0672607421875, "learning_rate": 2.0494058510380453e-06, "loss": 0.0006735429633408785, "memory(GiB)": 38.1, "reward": 0.6004490852355957, "reward_std": 0.07677547633647919, "rewards/VisualizationJSONCombinedORM/mean": 0.6004490852355957, "rewards/VisualizationJSONCombinedORM/std": 0.13182038068771362, "step": 4418, "train_speed(iter/s)": 0.146041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 309.0, "completions/min_length": 251.0, "epoch": 3.6550868486352357, "grad_norm": 0.2828711271286011, "kl": 0.06805419921875, "learning_rate": 2.04707522253761e-06, "loss": 0.000680144876241684, "memory(GiB)": 38.1, "reward": 0.6662569642066956, "reward_std": 0.0858142226934433, "rewards/VisualizationJSONCombinedORM/mean": 0.6662569642066956, "rewards/VisualizationJSONCombinedORM/std": 0.15144096314907074, "step": 4419, "train_speed(iter/s)": 0.145922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 312.75, "completions/min_length": 251.0, "epoch": 3.6559139784946235, "grad_norm": 0.21985454857349396, "kl": 0.0361328125, "learning_rate": 2.0447455788514105e-06, "loss": 0.00036135315895080566, "memory(GiB)": 38.1, "reward": 0.8051959276199341, "reward_std": 0.02003713697195053, "rewards/VisualizationJSONCombinedORM/mean": 0.8051959276199341, "rewards/VisualizationJSONCombinedORM/std": 0.059197310358285904, "step": 4420, "train_speed(iter/s)": 0.145835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 293.25, "completions/min_length": 258.0, "epoch": 3.6567411083540113, "grad_norm": 0.23245227336883545, "kl": 0.083740234375, "learning_rate": 2.0424169207563954e-06, "loss": 0.0008364543318748474, "memory(GiB)": 38.1, "reward": 0.33132877945899963, "reward_std": 0.03213914483785629, "rewards/VisualizationJSONCombinedORM/mean": 0.33132877945899963, "rewards/VisualizationJSONCombinedORM/std": 0.04205701872706413, "step": 4421, "train_speed(iter/s)": 0.145764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 320.6875, "completions/min_length": 230.0, "epoch": 3.6575682382133996, "grad_norm": 0.20470993220806122, "kl": 0.03533935546875, "learning_rate": 2.0400892490291795e-06, "loss": 0.0003522038459777832, "memory(GiB)": 38.1, "reward": 0.6888868808746338, "reward_std": 0.04945005476474762, "rewards/VisualizationJSONCombinedORM/mean": 0.6888868808746338, "rewards/VisualizationJSONCombinedORM/std": 0.15829138457775116, "step": 4422, "train_speed(iter/s)": 0.14564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 287.3125, "completions/min_length": 222.0, "epoch": 3.6583953680727874, "grad_norm": 0.16959089040756226, "kl": 0.08502197265625, "learning_rate": 2.037762564446055e-06, "loss": 0.000849992036819458, "memory(GiB)": 38.1, "reward": 0.7566247582435608, "reward_std": 0.04176151379942894, "rewards/VisualizationJSONCombinedORM/mean": 0.7566247582435608, "rewards/VisualizationJSONCombinedORM/std": 0.05279770493507385, "step": 4423, "train_speed(iter/s)": 0.145517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 326.125, "completions/min_length": 270.0, "epoch": 3.6592224979321752, "grad_norm": 0.21360252797603607, "kl": 0.1080322265625, "learning_rate": 2.035436867782981e-06, "loss": 0.001080334186553955, "memory(GiB)": 38.1, "reward": 0.4698418378829956, "reward_std": 0.06076361611485481, "rewards/VisualizationJSONCombinedORM/mean": 0.4698418378829956, "rewards/VisualizationJSONCombinedORM/std": 0.12161868065595627, "step": 4424, "train_speed(iter/s)": 0.145417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 324.875, "completions/min_length": 226.0, "epoch": 3.6600496277915635, "grad_norm": 0.18504105508327484, "kl": 0.03131103515625, "learning_rate": 2.0331121598155905e-06, "loss": 0.0003130212426185608, "memory(GiB)": 38.1, "reward": 0.4161365032196045, "reward_std": 0.04615921527147293, "rewards/VisualizationJSONCombinedORM/mean": 0.4161365032196045, "rewards/VisualizationJSONCombinedORM/std": 0.1596243977546692, "step": 4425, "train_speed(iter/s)": 0.145318 }, { "epoch": 3.6600496277915635, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 379.2083333333333, "eval_completions/mean_length": 311.578125, "eval_completions/min_length": 251.79166666666666, "eval_kl": 0.073211669921875, "eval_loss": 0.0007445647497661412, "eval_reward": 0.4326996225863695, "eval_reward_std": 0.044995553170641266, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4326996225863695, "eval_rewards/VisualizationJSONCombinedORM/std": 0.044995552588564657, "eval_runtime": 319.3812, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 4425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 311.875, "completions/min_length": 252.0, "epoch": 3.6608767576509513, "grad_norm": 0.20981381833553314, "kl": 0.0679931640625, "learning_rate": 2.0307884413191794e-06, "loss": 0.0006792079657316208, "memory(GiB)": 38.1, "reward": 0.6039369702339172, "reward_std": 0.061555467545986176, "rewards/VisualizationJSONCombinedORM/mean": 0.6039369702339172, "rewards/VisualizationJSONCombinedORM/std": 0.0887698382139206, "step": 4426, "train_speed(iter/s)": 0.143719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 337.1875, "completions/min_length": 263.0, "epoch": 3.661703887510339, "grad_norm": 0.16510199010372162, "kl": 0.05621337890625, "learning_rate": 2.028465713068725e-06, "loss": 0.0005618985742330551, "memory(GiB)": 38.1, "reward": 0.31480252742767334, "reward_std": 0.018066639080643654, "rewards/VisualizationJSONCombinedORM/mean": 0.31480252742767334, "rewards/VisualizationJSONCombinedORM/std": 0.116568423807621, "step": 4427, "train_speed(iter/s)": 0.143632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 300.625, "completions/min_length": 264.0, "epoch": 3.662531017369727, "grad_norm": 0.22520531713962555, "kl": 0.05450439453125, "learning_rate": 2.0261439758388637e-06, "loss": 0.0005451962351799011, "memory(GiB)": 38.1, "reward": 0.4025159180164337, "reward_std": 0.04274006187915802, "rewards/VisualizationJSONCombinedORM/mean": 0.4025159180164337, "rewards/VisualizationJSONCombinedORM/std": 0.06554897129535675, "step": 4428, "train_speed(iter/s)": 0.143553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 318.6875, "completions/min_length": 250.0, "epoch": 3.663358147229115, "grad_norm": 0.18708345293998718, "kl": 0.0797119140625, "learning_rate": 2.023823230403907e-06, "loss": 0.0007969029247760773, "memory(GiB)": 38.1, "reward": 0.7227416038513184, "reward_std": 0.031015798449516296, "rewards/VisualizationJSONCombinedORM/mean": 0.7227416038513184, "rewards/VisualizationJSONCombinedORM/std": 0.03714507818222046, "step": 4429, "train_speed(iter/s)": 0.143459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 296.9375, "completions/min_length": 247.0, "epoch": 3.664185277088503, "grad_norm": 0.17426814138889313, "kl": 0.0926513671875, "learning_rate": 2.0215034775378336e-06, "loss": 0.0009255111217498779, "memory(GiB)": 38.1, "reward": 0.5432248115539551, "reward_std": 0.05148138478398323, "rewards/VisualizationJSONCombinedORM/mean": 0.5432248115539551, "rewards/VisualizationJSONCombinedORM/std": 0.17764601111412048, "step": 4430, "train_speed(iter/s)": 0.143375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 322.8125, "completions/min_length": 269.0, "epoch": 3.665012406947891, "grad_norm": 0.1872449517250061, "kl": 0.03802490234375, "learning_rate": 2.019184718014293e-06, "loss": 0.00038039684295654297, "memory(GiB)": 38.1, "reward": 0.5333788394927979, "reward_std": 0.05281324312090874, "rewards/VisualizationJSONCombinedORM/mean": 0.5333788394927979, "rewards/VisualizationJSONCombinedORM/std": 0.09725984930992126, "step": 4431, "train_speed(iter/s)": 0.143291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 335.0625, "completions/min_length": 238.0, "epoch": 3.6658395368072787, "grad_norm": 0.19091452658176422, "kl": 0.06732177734375, "learning_rate": 2.0168669526066044e-06, "loss": 0.000672508031129837, "memory(GiB)": 38.1, "reward": 0.5033513307571411, "reward_std": 0.025986865162849426, "rewards/VisualizationJSONCombinedORM/mean": 0.5033513307571411, "rewards/VisualizationJSONCombinedORM/std": 0.13275234401226044, "step": 4432, "train_speed(iter/s)": 0.143205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 297.75, "completions/min_length": 231.0, "epoch": 3.6666666666666665, "grad_norm": 0.22930209338665009, "kl": 0.082275390625, "learning_rate": 2.0145501820877468e-06, "loss": 0.0008228793740272522, "memory(GiB)": 38.1, "reward": 0.5985690355300903, "reward_std": 0.06270796805620193, "rewards/VisualizationJSONCombinedORM/mean": 0.5985690355300903, "rewards/VisualizationJSONCombinedORM/std": 0.13623535633087158, "step": 4433, "train_speed(iter/s)": 0.143089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 324.5, "completions/min_length": 255.0, "epoch": 3.6674937965260543, "grad_norm": 0.11570949852466583, "kl": 0.0244140625, "learning_rate": 2.012234407230382e-06, "loss": 0.00024472689256072044, "memory(GiB)": 38.1, "reward": 0.5851728916168213, "reward_std": 0.01010800525546074, "rewards/VisualizationJSONCombinedORM/mean": 0.5851728916168213, "rewards/VisualizationJSONCombinedORM/std": 0.013913308270275593, "step": 4434, "train_speed(iter/s)": 0.142996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 312.8125, "completions/min_length": 234.0, "epoch": 3.6683209263854426, "grad_norm": 0.30105963349342346, "kl": 0.06219482421875, "learning_rate": 2.009919628806826e-06, "loss": 0.000622827559709549, "memory(GiB)": 38.1, "reward": 0.5635235905647278, "reward_std": 0.0803002417087555, "rewards/VisualizationJSONCombinedORM/mean": 0.5635235905647278, "rewards/VisualizationJSONCombinedORM/std": 0.2075900137424469, "step": 4435, "train_speed(iter/s)": 0.142884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 301.5625, "completions/min_length": 231.0, "epoch": 3.6691480562448304, "grad_norm": 0.2947137653827667, "kl": 0.1495361328125, "learning_rate": 2.007605847589071e-06, "loss": 0.001497127115726471, "memory(GiB)": 38.1, "reward": 0.45207712054252625, "reward_std": 0.08198106288909912, "rewards/VisualizationJSONCombinedORM/mean": 0.45207712054252625, "rewards/VisualizationJSONCombinedORM/std": 0.23638038337230682, "step": 4436, "train_speed(iter/s)": 0.142768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 319.6875, "completions/min_length": 254.0, "epoch": 3.6699751861042182, "grad_norm": 0.2207127809524536, "kl": 0.046630859375, "learning_rate": 2.005293064348773e-06, "loss": 0.00046672672033309937, "memory(GiB)": 38.1, "reward": 0.4332481026649475, "reward_std": 0.04489195719361305, "rewards/VisualizationJSONCombinedORM/mean": 0.4332481026649475, "rewards/VisualizationJSONCombinedORM/std": 0.04451370984315872, "step": 4437, "train_speed(iter/s)": 0.14267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 286.1875, "completions/min_length": 255.0, "epoch": 3.6708023159636065, "grad_norm": 0.2651973366737366, "kl": 0.09710693359375, "learning_rate": 2.002981279857257e-06, "loss": 0.000968538224697113, "memory(GiB)": 38.1, "reward": 0.44042515754699707, "reward_std": 0.05894637852907181, "rewards/VisualizationJSONCombinedORM/mean": 0.44042515754699707, "rewards/VisualizationJSONCombinedORM/std": 0.05876437947154045, "step": 4438, "train_speed(iter/s)": 0.142577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 309.6875, "completions/min_length": 238.0, "epoch": 3.6716294458229943, "grad_norm": 0.1936643123626709, "kl": 0.0870361328125, "learning_rate": 2.000670494885511e-06, "loss": 0.0008685067296028137, "memory(GiB)": 38.1, "reward": 0.569447934627533, "reward_std": 0.0355309396982193, "rewards/VisualizationJSONCombinedORM/mean": 0.569447934627533, "rewards/VisualizationJSONCombinedORM/std": 0.17267964780330658, "step": 4439, "train_speed(iter/s)": 0.142484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 296.5625, "completions/min_length": 218.0, "epoch": 3.672456575682382, "grad_norm": 0.2150871753692627, "kl": 0.08062744140625, "learning_rate": 1.9983607102041974e-06, "loss": 0.0008073300123214722, "memory(GiB)": 38.1, "reward": 0.639926016330719, "reward_std": 0.09618154913187027, "rewards/VisualizationJSONCombinedORM/mean": 0.639926016330719, "rewards/VisualizationJSONCombinedORM/std": 0.09508426487445831, "step": 4440, "train_speed(iter/s)": 0.142403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 316.0, "completions/min_length": 251.0, "epoch": 3.67328370554177, "grad_norm": 0.1739567518234253, "kl": 0.06536865234375, "learning_rate": 1.996051926583636e-06, "loss": 0.0006536468863487244, "memory(GiB)": 38.1, "reward": 0.4767296612262726, "reward_std": 0.03706752508878708, "rewards/VisualizationJSONCombinedORM/mean": 0.4767296612262726, "rewards/VisualizationJSONCombinedORM/std": 0.19511274993419647, "step": 4441, "train_speed(iter/s)": 0.142312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 304.1875, "completions/min_length": 229.0, "epoch": 3.674110835401158, "grad_norm": 0.3129945993423462, "kl": 0.0992431640625, "learning_rate": 1.9937441447938182e-06, "loss": 0.0009929686784744263, "memory(GiB)": 38.1, "reward": 0.5200687646865845, "reward_std": 0.05807890370488167, "rewards/VisualizationJSONCombinedORM/mean": 0.5200687646865845, "rewards/VisualizationJSONCombinedORM/std": 0.19607847929000854, "step": 4442, "train_speed(iter/s)": 0.142221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 286.9375, "completions/min_length": 231.0, "epoch": 3.674937965260546, "grad_norm": 0.2078964114189148, "kl": 0.1429443359375, "learning_rate": 1.991437365604401e-06, "loss": 0.001431681215763092, "memory(GiB)": 38.1, "reward": 0.7515047788619995, "reward_std": 0.042179226875305176, "rewards/VisualizationJSONCombinedORM/mean": 0.7515047788619995, "rewards/VisualizationJSONCombinedORM/std": 0.0491596944630146, "step": 4443, "train_speed(iter/s)": 0.142112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/mean_length": 331.5625, "completions/min_length": 263.0, "epoch": 3.675765095119934, "grad_norm": 0.16993041336536407, "kl": 0.0948486328125, "learning_rate": 1.9891315897847064e-06, "loss": 0.0009497776627540588, "memory(GiB)": 38.1, "reward": 0.5079708099365234, "reward_std": 0.06507918238639832, "rewards/VisualizationJSONCombinedORM/mean": 0.5079708099365234, "rewards/VisualizationJSONCombinedORM/std": 0.09549389779567719, "step": 4444, "train_speed(iter/s)": 0.141987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 274.5625, "completions/min_length": 230.0, "epoch": 3.6765922249793217, "grad_norm": 0.209366112947464, "kl": 0.056640625, "learning_rate": 1.9868268181037186e-06, "loss": 0.0005656257271766663, "memory(GiB)": 38.1, "reward": 0.40112876892089844, "reward_std": 0.04957873746752739, "rewards/VisualizationJSONCombinedORM/mean": 0.40112876892089844, "rewards/VisualizationJSONCombinedORM/std": 0.05039539933204651, "step": 4445, "train_speed(iter/s)": 0.141888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 318.875, "completions/min_length": 270.0, "epoch": 3.6774193548387095, "grad_norm": 0.18123328685760498, "kl": 0.093505859375, "learning_rate": 1.9845230513300922e-06, "loss": 0.0009363889694213867, "memory(GiB)": 38.1, "reward": 0.5472064018249512, "reward_std": 0.048643484711647034, "rewards/VisualizationJSONCombinedORM/mean": 0.5472064018249512, "rewards/VisualizationJSONCombinedORM/std": 0.0692901536822319, "step": 4446, "train_speed(iter/s)": 0.141808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 315.125, "completions/min_length": 236.0, "epoch": 3.6782464846980973, "grad_norm": 0.18201488256454468, "kl": 0.0594482421875, "learning_rate": 1.982220290232143e-06, "loss": 0.0005949176847934723, "memory(GiB)": 38.1, "reward": 0.6397022604942322, "reward_std": 0.07403427362442017, "rewards/VisualizationJSONCombinedORM/mean": 0.6397022604942322, "rewards/VisualizationJSONCombinedORM/std": 0.11545054614543915, "step": 4447, "train_speed(iter/s)": 0.14171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 312.1875, "completions/min_length": 245.0, "epoch": 3.6790736145574856, "grad_norm": 0.21043433248996735, "kl": 0.071044921875, "learning_rate": 1.979918535577855e-06, "loss": 0.000710412859916687, "memory(GiB)": 38.1, "reward": 0.411643922328949, "reward_std": 0.0759488195180893, "rewards/VisualizationJSONCombinedORM/mean": 0.411643922328949, "rewards/VisualizationJSONCombinedORM/std": 0.14572006464004517, "step": 4448, "train_speed(iter/s)": 0.14161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 302.75, "completions/min_length": 246.0, "epoch": 3.6799007444168734, "grad_norm": 0.287098228931427, "kl": 0.09027099609375, "learning_rate": 1.977617788134869e-06, "loss": 0.0009044324979186058, "memory(GiB)": 38.1, "reward": 0.581317126750946, "reward_std": 0.0949598103761673, "rewards/VisualizationJSONCombinedORM/mean": 0.581317126750946, "rewards/VisualizationJSONCombinedORM/std": 0.10399533063173294, "step": 4449, "train_speed(iter/s)": 0.141526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 312.9375, "completions/min_length": 226.0, "epoch": 3.6807278742762612, "grad_norm": 0.21700847148895264, "kl": 0.11248779296875, "learning_rate": 1.9753180486705013e-06, "loss": 0.001123093068599701, "memory(GiB)": 38.1, "reward": 0.5727353692054749, "reward_std": 0.043637704104185104, "rewards/VisualizationJSONCombinedORM/mean": 0.5727353692054749, "rewards/VisualizationJSONCombinedORM/std": 0.17465831339359283, "step": 4450, "train_speed(iter/s)": 0.14142 }, { "epoch": 3.6807278742762612, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 364.5833333333333, "eval_completions/mean_length": 308.84375, "eval_completions/min_length": 255.125, "eval_kl": 0.07558186848958333, "eval_loss": 0.0007597158546559513, "eval_reward": 0.4385921222468217, "eval_reward_std": 0.042850702380140625, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4385921222468217, "eval_rewards/VisualizationJSONCombinedORM/std": 0.042850703039827444, "eval_runtime": 310.4735, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 4450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 323.6875, "completions/min_length": 263.0, "epoch": 3.6815550041356495, "grad_norm": 0.20582307875156403, "kl": 0.1087646484375, "learning_rate": 1.9730193179517216e-06, "loss": 0.0010882839560508728, "memory(GiB)": 38.1, "reward": 0.6666254997253418, "reward_std": 0.08011582493782043, "rewards/VisualizationJSONCombinedORM/mean": 0.6666254997253418, "rewards/VisualizationJSONCombinedORM/std": 0.10500841587781906, "step": 4451, "train_speed(iter/s)": 0.139959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 311.1875, "completions/min_length": 263.0, "epoch": 3.6823821339950373, "grad_norm": 0.177106574177742, "kl": 0.06317138671875, "learning_rate": 1.970721596745169e-06, "loss": 0.0006319992244243622, "memory(GiB)": 38.1, "reward": 0.5722681879997253, "reward_std": 0.062269099056720734, "rewards/VisualizationJSONCombinedORM/mean": 0.5722681879997253, "rewards/VisualizationJSONCombinedORM/std": 0.07431337982416153, "step": 4452, "train_speed(iter/s)": 0.139876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 288.25, "completions/min_length": 218.0, "epoch": 3.683209263854425, "grad_norm": 0.1856181025505066, "kl": 0.1292724609375, "learning_rate": 1.968424885817143e-06, "loss": 0.0012923553586006165, "memory(GiB)": 38.1, "reward": 0.3970642387866974, "reward_std": 0.07089720666408539, "rewards/VisualizationJSONCombinedORM/mean": 0.3970642387866974, "rewards/VisualizationJSONCombinedORM/std": 0.10432002693414688, "step": 4453, "train_speed(iter/s)": 0.139799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 310.5625, "completions/min_length": 244.0, "epoch": 3.684036393713813, "grad_norm": 0.1633487045764923, "kl": 0.06024169921875, "learning_rate": 1.9661291859336103e-06, "loss": 0.0006010532379150391, "memory(GiB)": 38.1, "reward": 0.5419535636901855, "reward_std": 0.017128707841038704, "rewards/VisualizationJSONCombinedORM/mean": 0.5419535636901855, "rewards/VisualizationJSONCombinedORM/std": 0.22269923985004425, "step": 4454, "train_speed(iter/s)": 0.139708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 316.8125, "completions/min_length": 235.0, "epoch": 3.684863523573201, "grad_norm": 0.2191779911518097, "kl": 0.1260986328125, "learning_rate": 1.963834497860192e-06, "loss": 0.001258745789527893, "memory(GiB)": 38.1, "reward": 0.4033524692058563, "reward_std": 0.036214858293533325, "rewards/VisualizationJSONCombinedORM/mean": 0.4033524692058563, "rewards/VisualizationJSONCombinedORM/std": 0.20088399946689606, "step": 4455, "train_speed(iter/s)": 0.139606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 280.0625, "completions/min_length": 224.0, "epoch": 3.685690653432589, "grad_norm": 0.21641084551811218, "kl": 0.1234130859375, "learning_rate": 1.9615408223621848e-06, "loss": 0.0012328922748565674, "memory(GiB)": 38.1, "reward": 0.6246309280395508, "reward_std": 0.12061381340026855, "rewards/VisualizationJSONCombinedORM/mean": 0.6246309280395508, "rewards/VisualizationJSONCombinedORM/std": 0.11721630394458771, "step": 4456, "train_speed(iter/s)": 0.139528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 335.625, "completions/min_length": 226.0, "epoch": 3.686517783291977, "grad_norm": 0.1742507815361023, "kl": 0.04010009765625, "learning_rate": 1.959248160204534e-06, "loss": 0.00040046870708465576, "memory(GiB)": 38.1, "reward": 0.6668314933776855, "reward_std": 0.07325546443462372, "rewards/VisualizationJSONCombinedORM/mean": 0.6668314933776855, "rewards/VisualizationJSONCombinedORM/std": 0.11976904422044754, "step": 4457, "train_speed(iter/s)": 0.139415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 296.8125, "completions/min_length": 222.0, "epoch": 3.6873449131513647, "grad_norm": 0.17531631886959076, "kl": 0.0625, "learning_rate": 1.956956512151856e-06, "loss": 0.0006248503923416138, "memory(GiB)": 38.1, "reward": 0.4828648269176483, "reward_std": 0.06270988285541534, "rewards/VisualizationJSONCombinedORM/mean": 0.4828648269176483, "rewards/VisualizationJSONCombinedORM/std": 0.20504751801490784, "step": 4458, "train_speed(iter/s)": 0.139333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 323.0625, "completions/min_length": 272.0, "epoch": 3.688172043010753, "grad_norm": 0.2416405975818634, "kl": 0.089111328125, "learning_rate": 1.954665878968425e-06, "loss": 0.0008920971304178238, "memory(GiB)": 38.1, "reward": 0.4750140309333801, "reward_std": 0.043527714908123016, "rewards/VisualizationJSONCombinedORM/mean": 0.4750140309333801, "rewards/VisualizationJSONCombinedORM/std": 0.0632709264755249, "step": 4459, "train_speed(iter/s)": 0.13927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 313.125, "completions/min_length": 239.0, "epoch": 3.6889991728701403, "grad_norm": 0.21129855513572693, "kl": 0.044677734375, "learning_rate": 1.95237626141818e-06, "loss": 0.0004467591643333435, "memory(GiB)": 38.1, "reward": 0.5601096749305725, "reward_std": 0.05838003009557724, "rewards/VisualizationJSONCombinedORM/mean": 0.5601096749305725, "rewards/VisualizationJSONCombinedORM/std": 0.24484704434871674, "step": 4460, "train_speed(iter/s)": 0.13919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 284.3125, "completions/min_length": 227.0, "epoch": 3.6898263027295286, "grad_norm": 0.25217205286026, "kl": 0.063232421875, "learning_rate": 1.9500876602647167e-06, "loss": 0.0006327517330646515, "memory(GiB)": 38.1, "reward": 0.6468088626861572, "reward_std": 0.09230609238147736, "rewards/VisualizationJSONCombinedORM/mean": 0.6468088626861572, "rewards/VisualizationJSONCombinedORM/std": 0.09127438068389893, "step": 4461, "train_speed(iter/s)": 0.139089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 329.625, "completions/min_length": 287.0, "epoch": 3.6906534325889164, "grad_norm": 0.1812768578529358, "kl": 0.060546875, "learning_rate": 1.947800076271295e-06, "loss": 0.0006056949496269226, "memory(GiB)": 38.1, "reward": 0.6690551042556763, "reward_std": 0.056627947837114334, "rewards/VisualizationJSONCombinedORM/mean": 0.6690551042556763, "rewards/VisualizationJSONCombinedORM/std": 0.07526352256536484, "step": 4462, "train_speed(iter/s)": 0.139025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 309.125, "completions/min_length": 264.0, "epoch": 3.6914805624483042, "grad_norm": 0.1868017166852951, "kl": 0.08184814453125, "learning_rate": 1.945513510200835e-06, "loss": 0.0008188188076019287, "memory(GiB)": 38.1, "reward": 0.5894688963890076, "reward_std": 0.04521424323320389, "rewards/VisualizationJSONCombinedORM/mean": 0.5894688963890076, "rewards/VisualizationJSONCombinedORM/std": 0.06036766618490219, "step": 4463, "train_speed(iter/s)": 0.138932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 313.4375, "completions/min_length": 238.0, "epoch": 3.6923076923076925, "grad_norm": 0.14751553535461426, "kl": 0.081787109375, "learning_rate": 1.9432279628159188e-06, "loss": 0.0008193813264369965, "memory(GiB)": 38.1, "reward": 0.5162839889526367, "reward_std": 0.03684673830866814, "rewards/VisualizationJSONCombinedORM/mean": 0.5162839889526367, "rewards/VisualizationJSONCombinedORM/std": 0.07479778677225113, "step": 4464, "train_speed(iter/s)": 0.138859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 311.0625, "completions/min_length": 256.0, "epoch": 3.6931348221670803, "grad_norm": 0.16060440242290497, "kl": 0.03887939453125, "learning_rate": 1.9409434348787824e-06, "loss": 0.000387558713555336, "memory(GiB)": 38.1, "reward": 0.37478840351104736, "reward_std": 0.034941595047712326, "rewards/VisualizationJSONCombinedORM/mean": 0.37478840351104736, "rewards/VisualizationJSONCombinedORM/std": 0.18486061692237854, "step": 4465, "train_speed(iter/s)": 0.138775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 306.0625, "completions/min_length": 239.0, "epoch": 3.693961952026468, "grad_norm": 0.1862695813179016, "kl": 0.0478515625, "learning_rate": 1.938659927151334e-06, "loss": 0.0004790574312210083, "memory(GiB)": 38.1, "reward": 0.3999531865119934, "reward_std": 0.044007349759340286, "rewards/VisualizationJSONCombinedORM/mean": 0.3999531865119934, "rewards/VisualizationJSONCombinedORM/std": 0.09474950283765793, "step": 4466, "train_speed(iter/s)": 0.138689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 313.625, "completions/min_length": 253.0, "epoch": 3.694789081885856, "grad_norm": 0.1580415964126587, "kl": 0.0419921875, "learning_rate": 1.9363774403951274e-06, "loss": 0.00041928142309188843, "memory(GiB)": 38.1, "reward": 0.6075505614280701, "reward_std": 0.05788631737232208, "rewards/VisualizationJSONCombinedORM/mean": 0.6075505614280701, "rewards/VisualizationJSONCombinedORM/std": 0.1371135413646698, "step": 4467, "train_speed(iter/s)": 0.138592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 341.625, "completions/min_length": 281.0, "epoch": 3.695616211745244, "grad_norm": 0.3312670588493347, "kl": 0.06390380859375, "learning_rate": 1.9340959753713856e-06, "loss": 0.0006380379199981689, "memory(GiB)": 38.1, "reward": 0.44078874588012695, "reward_std": 0.21298284828662872, "rewards/VisualizationJSONCombinedORM/mean": 0.44078874588012695, "rewards/VisualizationJSONCombinedORM/std": 0.29456889629364014, "step": 4468, "train_speed(iter/s)": 0.138508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 291.1875, "completions/min_length": 237.0, "epoch": 3.696443341604632, "grad_norm": 0.1802363246679306, "kl": 0.05316162109375, "learning_rate": 1.931815532840987e-06, "loss": 0.0005318969488143921, "memory(GiB)": 38.1, "reward": 0.4930018186569214, "reward_std": 0.0463838055729866, "rewards/VisualizationJSONCombinedORM/mean": 0.4930018186569214, "rewards/VisualizationJSONCombinedORM/std": 0.13999751210212708, "step": 4469, "train_speed(iter/s)": 0.138427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 303.75, "completions/min_length": 247.0, "epoch": 3.69727047146402, "grad_norm": 0.1681516170501709, "kl": 0.06854248046875, "learning_rate": 1.9295361135644724e-06, "loss": 0.0006850399076938629, "memory(GiB)": 38.1, "reward": 0.4789656102657318, "reward_std": 0.045392900705337524, "rewards/VisualizationJSONCombinedORM/mean": 0.4789656102657318, "rewards/VisualizationJSONCombinedORM/std": 0.2733277380466461, "step": 4470, "train_speed(iter/s)": 0.138357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 308.4375, "completions/min_length": 248.0, "epoch": 3.6980976013234077, "grad_norm": 0.16133275628089905, "kl": 0.02459716796875, "learning_rate": 1.927257718302033e-06, "loss": 0.000246390700340271, "memory(GiB)": 38.1, "reward": 0.6819610595703125, "reward_std": 0.06396781653165817, "rewards/VisualizationJSONCombinedORM/mean": 0.6819610595703125, "rewards/VisualizationJSONCombinedORM/std": 0.07179085910320282, "step": 4471, "train_speed(iter/s)": 0.138272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 332.4375, "completions/min_length": 235.0, "epoch": 3.698924731182796, "grad_norm": 0.15957807004451752, "kl": 0.04803466796875, "learning_rate": 1.9249803478135317e-06, "loss": 0.00048043933929875493, "memory(GiB)": 38.1, "reward": 0.6859489679336548, "reward_std": 0.028949324041604996, "rewards/VisualizationJSONCombinedORM/mean": 0.6859489679336548, "rewards/VisualizationJSONCombinedORM/std": 0.08399604260921478, "step": 4472, "train_speed(iter/s)": 0.138171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 323.0, "completions/min_length": 254.0, "epoch": 3.699751861042184, "grad_norm": 0.31640034914016724, "kl": 0.05499267578125, "learning_rate": 1.922704002858476e-06, "loss": 0.0005490332841873169, "memory(GiB)": 38.1, "reward": 0.6516426801681519, "reward_std": 0.09737580269575119, "rewards/VisualizationJSONCombinedORM/mean": 0.6516426801681519, "rewards/VisualizationJSONCombinedORM/std": 0.15342465043067932, "step": 4473, "train_speed(iter/s)": 0.138097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 314.75, "completions/min_length": 243.0, "epoch": 3.7005789909015716, "grad_norm": 0.17289967834949493, "kl": 0.0687255859375, "learning_rate": 1.920428684196041e-06, "loss": 0.0006856583058834076, "memory(GiB)": 38.1, "reward": 0.7330412864685059, "reward_std": 0.07464771717786789, "rewards/VisualizationJSONCombinedORM/mean": 0.7330412864685059, "rewards/VisualizationJSONCombinedORM/std": 0.07391975075006485, "step": 4474, "train_speed(iter/s)": 0.138013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 328.0, "completions/min_length": 267.0, "epoch": 3.7014061207609594, "grad_norm": 0.21497410535812378, "kl": 0.10107421875, "learning_rate": 1.9181543925850544e-06, "loss": 0.0010112002491950989, "memory(GiB)": 38.1, "reward": 0.6270381212234497, "reward_std": 0.0438300296664238, "rewards/VisualizationJSONCombinedORM/mean": 0.6270381212234497, "rewards/VisualizationJSONCombinedORM/std": 0.22072839736938477, "step": 4475, "train_speed(iter/s)": 0.137936 }, { "epoch": 3.7014061207609594, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 364.3333333333333, "eval_completions/mean_length": 311.171875, "eval_completions/min_length": 258.5416666666667, "eval_kl": 0.07688395182291667, "eval_loss": 0.0007773202960379422, "eval_reward": 0.43162602248291176, "eval_reward_std": 0.05004397428516919, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.43162602248291176, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05004397539111475, "eval_runtime": 310.7424, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 4475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 300.75, "completions/min_length": 209.0, "epoch": 3.7022332506203472, "grad_norm": 0.26545265316963196, "kl": 0.07110595703125, "learning_rate": 1.9158811287840064e-06, "loss": 0.00070953369140625, "memory(GiB)": 38.1, "reward": 0.6748697757720947, "reward_std": 0.09136277437210083, "rewards/VisualizationJSONCombinedORM/mean": 0.6748697757720947, "rewards/VisualizationJSONCombinedORM/std": 0.13625016808509827, "step": 4476, "train_speed(iter/s)": 0.136554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 298.5, "completions/min_length": 234.0, "epoch": 3.7030603804797355, "grad_norm": 0.2873138189315796, "kl": 0.040313720703125, "learning_rate": 1.913608893551036e-06, "loss": 0.0004038885235786438, "memory(GiB)": 38.1, "reward": 0.6976702213287354, "reward_std": 0.1016017496585846, "rewards/VisualizationJSONCombinedORM/mean": 0.6976702213287354, "rewards/VisualizationJSONCombinedORM/std": 0.12738406658172607, "step": 4477, "train_speed(iter/s)": 0.136477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 316.625, "completions/min_length": 224.0, "epoch": 3.7038875103391233, "grad_norm": 0.23367539048194885, "kl": 0.06878662109375, "learning_rate": 1.9113376876439477e-06, "loss": 0.0006878003478050232, "memory(GiB)": 38.1, "reward": 0.4481596350669861, "reward_std": 0.06006740778684616, "rewards/VisualizationJSONCombinedORM/mean": 0.4481596350669861, "rewards/VisualizationJSONCombinedORM/std": 0.07318174093961716, "step": 4478, "train_speed(iter/s)": 0.136372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 295.5, "completions/min_length": 239.0, "epoch": 3.704714640198511, "grad_norm": 0.2033822238445282, "kl": 0.0753173828125, "learning_rate": 1.9090675118201977e-06, "loss": 0.0007529668509960175, "memory(GiB)": 38.1, "reward": 0.6687496900558472, "reward_std": 0.07950368523597717, "rewards/VisualizationJSONCombinedORM/mean": 0.6687496900558472, "rewards/VisualizationJSONCombinedORM/std": 0.08439881354570389, "step": 4479, "train_speed(iter/s)": 0.136304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 303.4375, "completions/min_length": 234.0, "epoch": 3.705541770057899, "grad_norm": 0.19134169816970825, "kl": 0.05938720703125, "learning_rate": 1.9067983668369038e-06, "loss": 0.0005934573709964752, "memory(GiB)": 38.1, "reward": 0.5326033234596252, "reward_std": 0.05228082835674286, "rewards/VisualizationJSONCombinedORM/mean": 0.5326033234596252, "rewards/VisualizationJSONCombinedORM/std": 0.11850772798061371, "step": 4480, "train_speed(iter/s)": 0.136237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 304.6875, "completions/min_length": 221.0, "epoch": 3.706368899917287, "grad_norm": 0.27325090765953064, "kl": 0.0687255859375, "learning_rate": 1.9045302534508298e-06, "loss": 0.0006874687969684601, "memory(GiB)": 38.1, "reward": 0.5325945019721985, "reward_std": 0.06621342897415161, "rewards/VisualizationJSONCombinedORM/mean": 0.5325945019721985, "rewards/VisualizationJSONCombinedORM/std": 0.18678100407123566, "step": 4481, "train_speed(iter/s)": 0.13617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 276.9375, "completions/min_length": 219.0, "epoch": 3.707196029776675, "grad_norm": 0.18623919785022736, "kl": 0.104736328125, "learning_rate": 1.9022631724184093e-06, "loss": 0.0010472089052200317, "memory(GiB)": 38.1, "reward": 0.6497452259063721, "reward_std": 0.07704474031925201, "rewards/VisualizationJSONCombinedORM/mean": 0.6497452259063721, "rewards/VisualizationJSONCombinedORM/std": 0.12398795783519745, "step": 4482, "train_speed(iter/s)": 0.136095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 332.75, "completions/min_length": 251.0, "epoch": 3.708023159636063, "grad_norm": 0.20058830082416534, "kl": 0.0718994140625, "learning_rate": 1.8999971244957199e-06, "loss": 0.0007206052541732788, "memory(GiB)": 38.1, "reward": 0.3307449519634247, "reward_std": 0.03654284030199051, "rewards/VisualizationJSONCombinedORM/mean": 0.3307449519634247, "rewards/VisualizationJSONCombinedORM/std": 0.08881569653749466, "step": 4483, "train_speed(iter/s)": 0.136006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 344.0, "completions/min_length": 277.0, "epoch": 3.7088502894954507, "grad_norm": 0.19287264347076416, "kl": 0.0380859375, "learning_rate": 1.8977321104385006e-06, "loss": 0.0003795735538005829, "memory(GiB)": 38.1, "reward": 0.2051497995853424, "reward_std": 0.008868809789419174, "rewards/VisualizationJSONCombinedORM/mean": 0.2051497995853424, "rewards/VisualizationJSONCombinedORM/std": 0.0506633035838604, "step": 4484, "train_speed(iter/s)": 0.135922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 297.75, "completions/min_length": 230.0, "epoch": 3.709677419354839, "grad_norm": 0.20168426632881165, "kl": 0.0469970703125, "learning_rate": 1.8954681310021434e-06, "loss": 0.00047001615166664124, "memory(GiB)": 38.1, "reward": 0.4785583019256592, "reward_std": 0.07373426109552383, "rewards/VisualizationJSONCombinedORM/mean": 0.4785583019256592, "rewards/VisualizationJSONCombinedORM/std": 0.1883004605770111, "step": 4485, "train_speed(iter/s)": 0.135841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 305.75, "completions/min_length": 238.0, "epoch": 3.710504549214227, "grad_norm": 0.2200694978237152, "kl": 0.060302734375, "learning_rate": 1.893205186941699e-06, "loss": 0.0006024353206157684, "memory(GiB)": 38.1, "reward": 0.6603696346282959, "reward_std": 0.07494062930345535, "rewards/VisualizationJSONCombinedORM/mean": 0.6603696346282959, "rewards/VisualizationJSONCombinedORM/std": 0.1153269112110138, "step": 4486, "train_speed(iter/s)": 0.135783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 315.0, "completions/min_length": 217.0, "epoch": 3.7113316790736146, "grad_norm": 0.1707535833120346, "kl": 0.04620361328125, "learning_rate": 1.8909432790118643e-06, "loss": 0.0004628896713256836, "memory(GiB)": 38.1, "reward": 0.5407034754753113, "reward_std": 0.030829021707177162, "rewards/VisualizationJSONCombinedORM/mean": 0.5407034754753113, "rewards/VisualizationJSONCombinedORM/std": 0.056088194251060486, "step": 4487, "train_speed(iter/s)": 0.135678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 263.5, "completions/min_length": 217.0, "epoch": 3.7121588089330024, "grad_norm": 0.17048193514347076, "kl": 0.032012939453125, "learning_rate": 1.8886824079670025e-06, "loss": 0.0003199651837348938, "memory(GiB)": 38.1, "reward": 0.6140236258506775, "reward_std": 0.09190647304058075, "rewards/VisualizationJSONCombinedORM/mean": 0.6140236258506775, "rewards/VisualizationJSONCombinedORM/std": 0.11210498958826065, "step": 4488, "train_speed(iter/s)": 0.135608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 292.75, "completions/min_length": 237.0, "epoch": 3.7129859387923903, "grad_norm": 0.22207291424274445, "kl": 0.18115234375, "learning_rate": 1.8864225745611197e-06, "loss": 0.0018118959851562977, "memory(GiB)": 38.1, "reward": 0.41040778160095215, "reward_std": 0.0815829187631607, "rewards/VisualizationJSONCombinedORM/mean": 0.41040778160095215, "rewards/VisualizationJSONCombinedORM/std": 0.17772483825683594, "step": 4489, "train_speed(iter/s)": 0.135533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 308.6875, "completions/min_length": 238.0, "epoch": 3.7138130686517785, "grad_norm": 0.22925753891468048, "kl": 0.0556640625, "learning_rate": 1.8841637795478835e-06, "loss": 0.0005569085478782654, "memory(GiB)": 38.1, "reward": 0.739608645439148, "reward_std": 0.05397956445813179, "rewards/VisualizationJSONCombinedORM/mean": 0.739608645439148, "rewards/VisualizationJSONCombinedORM/std": 0.08169915527105331, "step": 4490, "train_speed(iter/s)": 0.135455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 315.5625, "completions/min_length": 240.0, "epoch": 3.7146401985111663, "grad_norm": 0.1441790610551834, "kl": 0.02099609375, "learning_rate": 1.8819060236806114e-06, "loss": 0.00021045655012130737, "memory(GiB)": 38.1, "reward": 0.5379143953323364, "reward_std": 0.02174106054008007, "rewards/VisualizationJSONCombinedORM/mean": 0.5379143953323364, "rewards/VisualizationJSONCombinedORM/std": 0.1262965053319931, "step": 4491, "train_speed(iter/s)": 0.135374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 309.0625, "completions/min_length": 247.0, "epoch": 3.715467328370554, "grad_norm": 0.17751862108707428, "kl": 0.0634765625, "learning_rate": 1.8796493077122785e-06, "loss": 0.0006353110074996948, "memory(GiB)": 38.1, "reward": 0.3307327628135681, "reward_std": 0.012630362063646317, "rewards/VisualizationJSONCombinedORM/mean": 0.3307327628135681, "rewards/VisualizationJSONCombinedORM/std": 0.16084007918834686, "step": 4492, "train_speed(iter/s)": 0.135304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 343.6875, "completions/min_length": 268.0, "epoch": 3.716294458229942, "grad_norm": 0.24834200739860535, "kl": 0.082763671875, "learning_rate": 1.8773936323955055e-06, "loss": 0.0008282219059765339, "memory(GiB)": 38.1, "reward": 0.2795052230358124, "reward_std": 0.03361527621746063, "rewards/VisualizationJSONCombinedORM/mean": 0.2795052230358124, "rewards/VisualizationJSONCombinedORM/std": 0.0955730751156807, "step": 4493, "train_speed(iter/s)": 0.135212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 327.875, "completions/min_length": 257.0, "epoch": 3.71712158808933, "grad_norm": 0.2994110882282257, "kl": 0.08050537109375, "learning_rate": 1.875138998482573e-06, "loss": 0.0008065700531005859, "memory(GiB)": 38.1, "reward": 0.5472477674484253, "reward_std": 0.057217296212911606, "rewards/VisualizationJSONCombinedORM/mean": 0.5472477674484253, "rewards/VisualizationJSONCombinedORM/std": 0.183952197432518, "step": 4494, "train_speed(iter/s)": 0.135139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 302.1875, "completions/min_length": 240.0, "epoch": 3.717948717948718, "grad_norm": 0.24909429252147675, "kl": 0.2232666015625, "learning_rate": 1.872885406725412e-06, "loss": 0.0022340789437294006, "memory(GiB)": 38.1, "reward": 0.6418373584747314, "reward_std": 0.084806889295578, "rewards/VisualizationJSONCombinedORM/mean": 0.6418373584747314, "rewards/VisualizationJSONCombinedORM/std": 0.0873802900314331, "step": 4495, "train_speed(iter/s)": 0.135069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 298.3125, "completions/min_length": 241.0, "epoch": 3.718775847808106, "grad_norm": 0.19215154647827148, "kl": 0.14373779296875, "learning_rate": 1.8706328578756072e-06, "loss": 0.0014301985502243042, "memory(GiB)": 38.1, "reward": 0.3984520435333252, "reward_std": 0.05499331280589104, "rewards/VisualizationJSONCombinedORM/mean": 0.3984520435333252, "rewards/VisualizationJSONCombinedORM/std": 0.06416846066713333, "step": 4496, "train_speed(iter/s)": 0.134995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 354.0625, "completions/min_length": 246.0, "epoch": 3.7196029776674937, "grad_norm": 0.19388653337955475, "kl": 0.0816650390625, "learning_rate": 1.8683813526843898e-06, "loss": 0.0008165165781974792, "memory(GiB)": 38.1, "reward": 0.4581138491630554, "reward_std": 0.05023710057139397, "rewards/VisualizationJSONCombinedORM/mean": 0.4581138491630554, "rewards/VisualizationJSONCombinedORM/std": 0.04936665669083595, "step": 4497, "train_speed(iter/s)": 0.134926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 333.75, "completions/min_length": 270.0, "epoch": 3.720430107526882, "grad_norm": 0.22573363780975342, "kl": 0.1663818359375, "learning_rate": 1.8661308919026533e-06, "loss": 0.0016635581851005554, "memory(GiB)": 38.1, "reward": 0.6541741490364075, "reward_std": 0.08064550161361694, "rewards/VisualizationJSONCombinedORM/mean": 0.6541741490364075, "rewards/VisualizationJSONCombinedORM/std": 0.08568704128265381, "step": 4498, "train_speed(iter/s)": 0.134857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 321.8125, "completions/min_length": 260.0, "epoch": 3.72125723738627, "grad_norm": 0.18694210052490234, "kl": 0.0794677734375, "learning_rate": 1.8638814762809327e-06, "loss": 0.0007942765951156616, "memory(GiB)": 38.1, "reward": 0.5988044738769531, "reward_std": 0.05638293921947479, "rewards/VisualizationJSONCombinedORM/mean": 0.5988044738769531, "rewards/VisualizationJSONCombinedORM/std": 0.0710834488272667, "step": 4499, "train_speed(iter/s)": 0.134775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 301.0, "completions/min_length": 238.0, "epoch": 3.7220843672456576, "grad_norm": 0.2291308492422104, "kl": 0.040283203125, "learning_rate": 1.8616331065694193e-06, "loss": 0.00040408968925476074, "memory(GiB)": 38.1, "reward": 0.7399416565895081, "reward_std": 0.058899253606796265, "rewards/VisualizationJSONCombinedORM/mean": 0.7399416565895081, "rewards/VisualizationJSONCombinedORM/std": 0.12079842388629913, "step": 4500, "train_speed(iter/s)": 0.13471 }, { "epoch": 3.7220843672456576, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 371.5, "eval_completions/mean_length": 310.46875, "eval_completions/min_length": 263.0833333333333, "eval_kl": 0.12223307291666667, "eval_loss": 0.0012299692025408149, "eval_reward": 0.4422303394724925, "eval_reward_std": 0.05328756840511536, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4422303394724925, "eval_rewards/VisualizationJSONCombinedORM/std": 0.053287567861843854, "eval_runtime": 315.6227, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 4500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 302.375, "completions/min_length": 255.0, "epoch": 3.7229114971050454, "grad_norm": 0.18296323716640472, "kl": 0.05780029296875, "learning_rate": 1.8593857835179557e-06, "loss": 0.0005780011415481567, "memory(GiB)": 38.1, "reward": 0.6592392921447754, "reward_std": 0.07372245192527771, "rewards/VisualizationJSONCombinedORM/mean": 0.6592392921447754, "rewards/VisualizationJSONCombinedORM/std": 0.14533473551273346, "step": 4501, "train_speed(iter/s)": 0.133373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 285.375, "completions/min_length": 218.0, "epoch": 3.7237386269644333, "grad_norm": 0.20365822315216064, "kl": 0.0733642578125, "learning_rate": 1.8571395078760362e-06, "loss": 0.000733107328414917, "memory(GiB)": 38.1, "reward": 0.4195273220539093, "reward_std": 0.06726425886154175, "rewards/VisualizationJSONCombinedORM/mean": 0.4195273220539093, "rewards/VisualizationJSONCombinedORM/std": 0.2731260657310486, "step": 4502, "train_speed(iter/s)": 0.133291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 309.875, "completions/min_length": 237.0, "epoch": 3.7245657568238215, "grad_norm": 0.25046032667160034, "kl": 0.04315185546875, "learning_rate": 1.8548942803927988e-06, "loss": 0.0004323795437812805, "memory(GiB)": 38.1, "reward": 0.43028926849365234, "reward_std": 0.054475851356983185, "rewards/VisualizationJSONCombinedORM/mean": 0.43028926849365234, "rewards/VisualizationJSONCombinedORM/std": 0.06739174574613571, "step": 4503, "train_speed(iter/s)": 0.133222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 311.5, "completions/min_length": 247.0, "epoch": 3.7253928866832093, "grad_norm": 0.2504066824913025, "kl": 0.07562255859375, "learning_rate": 1.852650101817045e-06, "loss": 0.00075526162981987, "memory(GiB)": 38.1, "reward": 0.45823174715042114, "reward_std": 0.07234829664230347, "rewards/VisualizationJSONCombinedORM/mean": 0.45823174715042114, "rewards/VisualizationJSONCombinedORM/std": 0.17831847071647644, "step": 4504, "train_speed(iter/s)": 0.133146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 292.6875, "completions/min_length": 226.0, "epoch": 3.726220016542597, "grad_norm": 0.22095081210136414, "kl": 0.284912109375, "learning_rate": 1.8504069728972124e-06, "loss": 0.002850787015631795, "memory(GiB)": 38.1, "reward": 0.6332910656929016, "reward_std": 0.014718993566930294, "rewards/VisualizationJSONCombinedORM/mean": 0.6332910656929016, "rewards/VisualizationJSONCombinedORM/std": 0.1306740641593933, "step": 4505, "train_speed(iter/s)": 0.133067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 333.75, "completions/min_length": 273.0, "epoch": 3.727047146401985, "grad_norm": 0.1912071257829666, "kl": 0.0838623046875, "learning_rate": 1.8481648943813978e-06, "loss": 0.000837225466966629, "memory(GiB)": 38.1, "reward": 0.43407464027404785, "reward_std": 0.03478595241904259, "rewards/VisualizationJSONCombinedORM/mean": 0.43407464027404785, "rewards/VisualizationJSONCombinedORM/std": 0.1274765133857727, "step": 4506, "train_speed(iter/s)": 0.132981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 303.1875, "completions/min_length": 239.0, "epoch": 3.727874276261373, "grad_norm": 0.21336577832698822, "kl": 0.0303955078125, "learning_rate": 1.8459238670173452e-06, "loss": 0.00030332058668136597, "memory(GiB)": 38.1, "reward": 0.6923770308494568, "reward_std": 0.06357627362012863, "rewards/VisualizationJSONCombinedORM/mean": 0.6923770308494568, "rewards/VisualizationJSONCombinedORM/std": 0.07267250120639801, "step": 4507, "train_speed(iter/s)": 0.132896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 302.625, "completions/min_length": 238.0, "epoch": 3.728701406120761, "grad_norm": 0.2561436593532562, "kl": 0.0577392578125, "learning_rate": 1.843683891552448e-06, "loss": 0.0005769655108451843, "memory(GiB)": 38.1, "reward": 0.6570179462432861, "reward_std": 0.07770240306854248, "rewards/VisualizationJSONCombinedORM/mean": 0.6570179462432861, "rewards/VisualizationJSONCombinedORM/std": 0.17727629840373993, "step": 4508, "train_speed(iter/s)": 0.132822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 290.3125, "completions/min_length": 252.0, "epoch": 3.729528535980149, "grad_norm": 0.18546491861343384, "kl": 0.06304931640625, "learning_rate": 1.8414449687337467e-06, "loss": 0.0006313920021057129, "memory(GiB)": 38.1, "reward": 0.6133675575256348, "reward_std": 0.07069817185401917, "rewards/VisualizationJSONCombinedORM/mean": 0.6133675575256348, "rewards/VisualizationJSONCombinedORM/std": 0.1611354947090149, "step": 4509, "train_speed(iter/s)": 0.132758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 330.5, "completions/min_length": 262.0, "epoch": 3.7303556658395367, "grad_norm": 0.1860310286283493, "kl": 0.04364013671875, "learning_rate": 1.8392070993079326e-06, "loss": 0.0004361346364021301, "memory(GiB)": 38.1, "reward": 0.5310320258140564, "reward_std": 0.05474768579006195, "rewards/VisualizationJSONCombinedORM/mean": 0.5310320258140564, "rewards/VisualizationJSONCombinedORM/std": 0.13013780117034912, "step": 4510, "train_speed(iter/s)": 0.132677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 321.25, "completions/min_length": 248.0, "epoch": 3.731182795698925, "grad_norm": 0.19479116797447205, "kl": 0.1239013671875, "learning_rate": 1.8369702840213466e-06, "loss": 0.0012396499514579773, "memory(GiB)": 38.1, "reward": 0.5780817270278931, "reward_std": 0.08495646715164185, "rewards/VisualizationJSONCombinedORM/mean": 0.5780817270278931, "rewards/VisualizationJSONCombinedORM/std": 0.08400774747133255, "step": 4511, "train_speed(iter/s)": 0.132605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 292.125, "completions/min_length": 228.0, "epoch": 3.732009925558313, "grad_norm": 0.2236987054347992, "kl": 0.1375732421875, "learning_rate": 1.8347345236199787e-06, "loss": 0.0013783127069473267, "memory(GiB)": 38.1, "reward": 0.6016219854354858, "reward_std": 0.08322060108184814, "rewards/VisualizationJSONCombinedORM/mean": 0.6016219854354858, "rewards/VisualizationJSONCombinedORM/std": 0.2355429083108902, "step": 4512, "train_speed(iter/s)": 0.13254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 288.3125, "completions/min_length": 226.0, "epoch": 3.7328370554177006, "grad_norm": 0.5148903131484985, "kl": 0.69189453125, "learning_rate": 1.8324998188494608e-06, "loss": 0.006937641650438309, "memory(GiB)": 38.1, "reward": 0.6560748219490051, "reward_std": 0.0774988979101181, "rewards/VisualizationJSONCombinedORM/mean": 0.6560748219490051, "rewards/VisualizationJSONCombinedORM/std": 0.14009127020835876, "step": 4513, "train_speed(iter/s)": 0.132479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 322.5625, "completions/min_length": 248.0, "epoch": 3.7336641852770884, "grad_norm": 0.22374649345874786, "kl": 0.09991455078125, "learning_rate": 1.8302661704550828e-06, "loss": 0.001000676304101944, "memory(GiB)": 38.1, "reward": 0.5032941102981567, "reward_std": 0.07247567176818848, "rewards/VisualizationJSONCombinedORM/mean": 0.5032941102981567, "rewards/VisualizationJSONCombinedORM/std": 0.07833276689052582, "step": 4514, "train_speed(iter/s)": 0.13241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 310.75, "completions/min_length": 226.0, "epoch": 3.7344913151364763, "grad_norm": 0.12025461345911026, "kl": 0.0460205078125, "learning_rate": 1.8280335791817733e-06, "loss": 0.0004595149657689035, "memory(GiB)": 38.1, "reward": 0.31884676218032837, "reward_std": 0.017558936029672623, "rewards/VisualizationJSONCombinedORM/mean": 0.31884676218032837, "rewards/VisualizationJSONCombinedORM/std": 0.13222472369670868, "step": 4515, "train_speed(iter/s)": 0.13235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 337.875, "completions/min_length": 230.0, "epoch": 3.7353184449958645, "grad_norm": 0.1878761500120163, "kl": 0.04656982421875, "learning_rate": 1.8258020457741132e-06, "loss": 0.000466175377368927, "memory(GiB)": 38.1, "reward": 0.6890987157821655, "reward_std": 0.06463837623596191, "rewards/VisualizationJSONCombinedORM/mean": 0.6890987157821655, "rewards/VisualizationJSONCombinedORM/std": 0.06417384743690491, "step": 4516, "train_speed(iter/s)": 0.132271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 281.1875, "completions/min_length": 245.0, "epoch": 3.7361455748552523, "grad_norm": 0.2360237091779709, "kl": 0.08660888671875, "learning_rate": 1.8235715709763285e-06, "loss": 0.0008655153214931488, "memory(GiB)": 38.1, "reward": 0.42199501395225525, "reward_std": 0.03939523920416832, "rewards/VisualizationJSONCombinedORM/mean": 0.42199501395225525, "rewards/VisualizationJSONCombinedORM/std": 0.2242378294467926, "step": 4517, "train_speed(iter/s)": 0.132203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 324.5, "completions/min_length": 241.0, "epoch": 3.73697270471464, "grad_norm": 0.17378926277160645, "kl": 0.123046875, "learning_rate": 1.821342155532294e-06, "loss": 0.0012271665036678314, "memory(GiB)": 38.1, "reward": 0.24284744262695312, "reward_std": 0.01655951887369156, "rewards/VisualizationJSONCombinedORM/mean": 0.24284744262695312, "rewards/VisualizationJSONCombinedORM/std": 0.018858201801776886, "step": 4518, "train_speed(iter/s)": 0.132129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 261.9375, "completions/min_length": 225.0, "epoch": 3.737799834574028, "grad_norm": 0.2262648195028305, "kl": 0.05181884765625, "learning_rate": 1.819113800185532e-06, "loss": 0.0005180295556783676, "memory(GiB)": 38.1, "reward": 0.40412914752960205, "reward_std": 0.047948017716407776, "rewards/VisualizationJSONCombinedORM/mean": 0.40412914752960205, "rewards/VisualizationJSONCombinedORM/std": 0.1032034382224083, "step": 4519, "train_speed(iter/s)": 0.132068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 287.875, "completions/min_length": 230.0, "epoch": 3.738626964433416, "grad_norm": 0.2162555456161499, "kl": 0.05450439453125, "learning_rate": 1.8168865056792029e-06, "loss": 0.0005455613136291504, "memory(GiB)": 38.1, "reward": 0.7145569920539856, "reward_std": 0.0726546049118042, "rewards/VisualizationJSONCombinedORM/mean": 0.7145569920539856, "rewards/VisualizationJSONCombinedORM/std": 0.07129557430744171, "step": 4520, "train_speed(iter/s)": 0.132013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 318.375, "completions/min_length": 277.0, "epoch": 3.739454094292804, "grad_norm": 0.20064358413219452, "kl": 0.03948974609375, "learning_rate": 1.8146602727561286e-06, "loss": 0.00039469823241233826, "memory(GiB)": 38.1, "reward": 0.37535911798477173, "reward_std": 0.04638257250189781, "rewards/VisualizationJSONCombinedORM/mean": 0.37535911798477173, "rewards/VisualizationJSONCombinedORM/std": 0.05123450234532356, "step": 4521, "train_speed(iter/s)": 0.13194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 304.3125, "completions/min_length": 247.0, "epoch": 3.740281224152192, "grad_norm": 0.21695829927921295, "kl": 0.04559326171875, "learning_rate": 1.8124351021587616e-06, "loss": 0.0004556328058242798, "memory(GiB)": 38.1, "reward": 0.49084848165512085, "reward_std": 0.08685596287250519, "rewards/VisualizationJSONCombinedORM/mean": 0.49084848165512085, "rewards/VisualizationJSONCombinedORM/std": 0.10032728314399719, "step": 4522, "train_speed(iter/s)": 0.131862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 317.0, "completions/min_length": 254.0, "epoch": 3.7411083540115797, "grad_norm": 0.1813019961118698, "kl": 0.05938720703125, "learning_rate": 1.810210994629209e-06, "loss": 0.0005933567881584167, "memory(GiB)": 38.1, "reward": 0.5129302740097046, "reward_std": 0.033887676894664764, "rewards/VisualizationJSONCombinedORM/mean": 0.5129302740097046, "rewards/VisualizationJSONCombinedORM/std": 0.29431647062301636, "step": 4523, "train_speed(iter/s)": 0.131785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 284.1875, "completions/min_length": 242.0, "epoch": 3.741935483870968, "grad_norm": 0.18409784138202667, "kl": 0.04473876953125, "learning_rate": 1.8079879509092208e-06, "loss": 0.0004474520683288574, "memory(GiB)": 38.1, "reward": 0.7030383348464966, "reward_std": 0.06428775936365128, "rewards/VisualizationJSONCombinedORM/mean": 0.7030383348464966, "rewards/VisualizationJSONCombinedORM/std": 0.07172953337430954, "step": 4524, "train_speed(iter/s)": 0.131716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 295.1875, "completions/min_length": 228.0, "epoch": 3.742762613730356, "grad_norm": 0.2070680856704712, "kl": 0.18487548828125, "learning_rate": 1.8057659717401948e-06, "loss": 0.001855216920375824, "memory(GiB)": 38.1, "reward": 0.5001773238182068, "reward_std": 0.0700036808848381, "rewards/VisualizationJSONCombinedORM/mean": 0.5001773238182068, "rewards/VisualizationJSONCombinedORM/std": 0.13787543773651123, "step": 4525, "train_speed(iter/s)": 0.131666 }, { "epoch": 3.742762613730356, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 364.0, "eval_completions/mean_length": 306.4895833333333, "eval_completions/min_length": 255.0, "eval_kl": 0.07364908854166667, "eval_loss": 0.0007391807739622891, "eval_reward": 0.4493069698413213, "eval_reward_std": 0.05560139217413962, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4493069698413213, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05560139620987078, "eval_runtime": 310.2408, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 4525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 315.0, "completions/min_length": 241.0, "epoch": 3.7435897435897436, "grad_norm": 0.18743275105953217, "kl": 0.070556640625, "learning_rate": 1.8035450578631652e-06, "loss": 0.0007063895463943481, "memory(GiB)": 38.1, "reward": 0.752438485622406, "reward_std": 0.03540381044149399, "rewards/VisualizationJSONCombinedORM/mean": 0.752438485622406, "rewards/VisualizationJSONCombinedORM/std": 0.03520060330629349, "step": 4526, "train_speed(iter/s)": 0.130422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 275.5625, "completions/min_length": 230.0, "epoch": 3.7444168734491314, "grad_norm": 0.2529289126396179, "kl": 0.1195068359375, "learning_rate": 1.8013252100188255e-06, "loss": 0.0011961720883846283, "memory(GiB)": 38.1, "reward": 0.47742921113967896, "reward_std": 0.09586267173290253, "rewards/VisualizationJSONCombinedORM/mean": 0.47742921113967896, "rewards/VisualizationJSONCombinedORM/std": 0.13699406385421753, "step": 4527, "train_speed(iter/s)": 0.130362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 316.5625, "completions/min_length": 244.0, "epoch": 3.7452440033085193, "grad_norm": 0.18125751614570618, "kl": 0.037322998046875, "learning_rate": 1.7991064289474992e-06, "loss": 0.0003735944628715515, "memory(GiB)": 38.1, "reward": 0.6114107370376587, "reward_std": 0.049681320786476135, "rewards/VisualizationJSONCombinedORM/mean": 0.6114107370376587, "rewards/VisualizationJSONCombinedORM/std": 0.12751612067222595, "step": 4528, "train_speed(iter/s)": 0.130272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 285.1875, "completions/min_length": 218.0, "epoch": 3.7460711331679075, "grad_norm": 0.2421983778476715, "kl": 0.0931396484375, "learning_rate": 1.7968887153891622e-06, "loss": 0.000930100679397583, "memory(GiB)": 38.1, "reward": 0.4904102087020874, "reward_std": 0.067512646317482, "rewards/VisualizationJSONCombinedORM/mean": 0.4904102087020874, "rewards/VisualizationJSONCombinedORM/std": 0.20516929030418396, "step": 4529, "train_speed(iter/s)": 0.130191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 313.9375, "completions/min_length": 236.0, "epoch": 3.7468982630272953, "grad_norm": 0.20564116537570953, "kl": 0.05218505859375, "learning_rate": 1.7946720700834324e-06, "loss": 0.0005224719643592834, "memory(GiB)": 38.1, "reward": 0.6826577186584473, "reward_std": 0.045931290835142136, "rewards/VisualizationJSONCombinedORM/mean": 0.6826577186584473, "rewards/VisualizationJSONCombinedORM/std": 0.16778923571109772, "step": 4530, "train_speed(iter/s)": 0.130139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 309.125, "completions/min_length": 229.0, "epoch": 3.747725392886683, "grad_norm": 0.19723418354988098, "kl": 0.04425048828125, "learning_rate": 1.7924564937695727e-06, "loss": 0.00044299662113189697, "memory(GiB)": 38.1, "reward": 0.46763747930526733, "reward_std": 0.020472772419452667, "rewards/VisualizationJSONCombinedORM/mean": 0.46763747930526733, "rewards/VisualizationJSONCombinedORM/std": 0.06585924327373505, "step": 4531, "train_speed(iter/s)": 0.130079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 289.5625, "completions/min_length": 231.0, "epoch": 3.748552522746071, "grad_norm": 0.1887672245502472, "kl": 0.13427734375, "learning_rate": 1.790241987186485e-06, "loss": 0.0013405755162239075, "memory(GiB)": 38.1, "reward": 0.39556536078453064, "reward_std": 0.050842806696891785, "rewards/VisualizationJSONCombinedORM/mean": 0.39556536078453064, "rewards/VisualizationJSONCombinedORM/std": 0.0645560696721077, "step": 4532, "train_speed(iter/s)": 0.130023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 288.625, "completions/min_length": 224.0, "epoch": 3.749379652605459, "grad_norm": 0.2645881772041321, "kl": 0.10009765625, "learning_rate": 1.7880285510727197e-06, "loss": 0.0009993240237236023, "memory(GiB)": 38.1, "reward": 0.2987893223762512, "reward_std": 0.03927520662546158, "rewards/VisualizationJSONCombinedORM/mean": 0.2987893223762512, "rewards/VisualizationJSONCombinedORM/std": 0.13003404438495636, "step": 4533, "train_speed(iter/s)": 0.12995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 301.1875, "completions/min_length": 226.0, "epoch": 3.750206782464847, "grad_norm": 0.18725815415382385, "kl": 0.0875244140625, "learning_rate": 1.7858161861664674e-06, "loss": 0.0008756928145885468, "memory(GiB)": 38.1, "reward": 0.5332354307174683, "reward_std": 0.050929054617881775, "rewards/VisualizationJSONCombinedORM/mean": 0.5332354307174683, "rewards/VisualizationJSONCombinedORM/std": 0.05331858620047569, "step": 4534, "train_speed(iter/s)": 0.12988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 296.3125, "completions/min_length": 218.0, "epoch": 3.751033912324235, "grad_norm": 0.18884994089603424, "kl": 0.0797119140625, "learning_rate": 1.7836048932055643e-06, "loss": 0.0007950030267238617, "memory(GiB)": 38.1, "reward": 0.4140201807022095, "reward_std": 0.03162183240056038, "rewards/VisualizationJSONCombinedORM/mean": 0.4140201807022095, "rewards/VisualizationJSONCombinedORM/std": 0.03241322934627533, "step": 4535, "train_speed(iter/s)": 0.129802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 316.25, "completions/min_length": 265.0, "epoch": 3.7518610421836227, "grad_norm": 0.19056759774684906, "kl": 0.080810546875, "learning_rate": 1.7813946729274822e-06, "loss": 0.0008069220930337906, "memory(GiB)": 38.1, "reward": 0.5228817462921143, "reward_std": 0.07765805721282959, "rewards/VisualizationJSONCombinedORM/mean": 0.5228817462921143, "rewards/VisualizationJSONCombinedORM/std": 0.081417515873909, "step": 4536, "train_speed(iter/s)": 0.129725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 315.0625, "completions/min_length": 246.0, "epoch": 3.752688172043011, "grad_norm": 0.3010864555835724, "kl": 0.07244873046875, "learning_rate": 1.7791855260693458e-06, "loss": 0.0007224045693874359, "memory(GiB)": 38.1, "reward": 0.36752793192863464, "reward_std": 0.03427344560623169, "rewards/VisualizationJSONCombinedORM/mean": 0.36752793192863464, "rewards/VisualizationJSONCombinedORM/std": 0.11833145469427109, "step": 4537, "train_speed(iter/s)": 0.129641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 310.6875, "completions/min_length": 231.0, "epoch": 3.753515301902399, "grad_norm": 0.1848812848329544, "kl": 0.0556640625, "learning_rate": 1.7769774533679112e-06, "loss": 0.0005559325218200684, "memory(GiB)": 38.1, "reward": 0.6784825325012207, "reward_std": 0.09572307765483856, "rewards/VisualizationJSONCombinedORM/mean": 0.6784825325012207, "rewards/VisualizationJSONCombinedORM/std": 0.10613194108009338, "step": 4538, "train_speed(iter/s)": 0.129553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 319.3125, "completions/min_length": 238.0, "epoch": 3.7543424317617866, "grad_norm": 0.20782285928726196, "kl": 0.0606689453125, "learning_rate": 1.774770455559583e-06, "loss": 0.0006085783243179321, "memory(GiB)": 38.1, "reward": 0.664040207862854, "reward_std": 0.054675593972206116, "rewards/VisualizationJSONCombinedORM/mean": 0.664040207862854, "rewards/VisualizationJSONCombinedORM/std": 0.08039633184671402, "step": 4539, "train_speed(iter/s)": 0.129481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 316.5, "completions/min_length": 259.0, "epoch": 3.7551695616211744, "grad_norm": 0.18496915698051453, "kl": 0.0728759765625, "learning_rate": 1.7725645333804054e-06, "loss": 0.0007288865745067596, "memory(GiB)": 38.1, "reward": 0.5445343852043152, "reward_std": 0.06304699182510376, "rewards/VisualizationJSONCombinedORM/mean": 0.5445343852043152, "rewards/VisualizationJSONCombinedORM/std": 0.08709783852100372, "step": 4540, "train_speed(iter/s)": 0.129409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 324.375, "completions/min_length": 265.0, "epoch": 3.7559966914805623, "grad_norm": 0.1983175128698349, "kl": 0.08856201171875, "learning_rate": 1.7703596875660645e-06, "loss": 0.0008825547993183136, "memory(GiB)": 38.1, "reward": 0.6929219961166382, "reward_std": 0.06683224439620972, "rewards/VisualizationJSONCombinedORM/mean": 0.6929219961166382, "rewards/VisualizationJSONCombinedORM/std": 0.11560198664665222, "step": 4541, "train_speed(iter/s)": 0.129335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 306.4375, "completions/min_length": 249.0, "epoch": 3.7568238213399505, "grad_norm": 0.18412049114704132, "kl": 0.0587158203125, "learning_rate": 1.7681559188518827e-06, "loss": 0.0005860887467861176, "memory(GiB)": 38.1, "reward": 0.4692745506763458, "reward_std": 0.07303668558597565, "rewards/VisualizationJSONCombinedORM/mean": 0.4692745506763458, "rewards/VisualizationJSONCombinedORM/std": 0.08965487778186798, "step": 4542, "train_speed(iter/s)": 0.12927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 308.875, "completions/min_length": 230.0, "epoch": 3.7576509511993383, "grad_norm": 0.1624225378036499, "kl": 0.0455322265625, "learning_rate": 1.7659532279728336e-06, "loss": 0.0004552081227302551, "memory(GiB)": 38.1, "reward": 0.4787401854991913, "reward_std": 0.021708471700549126, "rewards/VisualizationJSONCombinedORM/mean": 0.4787401854991913, "rewards/VisualizationJSONCombinedORM/std": 0.05766611546278, "step": 4543, "train_speed(iter/s)": 0.129199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 316.625, "completions/min_length": 277.0, "epoch": 3.758478081058726, "grad_norm": 0.1957409679889679, "kl": 0.05126953125, "learning_rate": 1.7637516156635193e-06, "loss": 0.0005128495395183563, "memory(GiB)": 38.1, "reward": 0.36089563369750977, "reward_std": 0.04569630324840546, "rewards/VisualizationJSONCombinedORM/mean": 0.36089563369750977, "rewards/VisualizationJSONCombinedORM/std": 0.04543503746390343, "step": 4544, "train_speed(iter/s)": 0.129137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 306.4375, "completions/min_length": 245.0, "epoch": 3.759305210918114, "grad_norm": 0.22611092031002045, "kl": 0.228515625, "learning_rate": 1.7615510826581906e-06, "loss": 0.0022860318422317505, "memory(GiB)": 38.1, "reward": 0.5761091113090515, "reward_std": 0.05406472831964493, "rewards/VisualizationJSONCombinedORM/mean": 0.5761091113090515, "rewards/VisualizationJSONCombinedORM/std": 0.18205255270004272, "step": 4545, "train_speed(iter/s)": 0.129058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 286.375, "completions/min_length": 201.0, "epoch": 3.760132340777502, "grad_norm": 0.3439127504825592, "kl": 0.1195068359375, "learning_rate": 1.7593516296907342e-06, "loss": 0.0011947304010391235, "memory(GiB)": 38.1, "reward": 0.40390193462371826, "reward_std": 0.05874204635620117, "rewards/VisualizationJSONCombinedORM/mean": 0.40390193462371826, "rewards/VisualizationJSONCombinedORM/std": 0.08254217356443405, "step": 4546, "train_speed(iter/s)": 0.128994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 318.6875, "completions/min_length": 278.0, "epoch": 3.76095947063689, "grad_norm": 0.192379429936409, "kl": 0.0875244140625, "learning_rate": 1.7571532574946808e-06, "loss": 0.0008754990994930267, "memory(GiB)": 38.1, "reward": 0.46260714530944824, "reward_std": 0.01753142848610878, "rewards/VisualizationJSONCombinedORM/mean": 0.46260714530944824, "rewards/VisualizationJSONCombinedORM/std": 0.018203891813755035, "step": 4547, "train_speed(iter/s)": 0.128921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 324.9375, "completions/min_length": 255.0, "epoch": 3.761786600496278, "grad_norm": 0.16559147834777832, "kl": 0.04425048828125, "learning_rate": 1.754955966803194e-06, "loss": 0.00044043734669685364, "memory(GiB)": 38.1, "reward": 0.5568482875823975, "reward_std": 0.025283459573984146, "rewards/VisualizationJSONCombinedORM/mean": 0.5568482875823975, "rewards/VisualizationJSONCombinedORM/std": 0.2838621437549591, "step": 4548, "train_speed(iter/s)": 0.128844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 318.1875, "completions/min_length": 226.0, "epoch": 3.7626137303556657, "grad_norm": 0.1911117136478424, "kl": 0.0863037109375, "learning_rate": 1.7527597583490825e-06, "loss": 0.0008627958595752716, "memory(GiB)": 38.1, "reward": 0.5104142427444458, "reward_std": 0.0794898271560669, "rewards/VisualizationJSONCombinedORM/mean": 0.5104142427444458, "rewards/VisualizationJSONCombinedORM/std": 0.1219857782125473, "step": 4549, "train_speed(iter/s)": 0.128761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 332.5, "completions/min_length": 254.0, "epoch": 3.763440860215054, "grad_norm": 0.1933245062828064, "kl": 0.08587646484375, "learning_rate": 1.7505646328647913e-06, "loss": 0.0008587464690208435, "memory(GiB)": 38.1, "reward": 0.5337837934494019, "reward_std": 0.04688074067234993, "rewards/VisualizationJSONCombinedORM/mean": 0.5337837934494019, "rewards/VisualizationJSONCombinedORM/std": 0.19780419766902924, "step": 4550, "train_speed(iter/s)": 0.128693 }, { "epoch": 3.763440860215054, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 378.3333333333333, "eval_completions/mean_length": 312.3020833333333, "eval_completions/min_length": 257.8333333333333, "eval_kl": 0.07621256510416667, "eval_loss": 0.0007710705394856632, "eval_reward": 0.4544585843880971, "eval_reward_std": 0.047068969307777785, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4544585843880971, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04706897051073611, "eval_runtime": 319.795, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 4550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 316.5, "completions/min_length": 237.0, "epoch": 3.764267990074442, "grad_norm": 0.18784907460212708, "kl": 0.06427001953125, "learning_rate": 1.7483705910824072e-06, "loss": 0.0006426428444683552, "memory(GiB)": 38.1, "reward": 0.4613325595855713, "reward_std": 0.046585727483034134, "rewards/VisualizationJSONCombinedORM/mean": 0.4613325595855713, "rewards/VisualizationJSONCombinedORM/std": 0.09835246205329895, "step": 4551, "train_speed(iter/s)": 0.127475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 309.4375, "completions/min_length": 251.0, "epoch": 3.7650951199338296, "grad_norm": 0.19498218595981598, "kl": 0.0404052734375, "learning_rate": 1.7461776337336489e-06, "loss": 0.00040386617183685303, "memory(GiB)": 38.1, "reward": 0.6943421363830566, "reward_std": 0.05712573230266571, "rewards/VisualizationJSONCombinedORM/mean": 0.6943421363830566, "rewards/VisualizationJSONCombinedORM/std": 0.2325981706380844, "step": 4552, "train_speed(iter/s)": 0.12741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 318.1875, "completions/min_length": 213.0, "epoch": 3.7659222497932174, "grad_norm": 0.19540958106517792, "kl": 0.03704833984375, "learning_rate": 1.743985761549884e-06, "loss": 0.000371001660823822, "memory(GiB)": 38.1, "reward": 0.4865647256374359, "reward_std": 0.03857922554016113, "rewards/VisualizationJSONCombinedORM/mean": 0.4865647256374359, "rewards/VisualizationJSONCombinedORM/std": 0.03861221298575401, "step": 4553, "train_speed(iter/s)": 0.127341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 303.5625, "completions/min_length": 227.0, "epoch": 3.7667493796526053, "grad_norm": 0.2192549854516983, "kl": 0.0458984375, "learning_rate": 1.7417949752621066e-06, "loss": 0.00045993924140930176, "memory(GiB)": 38.1, "reward": 0.5664600133895874, "reward_std": 0.04341389238834381, "rewards/VisualizationJSONCombinedORM/mean": 0.5664600133895874, "rewards/VisualizationJSONCombinedORM/std": 0.14082342386245728, "step": 4554, "train_speed(iter/s)": 0.127279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 346.3125, "completions/min_length": 270.0, "epoch": 3.7675765095119935, "grad_norm": 0.1674693524837494, "kl": 0.0736083984375, "learning_rate": 1.7396052756009574e-06, "loss": 0.0007357755675911903, "memory(GiB)": 38.1, "reward": 0.4691101908683777, "reward_std": 0.04494474455714226, "rewards/VisualizationJSONCombinedORM/mean": 0.4691101908683777, "rewards/VisualizationJSONCombinedORM/std": 0.191811665892601, "step": 4555, "train_speed(iter/s)": 0.127215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 305.875, "completions/min_length": 250.0, "epoch": 3.7684036393713813, "grad_norm": 0.17732784152030945, "kl": 0.0518798828125, "learning_rate": 1.7374166632967104e-06, "loss": 0.0005188286304473877, "memory(GiB)": 38.1, "reward": 0.2746398448944092, "reward_std": 0.024638913571834564, "rewards/VisualizationJSONCombinedORM/mean": 0.2746398448944092, "rewards/VisualizationJSONCombinedORM/std": 0.15885719656944275, "step": 4556, "train_speed(iter/s)": 0.127148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 308.6875, "completions/min_length": 259.0, "epoch": 3.769230769230769, "grad_norm": 0.23582179844379425, "kl": 0.05181884765625, "learning_rate": 1.7352291390792798e-06, "loss": 0.0005182866007089615, "memory(GiB)": 38.1, "reward": 0.48509299755096436, "reward_std": 0.06004295498132706, "rewards/VisualizationJSONCombinedORM/mean": 0.48509299755096436, "rewards/VisualizationJSONCombinedORM/std": 0.1182539090514183, "step": 4557, "train_speed(iter/s)": 0.12708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 319.9375, "completions/min_length": 240.0, "epoch": 3.7700578990901574, "grad_norm": 0.16955982148647308, "kl": 0.05145263671875, "learning_rate": 1.73304270367821e-06, "loss": 0.0005140155553817749, "memory(GiB)": 38.1, "reward": 0.39867183566093445, "reward_std": 0.03388320654630661, "rewards/VisualizationJSONCombinedORM/mean": 0.39867183566093445, "rewards/VisualizationJSONCombinedORM/std": 0.07718317210674286, "step": 4558, "train_speed(iter/s)": 0.127032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 323.0, "completions/min_length": 260.0, "epoch": 3.770885028949545, "grad_norm": 0.21871517598628998, "kl": 0.07806396484375, "learning_rate": 1.7308573578226945e-06, "loss": 0.0007794424891471863, "memory(GiB)": 38.1, "reward": 0.5280898809432983, "reward_std": 0.046803221106529236, "rewards/VisualizationJSONCombinedORM/mean": 0.5280898809432983, "rewards/VisualizationJSONCombinedORM/std": 0.06114087998867035, "step": 4559, "train_speed(iter/s)": 0.126972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 318.5625, "completions/min_length": 251.0, "epoch": 3.771712158808933, "grad_norm": 0.1592731475830078, "kl": 0.0325927734375, "learning_rate": 1.7286731022415515e-06, "loss": 0.00032573379576206207, "memory(GiB)": 38.1, "reward": 0.7984867095947266, "reward_std": 0.03386520594358444, "rewards/VisualizationJSONCombinedORM/mean": 0.7984867095947266, "rewards/VisualizationJSONCombinedORM/std": 0.10217590630054474, "step": 4560, "train_speed(iter/s)": 0.12691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 317.875, "completions/min_length": 248.0, "epoch": 3.772539288668321, "grad_norm": 0.21667130291461945, "kl": 0.1077880859375, "learning_rate": 1.7264899376632415e-06, "loss": 0.0010792817920446396, "memory(GiB)": 38.1, "reward": 0.5922651290893555, "reward_std": 0.07815530896186829, "rewards/VisualizationJSONCombinedORM/mean": 0.5922651290893555, "rewards/VisualizationJSONCombinedORM/std": 0.196186825633049, "step": 4561, "train_speed(iter/s)": 0.12685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 314.625, "completions/min_length": 243.0, "epoch": 3.7733664185277087, "grad_norm": 0.20157378911972046, "kl": 0.05181884765625, "learning_rate": 1.7243078648158612e-06, "loss": 0.0005178377032279968, "memory(GiB)": 38.1, "reward": 0.5791058540344238, "reward_std": 0.050226692110300064, "rewards/VisualizationJSONCombinedORM/mean": 0.5791058540344238, "rewards/VisualizationJSONCombinedORM/std": 0.18084736168384552, "step": 4562, "train_speed(iter/s)": 0.1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 324.1875, "completions/min_length": 243.0, "epoch": 3.774193548387097, "grad_norm": 0.1972091794013977, "kl": 0.07720947265625, "learning_rate": 1.7221268844271427e-06, "loss": 0.0007723793387413025, "memory(GiB)": 38.1, "reward": 0.3698815405368805, "reward_std": 0.06704644113779068, "rewards/VisualizationJSONCombinedORM/mean": 0.3698815405368805, "rewards/VisualizationJSONCombinedORM/std": 0.18734344840049744, "step": 4563, "train_speed(iter/s)": 0.126733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 303.5625, "completions/min_length": 239.0, "epoch": 3.775020678246485, "grad_norm": 0.23509393632411957, "kl": 0.05780029296875, "learning_rate": 1.7199469972244497e-06, "loss": 0.0005790144205093384, "memory(GiB)": 38.1, "reward": 0.37904348969459534, "reward_std": 0.04581733047962189, "rewards/VisualizationJSONCombinedORM/mean": 0.37904348969459534, "rewards/VisualizationJSONCombinedORM/std": 0.06997395306825638, "step": 4564, "train_speed(iter/s)": 0.126677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 278.125, "completions/min_length": 232.0, "epoch": 3.7758478081058726, "grad_norm": 0.18837736546993256, "kl": 0.08050537109375, "learning_rate": 1.7177682039347875e-06, "loss": 0.0008050501346588135, "memory(GiB)": 38.1, "reward": 0.581818699836731, "reward_std": 0.054340910166502, "rewards/VisualizationJSONCombinedORM/mean": 0.581818699836731, "rewards/VisualizationJSONCombinedORM/std": 0.0692133828997612, "step": 4565, "train_speed(iter/s)": 0.126612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 321.0, "completions/min_length": 236.0, "epoch": 3.7766749379652604, "grad_norm": 0.20671600103378296, "kl": 0.0732421875, "learning_rate": 1.7155905052847938e-06, "loss": 0.00073271244764328, "memory(GiB)": 38.1, "reward": 0.22995606064796448, "reward_std": 0.024546107277274132, "rewards/VisualizationJSONCombinedORM/mean": 0.22995606064796448, "rewards/VisualizationJSONCombinedORM/std": 0.041920922696590424, "step": 4566, "train_speed(iter/s)": 0.126568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 317.0, "completions/min_length": 254.0, "epoch": 3.7775020678246483, "grad_norm": 0.17996744811534882, "kl": 0.037109375, "learning_rate": 1.7134139020007418e-06, "loss": 0.0003707185387611389, "memory(GiB)": 38.1, "reward": 0.8318877816200256, "reward_std": 0.04038621112704277, "rewards/VisualizationJSONCombinedORM/mean": 0.8318877816200256, "rewards/VisualizationJSONCombinedORM/std": 0.06572818756103516, "step": 4567, "train_speed(iter/s)": 0.126502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 320.9375, "completions/min_length": 266.0, "epoch": 3.7783291976840365, "grad_norm": 0.16622738540172577, "kl": 0.073974609375, "learning_rate": 1.7112383948085348e-06, "loss": 0.0007399395108222961, "memory(GiB)": 38.1, "reward": 0.5951639413833618, "reward_std": 0.06003464013338089, "rewards/VisualizationJSONCombinedORM/mean": 0.5951639413833618, "rewards/VisualizationJSONCombinedORM/std": 0.16455674171447754, "step": 4568, "train_speed(iter/s)": 0.126448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 311.25, "completions/min_length": 222.0, "epoch": 3.7791563275434243, "grad_norm": 0.17688047885894775, "kl": 0.04388427734375, "learning_rate": 1.709063984433721e-06, "loss": 0.00043907761573791504, "memory(GiB)": 38.1, "reward": 0.5792768001556396, "reward_std": 0.046828486025333405, "rewards/VisualizationJSONCombinedORM/mean": 0.5792768001556396, "rewards/VisualizationJSONCombinedORM/std": 0.11295387893915176, "step": 4569, "train_speed(iter/s)": 0.126399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 308.625, "completions/min_length": 238.0, "epoch": 3.779983457402812, "grad_norm": 0.21984727680683136, "kl": 0.2099609375, "learning_rate": 1.706890671601471e-06, "loss": 0.002097688615322113, "memory(GiB)": 38.1, "reward": 0.6087913513183594, "reward_std": 0.07102612406015396, "rewards/VisualizationJSONCombinedORM/mean": 0.6087913513183594, "rewards/VisualizationJSONCombinedORM/std": 0.15611301362514496, "step": 4570, "train_speed(iter/s)": 0.12633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 283.5, "completions/min_length": 221.0, "epoch": 3.7808105872622004, "grad_norm": 0.2158874124288559, "kl": 0.147216796875, "learning_rate": 1.7047184570365976e-06, "loss": 0.0014712214469909668, "memory(GiB)": 38.1, "reward": 0.5280627608299255, "reward_std": 0.0778200551867485, "rewards/VisualizationJSONCombinedORM/mean": 0.5280627608299255, "rewards/VisualizationJSONCombinedORM/std": 0.09790672361850739, "step": 4571, "train_speed(iter/s)": 0.12628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 324.625, "completions/min_length": 254.0, "epoch": 3.7816377171215882, "grad_norm": 0.1634640395641327, "kl": 0.132568359375, "learning_rate": 1.7025473414635435e-06, "loss": 0.0013255719095468521, "memory(GiB)": 38.1, "reward": 0.37830281257629395, "reward_std": 0.03986693173646927, "rewards/VisualizationJSONCombinedORM/mean": 0.37830281257629395, "rewards/VisualizationJSONCombinedORM/std": 0.15590518712997437, "step": 4572, "train_speed(iter/s)": 0.126215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 326.875, "completions/min_length": 263.0, "epoch": 3.782464846980976, "grad_norm": 0.26202818751335144, "kl": 0.043701171875, "learning_rate": 1.7003773256063882e-06, "loss": 0.00043753162026405334, "memory(GiB)": 38.1, "reward": 0.5357183218002319, "reward_std": 0.08561857044696808, "rewards/VisualizationJSONCombinedORM/mean": 0.5357183218002319, "rewards/VisualizationJSONCombinedORM/std": 0.16883499920368195, "step": 4573, "train_speed(iter/s)": 0.126153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 308.0, "completions/min_length": 239.0, "epoch": 3.783291976840364, "grad_norm": 0.2549867630004883, "kl": 0.1004638671875, "learning_rate": 1.6982084101888374e-06, "loss": 0.0010051615536212921, "memory(GiB)": 38.1, "reward": 0.4674212634563446, "reward_std": 0.04096323251724243, "rewards/VisualizationJSONCombinedORM/mean": 0.4674212634563446, "rewards/VisualizationJSONCombinedORM/std": 0.17490379512310028, "step": 4574, "train_speed(iter/s)": 0.126082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 300.8125, "completions/min_length": 222.0, "epoch": 3.7841191066997517, "grad_norm": 0.2233581393957138, "kl": 0.0552978515625, "learning_rate": 1.6960405959342402e-06, "loss": 0.0005541294813156128, "memory(GiB)": 38.1, "reward": 0.7470120191574097, "reward_std": 0.05513765290379524, "rewards/VisualizationJSONCombinedORM/mean": 0.7470120191574097, "rewards/VisualizationJSONCombinedORM/std": 0.11069578677415848, "step": 4575, "train_speed(iter/s)": 0.126032 }, { "epoch": 3.7841191066997517, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 385.8333333333333, "eval_completions/mean_length": 315.9166666666667, "eval_completions/min_length": 266.4583333333333, "eval_kl": 0.09904988606770833, "eval_loss": 0.001000946038402617, "eval_reward": 0.46308350438872975, "eval_reward_std": 0.054587379204652585, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46308350438872975, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05458738236726882, "eval_runtime": 324.3341, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.009, "step": 4575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 352.5, "completions/min_length": 265.0, "epoch": 3.78494623655914, "grad_norm": 0.18456517159938812, "kl": 0.033447265625, "learning_rate": 1.6938738835655682e-06, "loss": 0.0003341846168041229, "memory(GiB)": 38.1, "reward": 0.43241557478904724, "reward_std": 0.03554891049861908, "rewards/VisualizationJSONCombinedORM/mean": 0.43241557478904724, "rewards/VisualizationJSONCombinedORM/std": 0.1571648269891739, "step": 4576, "train_speed(iter/s)": 0.124844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 308.5625, "completions/min_length": 243.0, "epoch": 3.785773366418528, "grad_norm": 0.1726665049791336, "kl": 0.0811767578125, "learning_rate": 1.6917082738054319e-06, "loss": 0.000811845064163208, "memory(GiB)": 38.1, "reward": 0.6489920020103455, "reward_std": 0.04023313522338867, "rewards/VisualizationJSONCombinedORM/mean": 0.6489920020103455, "rewards/VisualizationJSONCombinedORM/std": 0.13365109264850616, "step": 4577, "train_speed(iter/s)": 0.124781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 327.3125, "completions/min_length": 260.0, "epoch": 3.7866004962779156, "grad_norm": 0.1428038328886032, "kl": 0.04296875, "learning_rate": 1.6895437673760728e-06, "loss": 0.00043010147055611014, "memory(GiB)": 38.1, "reward": 0.6758760213851929, "reward_std": 0.03641234710812569, "rewards/VisualizationJSONCombinedORM/mean": 0.6758760213851929, "rewards/VisualizationJSONCombinedORM/std": 0.06603750586509705, "step": 4578, "train_speed(iter/s)": 0.124712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 287.5625, "completions/min_length": 226.0, "epoch": 3.7874276261373034, "grad_norm": 0.1919844150543213, "kl": 0.0458984375, "learning_rate": 1.6873803649993647e-06, "loss": 0.00046006590127944946, "memory(GiB)": 38.1, "reward": 0.6216269135475159, "reward_std": 0.06842412799596786, "rewards/VisualizationJSONCombinedORM/mean": 0.6216269135475159, "rewards/VisualizationJSONCombinedORM/std": 0.09370435029268265, "step": 4579, "train_speed(iter/s)": 0.124666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 294.6875, "completions/min_length": 230.0, "epoch": 3.7882547559966913, "grad_norm": 0.26359331607818604, "kl": 0.27978515625, "learning_rate": 1.6852180673968093e-06, "loss": 0.0027919411659240723, "memory(GiB)": 38.1, "reward": 0.6083382368087769, "reward_std": 0.08093813061714172, "rewards/VisualizationJSONCombinedORM/mean": 0.6083382368087769, "rewards/VisualizationJSONCombinedORM/std": 0.10564392060041428, "step": 4580, "train_speed(iter/s)": 0.124587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 323.75, "completions/min_length": 248.0, "epoch": 3.7890818858560795, "grad_norm": 0.18110792338848114, "kl": 0.0755615234375, "learning_rate": 1.6830568752895455e-06, "loss": 0.0007541216909885406, "memory(GiB)": 38.1, "reward": 0.3907971680164337, "reward_std": 0.03202147036790848, "rewards/VisualizationJSONCombinedORM/mean": 0.3907971680164337, "rewards/VisualizationJSONCombinedORM/std": 0.2410748153924942, "step": 4581, "train_speed(iter/s)": 0.124529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 336.375, "completions/min_length": 253.0, "epoch": 3.7899090157154673, "grad_norm": 0.20594802498817444, "kl": 0.07867431640625, "learning_rate": 1.6808967893983397e-06, "loss": 0.0007864907383918762, "memory(GiB)": 38.1, "reward": 0.5366384983062744, "reward_std": 0.05487446486949921, "rewards/VisualizationJSONCombinedORM/mean": 0.5366384983062744, "rewards/VisualizationJSONCombinedORM/std": 0.2718435227870941, "step": 4582, "train_speed(iter/s)": 0.124463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 326.9375, "completions/min_length": 234.0, "epoch": 3.790736145574855, "grad_norm": 0.21317742764949799, "kl": 0.1009521484375, "learning_rate": 1.6787378104435931e-06, "loss": 0.0010109823197126389, "memory(GiB)": 38.1, "reward": 0.5246866345405579, "reward_std": 0.06713719666004181, "rewards/VisualizationJSONCombinedORM/mean": 0.5246866345405579, "rewards/VisualizationJSONCombinedORM/std": 0.19690430164337158, "step": 4583, "train_speed(iter/s)": 0.124393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 334.625, "completions/min_length": 270.0, "epoch": 3.7915632754342434, "grad_norm": 0.197585791349411, "kl": 0.0594482421875, "learning_rate": 1.6765799391453302e-06, "loss": 0.0005945786833763123, "memory(GiB)": 38.1, "reward": 0.37329012155532837, "reward_std": 0.039262451231479645, "rewards/VisualizationJSONCombinedORM/mean": 0.37329012155532837, "rewards/VisualizationJSONCombinedORM/std": 0.10060340911149979, "step": 4584, "train_speed(iter/s)": 0.124329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 297.875, "completions/min_length": 234.0, "epoch": 3.7923904052936313, "grad_norm": 0.17844827473163605, "kl": 0.1376953125, "learning_rate": 1.6744231762232178e-06, "loss": 0.0013749152421951294, "memory(GiB)": 38.1, "reward": 0.4641650319099426, "reward_std": 0.04842686280608177, "rewards/VisualizationJSONCombinedORM/mean": 0.4641650319099426, "rewards/VisualizationJSONCombinedORM/std": 0.27746808528900146, "step": 4585, "train_speed(iter/s)": 0.124274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 316.375, "completions/min_length": 237.0, "epoch": 3.793217535153019, "grad_norm": 0.2076437920331955, "kl": 0.05438232421875, "learning_rate": 1.6722675223965412e-06, "loss": 0.0005447231233119965, "memory(GiB)": 38.1, "reward": 0.46951302886009216, "reward_std": 0.046393051743507385, "rewards/VisualizationJSONCombinedORM/mean": 0.46951302886009216, "rewards/VisualizationJSONCombinedORM/std": 0.20634257793426514, "step": 4586, "train_speed(iter/s)": 0.12422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 335.25, "completions/min_length": 269.0, "epoch": 3.794044665012407, "grad_norm": 0.21680690348148346, "kl": 0.07904052734375, "learning_rate": 1.670112978384223e-06, "loss": 0.0007908418774604797, "memory(GiB)": 38.1, "reward": 0.40238407254219055, "reward_std": 0.03974563255906105, "rewards/VisualizationJSONCombinedORM/mean": 0.40238407254219055, "rewards/VisualizationJSONCombinedORM/std": 0.20149484276771545, "step": 4587, "train_speed(iter/s)": 0.124159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 323.9375, "completions/min_length": 287.0, "epoch": 3.7948717948717947, "grad_norm": 0.1644982099533081, "kl": 0.1004638671875, "learning_rate": 1.6679595449048137e-06, "loss": 0.0010031387209892273, "memory(GiB)": 38.1, "reward": 0.40300923585891724, "reward_std": 0.03874557092785835, "rewards/VisualizationJSONCombinedORM/mean": 0.40300923585891724, "rewards/VisualizationJSONCombinedORM/std": 0.20392218232154846, "step": 4588, "train_speed(iter/s)": 0.124097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 294.75, "completions/min_length": 231.0, "epoch": 3.795698924731183, "grad_norm": 0.17694911360740662, "kl": 0.05059814453125, "learning_rate": 1.6658072226764949e-06, "loss": 0.0005058497190475464, "memory(GiB)": 38.1, "reward": 0.5101790428161621, "reward_std": 0.0453367605805397, "rewards/VisualizationJSONCombinedORM/mean": 0.5101790428161621, "rewards/VisualizationJSONCombinedORM/std": 0.21862851083278656, "step": 4589, "train_speed(iter/s)": 0.124059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 317.375, "completions/min_length": 256.0, "epoch": 3.796526054590571, "grad_norm": 0.16846108436584473, "kl": 0.0328369140625, "learning_rate": 1.6636560124170713e-06, "loss": 0.0003286479040980339, "memory(GiB)": 38.1, "reward": 0.3692827820777893, "reward_std": 0.04809274524450302, "rewards/VisualizationJSONCombinedORM/mean": 0.3692827820777893, "rewards/VisualizationJSONCombinedORM/std": 0.052818816155195236, "step": 4590, "train_speed(iter/s)": 0.123997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 289.6875, "completions/min_length": 233.0, "epoch": 3.7973531844499586, "grad_norm": 0.24206174910068512, "kl": 0.0767822265625, "learning_rate": 1.6615059148439882e-06, "loss": 0.0007682666182518005, "memory(GiB)": 38.1, "reward": 0.38365963101387024, "reward_std": 0.05268393084406853, "rewards/VisualizationJSONCombinedORM/mean": 0.38365963101387024, "rewards/VisualizationJSONCombinedORM/std": 0.061192892491817474, "step": 4591, "train_speed(iter/s)": 0.123938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 298.3125, "completions/min_length": 243.0, "epoch": 3.7981803143093464, "grad_norm": 0.19384056329727173, "kl": 0.051025390625, "learning_rate": 1.6593569306743085e-06, "loss": 0.0005105528980493546, "memory(GiB)": 38.1, "reward": 0.640687108039856, "reward_std": 0.04179355874657631, "rewards/VisualizationJSONCombinedORM/mean": 0.640687108039856, "rewards/VisualizationJSONCombinedORM/std": 0.047264523804187775, "step": 4592, "train_speed(iter/s)": 0.123885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 338.0625, "completions/min_length": 240.0, "epoch": 3.7990074441687343, "grad_norm": 0.16192662715911865, "kl": 0.1192626953125, "learning_rate": 1.6572090606247294e-06, "loss": 0.0011911801993846893, "memory(GiB)": 38.1, "reward": 0.6616648435592651, "reward_std": 0.042515940964221954, "rewards/VisualizationJSONCombinedORM/mean": 0.6616648435592651, "rewards/VisualizationJSONCombinedORM/std": 0.06646093726158142, "step": 4593, "train_speed(iter/s)": 0.123819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 313.6875, "completions/min_length": 236.0, "epoch": 3.7998345740281225, "grad_norm": 0.13422229886054993, "kl": 0.1058349609375, "learning_rate": 1.655062305411576e-06, "loss": 0.0010594235500320792, "memory(GiB)": 38.1, "reward": 0.7692047953605652, "reward_std": 0.046698350459337234, "rewards/VisualizationJSONCombinedORM/mean": 0.7692047953605652, "rewards/VisualizationJSONCombinedORM/std": 0.07640425115823746, "step": 4594, "train_speed(iter/s)": 0.123755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 318.875, "completions/min_length": 255.0, "epoch": 3.8006617038875103, "grad_norm": 0.16501957178115845, "kl": 0.046630859375, "learning_rate": 1.6529166657508033e-06, "loss": 0.00046718865633010864, "memory(GiB)": 38.1, "reward": 0.6221792697906494, "reward_std": 0.06024739518761635, "rewards/VisualizationJSONCombinedORM/mean": 0.6221792697906494, "rewards/VisualizationJSONCombinedORM/std": 0.1514357626438141, "step": 4595, "train_speed(iter/s)": 0.123695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 311.5, "completions/min_length": 252.0, "epoch": 3.801488833746898, "grad_norm": 0.22920647263526917, "kl": 0.06146240234375, "learning_rate": 1.6507721423579886e-06, "loss": 0.0006141737103462219, "memory(GiB)": 38.1, "reward": 0.47700658440589905, "reward_std": 0.058138396590948105, "rewards/VisualizationJSONCombinedORM/mean": 0.47700658440589905, "rewards/VisualizationJSONCombinedORM/std": 0.09674457460641861, "step": 4596, "train_speed(iter/s)": 0.123632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 316.8125, "completions/min_length": 241.0, "epoch": 3.8023159636062864, "grad_norm": 0.1714126467704773, "kl": 0.06292724609375, "learning_rate": 1.6486287359483422e-06, "loss": 0.0006323084235191345, "memory(GiB)": 38.1, "reward": 0.4611627161502838, "reward_std": 0.023738674819469452, "rewards/VisualizationJSONCombinedORM/mean": 0.4611627161502838, "rewards/VisualizationJSONCombinedORM/std": 0.09711936116218567, "step": 4597, "train_speed(iter/s)": 0.123578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 305.875, "completions/min_length": 242.0, "epoch": 3.8031430934656743, "grad_norm": 0.19219160079956055, "kl": 0.03302001953125, "learning_rate": 1.6464864472366999e-06, "loss": 0.00033010542392730713, "memory(GiB)": 38.1, "reward": 0.4420816898345947, "reward_std": 0.061949945986270905, "rewards/VisualizationJSONCombinedORM/mean": 0.4420816898345947, "rewards/VisualizationJSONCombinedORM/std": 0.24155233800411224, "step": 4598, "train_speed(iter/s)": 0.123509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 316.6875, "completions/min_length": 243.0, "epoch": 3.803970223325062, "grad_norm": 0.27719247341156006, "kl": 0.06024169921875, "learning_rate": 1.6443452769375261e-06, "loss": 0.0006018169224262238, "memory(GiB)": 38.1, "reward": 0.7287166118621826, "reward_std": 0.08234916627407074, "rewards/VisualizationJSONCombinedORM/mean": 0.7287166118621826, "rewards/VisualizationJSONCombinedORM/std": 0.1662769913673401, "step": 4599, "train_speed(iter/s)": 0.123451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 314.9375, "completions/min_length": 239.0, "epoch": 3.80479735318445, "grad_norm": 0.18901468813419342, "kl": 0.06744384765625, "learning_rate": 1.642205225764908e-06, "loss": 0.0006737448275089264, "memory(GiB)": 38.1, "reward": 0.581375002861023, "reward_std": 0.02544781193137169, "rewards/VisualizationJSONCombinedORM/mean": 0.581375002861023, "rewards/VisualizationJSONCombinedORM/std": 0.08630555123090744, "step": 4600, "train_speed(iter/s)": 0.123383 }, { "epoch": 3.80479735318445, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 373.0833333333333, "eval_completions/mean_length": 313.78125, "eval_completions/min_length": 260.4583333333333, "eval_kl": 0.09105428059895833, "eval_loss": 0.0009216790203936398, "eval_reward": 0.47170065591732663, "eval_reward_std": 0.054236497497186065, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.47170065591732663, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05423649738077074, "eval_runtime": 316.0993, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 4600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 314.9375, "completions/min_length": 268.0, "epoch": 3.8056244830438377, "grad_norm": 0.1946440190076828, "kl": 0.066650390625, "learning_rate": 1.6400662944325656e-06, "loss": 0.0006668530404567719, "memory(GiB)": 38.1, "reward": 0.6868813037872314, "reward_std": 0.06931942701339722, "rewards/VisualizationJSONCombinedORM/mean": 0.6868813037872314, "rewards/VisualizationJSONCombinedORM/std": 0.11986492574214935, "step": 4601, "train_speed(iter/s)": 0.122286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 309.625, "completions/min_length": 239.0, "epoch": 3.806451612903226, "grad_norm": 0.2422434687614441, "kl": 0.2781982421875, "learning_rate": 1.637928483653844e-06, "loss": 0.0027839578688144684, "memory(GiB)": 38.1, "reward": 0.3949284255504608, "reward_std": 0.03683606535196304, "rewards/VisualizationJSONCombinedORM/mean": 0.3949284255504608, "rewards/VisualizationJSONCombinedORM/std": 0.13140735030174255, "step": 4602, "train_speed(iter/s)": 0.122236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 270.5, "completions/min_length": 223.0, "epoch": 3.807278742762614, "grad_norm": 0.1836845725774765, "kl": 0.0419921875, "learning_rate": 1.635791794141709e-06, "loss": 0.0004195496439933777, "memory(GiB)": 38.1, "reward": 0.6398336291313171, "reward_std": 0.04398347809910774, "rewards/VisualizationJSONCombinedORM/mean": 0.6398336291313171, "rewards/VisualizationJSONCombinedORM/std": 0.05655864253640175, "step": 4603, "train_speed(iter/s)": 0.122181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 273.625, "completions/min_length": 238.0, "epoch": 3.8081058726220016, "grad_norm": 0.17554178833961487, "kl": 0.068115234375, "learning_rate": 1.6336562266087585e-06, "loss": 0.0006804578006267548, "memory(GiB)": 38.1, "reward": 0.4847688674926758, "reward_std": 0.14401765167713165, "rewards/VisualizationJSONCombinedORM/mean": 0.4847688674926758, "rewards/VisualizationJSONCombinedORM/std": 0.23720942437648773, "step": 4604, "train_speed(iter/s)": 0.122123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 300.125, "completions/min_length": 248.0, "epoch": 3.8089330024813894, "grad_norm": 0.2029285579919815, "kl": 0.078857421875, "learning_rate": 1.6315217817672142e-06, "loss": 0.0007885154336690903, "memory(GiB)": 38.1, "reward": 0.7461739778518677, "reward_std": 0.08941015601158142, "rewards/VisualizationJSONCombinedORM/mean": 0.7461739778518677, "rewards/VisualizationJSONCombinedORM/std": 0.0949627086520195, "step": 4605, "train_speed(iter/s)": 0.122075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 319.4375, "completions/min_length": 282.0, "epoch": 3.8097601323407773, "grad_norm": 0.23884569108486176, "kl": 0.1875, "learning_rate": 1.6293884603289246e-06, "loss": 0.0018707644194364548, "memory(GiB)": 38.1, "reward": 0.6315009593963623, "reward_std": 0.09389227628707886, "rewards/VisualizationJSONCombinedORM/mean": 0.6315009593963623, "rewards/VisualizationJSONCombinedORM/std": 0.1479329764842987, "step": 4606, "train_speed(iter/s)": 0.12201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 319.3125, "completions/min_length": 273.0, "epoch": 3.8105872622001655, "grad_norm": 0.24176299571990967, "kl": 0.05975341796875, "learning_rate": 1.6272562630053585e-06, "loss": 0.0005985461175441742, "memory(GiB)": 38.1, "reward": 0.5543351173400879, "reward_std": 0.06208857148885727, "rewards/VisualizationJSONCombinedORM/mean": 0.5543351173400879, "rewards/VisualizationJSONCombinedORM/std": 0.09378442168235779, "step": 4607, "train_speed(iter/s)": 0.121947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 313.8125, "completions/min_length": 220.0, "epoch": 3.8114143920595533, "grad_norm": 0.17659741640090942, "kl": 0.10791015625, "learning_rate": 1.6251251905076192e-06, "loss": 0.001080431044101715, "memory(GiB)": 38.1, "reward": 0.23669184744358063, "reward_std": 0.023107722401618958, "rewards/VisualizationJSONCombinedORM/mean": 0.23669184744358063, "rewards/VisualizationJSONCombinedORM/std": 0.026276225224137306, "step": 4608, "train_speed(iter/s)": 0.121867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 300.6875, "completions/min_length": 235.0, "epoch": 3.812241521918941, "grad_norm": 0.22790056467056274, "kl": 0.0662841796875, "learning_rate": 1.6229952435464252e-06, "loss": 0.0006643394008278847, "memory(GiB)": 38.1, "reward": 0.49965718388557434, "reward_std": 0.05613310635089874, "rewards/VisualizationJSONCombinedORM/mean": 0.49965718388557434, "rewards/VisualizationJSONCombinedORM/std": 0.1353355050086975, "step": 4609, "train_speed(iter/s)": 0.121811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 314.6875, "completions/min_length": 240.0, "epoch": 3.8130686517783294, "grad_norm": 0.24114395678043365, "kl": 0.12164306640625, "learning_rate": 1.6208664228321254e-06, "loss": 0.0012204628437757492, "memory(GiB)": 38.1, "reward": 0.5030477046966553, "reward_std": 0.07179637253284454, "rewards/VisualizationJSONCombinedORM/mean": 0.5030477046966553, "rewards/VisualizationJSONCombinedORM/std": 0.07562048733234406, "step": 4610, "train_speed(iter/s)": 0.121743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 297.625, "completions/min_length": 240.0, "epoch": 3.8138957816377173, "grad_norm": 0.2223661094903946, "kl": 0.0826416015625, "learning_rate": 1.6187387290746908e-06, "loss": 0.0008263364434242249, "memory(GiB)": 38.1, "reward": 0.48855912685394287, "reward_std": 0.06987515091896057, "rewards/VisualizationJSONCombinedORM/mean": 0.48855912685394287, "rewards/VisualizationJSONCombinedORM/std": 0.1395518183708191, "step": 4611, "train_speed(iter/s)": 0.121673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 333.75, "completions/min_length": 273.0, "epoch": 3.814722911497105, "grad_norm": 0.19442108273506165, "kl": 0.06256103515625, "learning_rate": 1.6166121629837195e-06, "loss": 0.0006260797381401062, "memory(GiB)": 38.1, "reward": 0.5253446102142334, "reward_std": 0.04122485592961311, "rewards/VisualizationJSONCombinedORM/mean": 0.5253446102142334, "rewards/VisualizationJSONCombinedORM/std": 0.19297268986701965, "step": 4612, "train_speed(iter/s)": 0.121626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 338.6875, "completions/min_length": 237.0, "epoch": 3.815550041356493, "grad_norm": 0.22942322492599487, "kl": 0.1995849609375, "learning_rate": 1.614486725268426e-06, "loss": 0.001997917890548706, "memory(GiB)": 38.1, "reward": 0.5477604866027832, "reward_std": 0.062432944774627686, "rewards/VisualizationJSONCombinedORM/mean": 0.5477604866027832, "rewards/VisualizationJSONCombinedORM/std": 0.10701411962509155, "step": 4613, "train_speed(iter/s)": 0.121561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 321.9375, "completions/min_length": 249.0, "epoch": 3.8163771712158807, "grad_norm": 0.2443990260362625, "kl": 0.2679443359375, "learning_rate": 1.6123624166376606e-06, "loss": 0.002669721841812134, "memory(GiB)": 38.1, "reward": 0.4091264307498932, "reward_std": 0.035307835787534714, "rewards/VisualizationJSONCombinedORM/mean": 0.4091264307498932, "rewards/VisualizationJSONCombinedORM/std": 0.044586800038814545, "step": 4614, "train_speed(iter/s)": 0.121505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 290.625, "completions/min_length": 206.0, "epoch": 3.817204301075269, "grad_norm": 0.5423189997673035, "kl": 0.268798828125, "learning_rate": 1.610239237799885e-06, "loss": 0.00268588587641716, "memory(GiB)": 38.1, "reward": 0.5457475185394287, "reward_std": 0.11454436928033829, "rewards/VisualizationJSONCombinedORM/mean": 0.5457475185394287, "rewards/VisualizationJSONCombinedORM/std": 0.16883067786693573, "step": 4615, "train_speed(iter/s)": 0.121459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 335.4375, "completions/min_length": 254.0, "epoch": 3.818031430934657, "grad_norm": 0.2190936803817749, "kl": 0.0919189453125, "learning_rate": 1.6081171894631903e-06, "loss": 0.0009195730090141296, "memory(GiB)": 38.1, "reward": 0.6168762445449829, "reward_std": 0.07635684311389923, "rewards/VisualizationJSONCombinedORM/mean": 0.6168762445449829, "rewards/VisualizationJSONCombinedORM/std": 0.11684735864400864, "step": 4616, "train_speed(iter/s)": 0.121401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 330.5, "completions/min_length": 239.0, "epoch": 3.8188585607940446, "grad_norm": 0.19006489217281342, "kl": 0.090087890625, "learning_rate": 1.6059962723352912e-06, "loss": 0.0009034257382154465, "memory(GiB)": 38.1, "reward": 0.44130632281303406, "reward_std": 0.027845490723848343, "rewards/VisualizationJSONCombinedORM/mean": 0.44130632281303406, "rewards/VisualizationJSONCombinedORM/std": 0.2120920717716217, "step": 4617, "train_speed(iter/s)": 0.121338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 291.8125, "completions/min_length": 215.0, "epoch": 3.8196856906534324, "grad_norm": 0.1690497249364853, "kl": 0.1505126953125, "learning_rate": 1.6038764871235236e-06, "loss": 0.0015060044825077057, "memory(GiB)": 38.1, "reward": 0.5724311470985413, "reward_std": 0.08054889738559723, "rewards/VisualizationJSONCombinedORM/mean": 0.5724311470985413, "rewards/VisualizationJSONCombinedORM/std": 0.30494174361228943, "step": 4618, "train_speed(iter/s)": 0.121279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 288.0625, "completions/min_length": 218.0, "epoch": 3.8205128205128203, "grad_norm": 0.175754576921463, "kl": 0.0970458984375, "learning_rate": 1.6017578345348429e-06, "loss": 0.0009729526937007904, "memory(GiB)": 38.1, "reward": 0.4422956109046936, "reward_std": 0.03841649368405342, "rewards/VisualizationJSONCombinedORM/mean": 0.4422956109046936, "rewards/VisualizationJSONCombinedORM/std": 0.2728753685951233, "step": 4619, "train_speed(iter/s)": 0.121217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 292.1875, "completions/min_length": 232.0, "epoch": 3.8213399503722085, "grad_norm": 0.20209725201129913, "kl": 0.0999755859375, "learning_rate": 1.5996403152758315e-06, "loss": 0.0010004602372646332, "memory(GiB)": 38.1, "reward": 0.6920512914657593, "reward_std": 0.07681450247764587, "rewards/VisualizationJSONCombinedORM/mean": 0.6920512914657593, "rewards/VisualizationJSONCombinedORM/std": 0.07466717809438705, "step": 4620, "train_speed(iter/s)": 0.121161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 296.8125, "completions/min_length": 213.0, "epoch": 3.8221670802315963, "grad_norm": 0.22070381045341492, "kl": 0.06396484375, "learning_rate": 1.5975239300526924e-06, "loss": 0.0006402954459190369, "memory(GiB)": 38.1, "reward": 0.3688664734363556, "reward_std": 0.035125311464071274, "rewards/VisualizationJSONCombinedORM/mean": 0.3688664734363556, "rewards/VisualizationJSONCombinedORM/std": 0.038661446422338486, "step": 4621, "train_speed(iter/s)": 0.12111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 313.1875, "completions/min_length": 237.0, "epoch": 3.822994210090984, "grad_norm": 0.15872539579868317, "kl": 0.0428466796875, "learning_rate": 1.595408679571251e-06, "loss": 0.0004275888204574585, "memory(GiB)": 38.1, "reward": 0.7110751867294312, "reward_std": 0.06900496780872345, "rewards/VisualizationJSONCombinedORM/mean": 0.7110751867294312, "rewards/VisualizationJSONCombinedORM/std": 0.08916473388671875, "step": 4622, "train_speed(iter/s)": 0.121052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 304.875, "completions/min_length": 242.0, "epoch": 3.8238213399503724, "grad_norm": 0.17711618542671204, "kl": 0.03594970703125, "learning_rate": 1.5932945645369486e-06, "loss": 0.00035945698618888855, "memory(GiB)": 38.1, "reward": 0.6373803615570068, "reward_std": 0.0761885941028595, "rewards/VisualizationJSONCombinedORM/mean": 0.6373803615570068, "rewards/VisualizationJSONCombinedORM/std": 0.07442078739404678, "step": 4623, "train_speed(iter/s)": 0.120993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 331.0625, "completions/min_length": 285.0, "epoch": 3.8246484698097603, "grad_norm": 0.15655656158924103, "kl": 0.07916259765625, "learning_rate": 1.5911815856548584e-06, "loss": 0.0007936432957649231, "memory(GiB)": 38.1, "reward": 0.5771881937980652, "reward_std": 0.03531063720583916, "rewards/VisualizationJSONCombinedORM/mean": 0.5771881937980652, "rewards/VisualizationJSONCombinedORM/std": 0.059737928211688995, "step": 4624, "train_speed(iter/s)": 0.120921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 316.25, "completions/min_length": 260.0, "epoch": 3.825475599669148, "grad_norm": 0.22389960289001465, "kl": 0.0789794921875, "learning_rate": 1.5890697436296648e-06, "loss": 0.0007908567786216736, "memory(GiB)": 38.1, "reward": 0.48113369941711426, "reward_std": 0.05497617647051811, "rewards/VisualizationJSONCombinedORM/mean": 0.48113369941711426, "rewards/VisualizationJSONCombinedORM/std": 0.10113178938627243, "step": 4625, "train_speed(iter/s)": 0.120863 }, { "epoch": 3.825475599669148, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 377.25, "eval_completions/mean_length": 312.0104166666667, "eval_completions/min_length": 254.5, "eval_kl": 0.06682332356770833, "eval_loss": 0.0006691012531518936, "eval_reward": 0.44288671016693115, "eval_reward_std": 0.061838303110562265, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44288671016693115, "eval_rewards/VisualizationJSONCombinedORM/std": 0.061838303110562265, "eval_runtime": 318.3121, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 4625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 319.375, "completions/min_length": 224.0, "epoch": 3.826302729528536, "grad_norm": 0.1885824203491211, "kl": 0.052490234375, "learning_rate": 1.5869590391656781e-06, "loss": 0.0005256347358226776, "memory(GiB)": 38.1, "reward": 0.29066529870033264, "reward_std": 0.040878668427467346, "rewards/VisualizationJSONCombinedORM/mean": 0.29066529870033264, "rewards/VisualizationJSONCombinedORM/std": 0.06701089441776276, "step": 4626, "train_speed(iter/s)": 0.119815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 306.6875, "completions/min_length": 225.0, "epoch": 3.8271298593879237, "grad_norm": 0.18000607192516327, "kl": 0.0941162109375, "learning_rate": 1.584849472966828e-06, "loss": 0.0009414404630661011, "memory(GiB)": 38.1, "reward": 0.535037636756897, "reward_std": 0.04609137773513794, "rewards/VisualizationJSONCombinedORM/mean": 0.535037636756897, "rewards/VisualizationJSONCombinedORM/std": 0.21317638456821442, "step": 4627, "train_speed(iter/s)": 0.119775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 330.9375, "completions/min_length": 248.0, "epoch": 3.827956989247312, "grad_norm": 0.23787952959537506, "kl": 0.08245849609375, "learning_rate": 1.5827410457366665e-06, "loss": 0.0008232947438955307, "memory(GiB)": 38.1, "reward": 0.22551505267620087, "reward_std": 0.02193436399102211, "rewards/VisualizationJSONCombinedORM/mean": 0.22551505267620087, "rewards/VisualizationJSONCombinedORM/std": 0.05254482477903366, "step": 4628, "train_speed(iter/s)": 0.119721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 294.0625, "completions/min_length": 228.0, "epoch": 3.8287841191067, "grad_norm": 0.18305586278438568, "kl": 0.0499267578125, "learning_rate": 1.5806337581783593e-06, "loss": 0.0004992596805095673, "memory(GiB)": 38.1, "reward": 0.6685028076171875, "reward_std": 0.051726747304201126, "rewards/VisualizationJSONCombinedORM/mean": 0.6685028076171875, "rewards/VisualizationJSONCombinedORM/std": 0.15259332954883575, "step": 4629, "train_speed(iter/s)": 0.119676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 315.6875, "completions/min_length": 243.0, "epoch": 3.8296112489660876, "grad_norm": 0.17856653034687042, "kl": 0.066650390625, "learning_rate": 1.5785276109947028e-06, "loss": 0.0006657205522060394, "memory(GiB)": 38.1, "reward": 0.6100641489028931, "reward_std": 0.07437494397163391, "rewards/VisualizationJSONCombinedORM/mean": 0.6100641489028931, "rewards/VisualizationJSONCombinedORM/std": 0.13197225332260132, "step": 4630, "train_speed(iter/s)": 0.119615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 325.5625, "completions/min_length": 254.0, "epoch": 3.8304383788254754, "grad_norm": 0.24808312952518463, "kl": 0.04901123046875, "learning_rate": 1.576422604888102e-06, "loss": 0.000490725040435791, "memory(GiB)": 38.1, "reward": 0.5637040138244629, "reward_std": 0.06401994824409485, "rewards/VisualizationJSONCombinedORM/mean": 0.5637040138244629, "rewards/VisualizationJSONCombinedORM/std": 0.06188538670539856, "step": 4631, "train_speed(iter/s)": 0.119565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 336.75, "completions/min_length": 268.0, "epoch": 3.8312655086848633, "grad_norm": 0.1977362483739853, "kl": 0.146240234375, "learning_rate": 1.5743187405605885e-06, "loss": 0.0014580897986888885, "memory(GiB)": 38.1, "reward": 0.35190749168395996, "reward_std": 0.04111699014902115, "rewards/VisualizationJSONCombinedORM/mean": 0.35190749168395996, "rewards/VisualizationJSONCombinedORM/std": 0.11089355498552322, "step": 4632, "train_speed(iter/s)": 0.119489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 327.9375, "completions/min_length": 275.0, "epoch": 3.8320926385442515, "grad_norm": 0.18291422724723816, "kl": 0.0609130859375, "learning_rate": 1.5722160187138102e-06, "loss": 0.0006103590130805969, "memory(GiB)": 38.1, "reward": 0.5683479309082031, "reward_std": 0.05093630403280258, "rewards/VisualizationJSONCombinedORM/mean": 0.5683479309082031, "rewards/VisualizationJSONCombinedORM/std": 0.08874861896038055, "step": 4633, "train_speed(iter/s)": 0.119429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 302.5, "completions/min_length": 224.0, "epoch": 3.8329197684036393, "grad_norm": 0.21562710404396057, "kl": 0.05889892578125, "learning_rate": 1.570114440049037e-06, "loss": 0.0005894973874092102, "memory(GiB)": 38.1, "reward": 0.617017388343811, "reward_std": 0.03900529444217682, "rewards/VisualizationJSONCombinedORM/mean": 0.617017388343811, "rewards/VisualizationJSONCombinedORM/std": 0.19120806455612183, "step": 4634, "train_speed(iter/s)": 0.119368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 319.8125, "completions/min_length": 241.0, "epoch": 3.833746898263027, "grad_norm": 0.2067052125930786, "kl": 0.0526123046875, "learning_rate": 1.5680140052671516e-06, "loss": 0.0005258917808532715, "memory(GiB)": 38.1, "reward": 0.5704337358474731, "reward_std": 0.051136285066604614, "rewards/VisualizationJSONCombinedORM/mean": 0.5704337358474731, "rewards/VisualizationJSONCombinedORM/std": 0.1829884648323059, "step": 4635, "train_speed(iter/s)": 0.119311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 316.625, "completions/min_length": 226.0, "epoch": 3.8345740281224154, "grad_norm": 0.21564042568206787, "kl": 0.0548095703125, "learning_rate": 1.5659147150686605e-06, "loss": 0.0005486989393830299, "memory(GiB)": 38.1, "reward": 0.3823900818824768, "reward_std": 0.035063546150922775, "rewards/VisualizationJSONCombinedORM/mean": 0.3823900818824768, "rewards/VisualizationJSONCombinedORM/std": 0.10932782292366028, "step": 4636, "train_speed(iter/s)": 0.119267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 301.75, "completions/min_length": 247.0, "epoch": 3.8354011579818033, "grad_norm": 0.21960493922233582, "kl": 0.057373046875, "learning_rate": 1.5638165701536866e-06, "loss": 0.0005740746855735779, "memory(GiB)": 38.1, "reward": 0.4609648883342743, "reward_std": 0.09248387813568115, "rewards/VisualizationJSONCombinedORM/mean": 0.4609648883342743, "rewards/VisualizationJSONCombinedORM/std": 0.20135928690433502, "step": 4637, "train_speed(iter/s)": 0.119214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 330.375, "completions/min_length": 282.0, "epoch": 3.836228287841191, "grad_norm": 0.17437124252319336, "kl": 0.036041259765625, "learning_rate": 1.561719571221973e-06, "loss": 0.0003623589873313904, "memory(GiB)": 38.1, "reward": 0.7717199921607971, "reward_std": 0.029088126495480537, "rewards/VisualizationJSONCombinedORM/mean": 0.7717199921607971, "rewards/VisualizationJSONCombinedORM/std": 0.08240217715501785, "step": 4638, "train_speed(iter/s)": 0.119149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 360.1875, "completions/min_length": 310.0, "epoch": 3.837055417700579, "grad_norm": 0.199645534157753, "kl": 0.112548828125, "learning_rate": 1.5596237189728736e-06, "loss": 0.0011251792311668396, "memory(GiB)": 38.1, "reward": 0.7068163752555847, "reward_std": 0.08348412811756134, "rewards/VisualizationJSONCombinedORM/mean": 0.7068163752555847, "rewards/VisualizationJSONCombinedORM/std": 0.09572522342205048, "step": 4639, "train_speed(iter/s)": 0.119096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 296.9375, "completions/min_length": 237.0, "epoch": 3.8378825475599667, "grad_norm": 0.2230817973613739, "kl": 0.0745849609375, "learning_rate": 1.5575290141053712e-06, "loss": 0.0007470399141311646, "memory(GiB)": 38.1, "reward": 0.6005339622497559, "reward_std": 0.06453533470630646, "rewards/VisualizationJSONCombinedORM/mean": 0.6005339622497559, "rewards/VisualizationJSONCombinedORM/std": 0.15629559755325317, "step": 4640, "train_speed(iter/s)": 0.119041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 309.25, "completions/min_length": 255.0, "epoch": 3.838709677419355, "grad_norm": 0.224532812833786, "kl": 0.032379150390625, "learning_rate": 1.5554354573180553e-06, "loss": 0.00032401829957962036, "memory(GiB)": 38.1, "reward": 0.364055871963501, "reward_std": 0.046482332050800323, "rewards/VisualizationJSONCombinedORM/mean": 0.364055871963501, "rewards/VisualizationJSONCombinedORM/std": 0.1020197719335556, "step": 4641, "train_speed(iter/s)": 0.118988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 324.0, "completions/min_length": 239.0, "epoch": 3.839536807278743, "grad_norm": 0.2152535319328308, "kl": 0.0623779296875, "learning_rate": 1.5533430493091384e-06, "loss": 0.0006248801946640015, "memory(GiB)": 38.1, "reward": 0.3756902515888214, "reward_std": 0.034728728234767914, "rewards/VisualizationJSONCombinedORM/mean": 0.3756902515888214, "rewards/VisualizationJSONCombinedORM/std": 0.11040890961885452, "step": 4642, "train_speed(iter/s)": 0.118926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 328.125, "completions/min_length": 266.0, "epoch": 3.8403639371381306, "grad_norm": 0.18206632137298584, "kl": 0.056884765625, "learning_rate": 1.551251790776448e-06, "loss": 0.0005685687065124512, "memory(GiB)": 38.1, "reward": 0.3205534815788269, "reward_std": 0.031319715082645416, "rewards/VisualizationJSONCombinedORM/mean": 0.3205534815788269, "rewards/VisualizationJSONCombinedORM/std": 0.050804998725652695, "step": 4643, "train_speed(iter/s)": 0.118873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 312.9375, "completions/min_length": 268.0, "epoch": 3.8411910669975184, "grad_norm": 0.21855390071868896, "kl": 0.0555419921875, "learning_rate": 1.5491616824174304e-06, "loss": 0.000556308776140213, "memory(GiB)": 38.1, "reward": 0.45912837982177734, "reward_std": 0.048633553087711334, "rewards/VisualizationJSONCombinedORM/mean": 0.45912837982177734, "rewards/VisualizationJSONCombinedORM/std": 0.06276347488164902, "step": 4644, "train_speed(iter/s)": 0.118821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 317.625, "completions/min_length": 253.0, "epoch": 3.8420181968569063, "grad_norm": 0.19853968918323517, "kl": 0.06903076171875, "learning_rate": 1.5470727249291423e-06, "loss": 0.0006884783506393433, "memory(GiB)": 38.1, "reward": 0.5576574802398682, "reward_std": 0.042558278888463974, "rewards/VisualizationJSONCombinedORM/mean": 0.5576574802398682, "rewards/VisualizationJSONCombinedORM/std": 0.04754149913787842, "step": 4645, "train_speed(iter/s)": 0.118759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 322.8125, "completions/min_length": 246.0, "epoch": 3.8428453267162945, "grad_norm": 0.18647602200508118, "kl": 0.1331787109375, "learning_rate": 1.544984919008266e-06, "loss": 0.0013327673077583313, "memory(GiB)": 38.1, "reward": 0.4994511604309082, "reward_std": 0.03050333820283413, "rewards/VisualizationJSONCombinedORM/mean": 0.4994511604309082, "rewards/VisualizationJSONCombinedORM/std": 0.18543806672096252, "step": 4646, "train_speed(iter/s)": 0.118711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 295.5, "completions/min_length": 241.0, "epoch": 3.8436724565756824, "grad_norm": 0.19236579537391663, "kl": 0.03558349609375, "learning_rate": 1.542898265351091e-06, "loss": 0.00035567954182624817, "memory(GiB)": 38.1, "reward": 0.4243341088294983, "reward_std": 0.04649798572063446, "rewards/VisualizationJSONCombinedORM/mean": 0.4243341088294983, "rewards/VisualizationJSONCombinedORM/std": 0.09716223925352097, "step": 4647, "train_speed(iter/s)": 0.118673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 295.5625, "completions/min_length": 226.0, "epoch": 3.84449958643507, "grad_norm": 0.15793493390083313, "kl": 0.03900146484375, "learning_rate": 1.540812764653527e-06, "loss": 0.0003913789987564087, "memory(GiB)": 38.1, "reward": 0.7882394790649414, "reward_std": 0.06023569405078888, "rewards/VisualizationJSONCombinedORM/mean": 0.7882394790649414, "rewards/VisualizationJSONCombinedORM/std": 0.08023766428232193, "step": 4648, "train_speed(iter/s)": 0.118613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/mean_length": 319.125, "completions/min_length": 231.0, "epoch": 3.8453267162944584, "grad_norm": 0.1902535855770111, "kl": 0.1226806640625, "learning_rate": 1.5387284176110983e-06, "loss": 0.0012262724339962006, "memory(GiB)": 38.1, "reward": 0.2941080927848816, "reward_std": 0.04899580031633377, "rewards/VisualizationJSONCombinedORM/mean": 0.2941080927848816, "rewards/VisualizationJSONCombinedORM/std": 0.10831176489591599, "step": 4649, "train_speed(iter/s)": 0.118557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 331.6875, "completions/min_length": 287.0, "epoch": 3.8461538461538463, "grad_norm": 0.1960236132144928, "kl": 0.08514404296875, "learning_rate": 1.5366452249189462e-06, "loss": 0.0008535608649253845, "memory(GiB)": 38.1, "reward": 0.5844063758850098, "reward_std": 0.06778056919574738, "rewards/VisualizationJSONCombinedORM/mean": 0.5844063758850098, "rewards/VisualizationJSONCombinedORM/std": 0.2834285497665405, "step": 4650, "train_speed(iter/s)": 0.118502 }, { "epoch": 3.8461538461538463, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 367.9583333333333, "eval_completions/mean_length": 314.9583333333333, "eval_completions/min_length": 260.6666666666667, "eval_kl": 0.057139078776041664, "eval_loss": 0.0005709950928576291, "eval_reward": 0.43298253292838734, "eval_reward_std": 0.04472997800136606, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.43298253292838734, "eval_rewards/VisualizationJSONCombinedORM/std": 0.044729979029701404, "eval_runtime": 313.2099, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 4650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 277.9375, "completions/min_length": 218.0, "epoch": 3.846980976013234, "grad_norm": 0.18063479661941528, "kl": 0.0472412109375, "learning_rate": 1.5345631872718214e-06, "loss": 0.00047354958951473236, "memory(GiB)": 38.1, "reward": 0.6989355087280273, "reward_std": 0.07606644928455353, "rewards/VisualizationJSONCombinedORM/mean": 0.6989355087280273, "rewards/VisualizationJSONCombinedORM/std": 0.07954250276088715, "step": 4651, "train_speed(iter/s)": 0.117513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 321.6875, "completions/min_length": 264.0, "epoch": 3.847808105872622, "grad_norm": 0.15853357315063477, "kl": 0.04595947265625, "learning_rate": 1.5324823053640959e-06, "loss": 0.0004599913954734802, "memory(GiB)": 38.1, "reward": 0.5544860363006592, "reward_std": 0.06614145636558533, "rewards/VisualizationJSONCombinedORM/mean": 0.5544860363006592, "rewards/VisualizationJSONCombinedORM/std": 0.09433332830667496, "step": 4652, "train_speed(iter/s)": 0.117461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 308.625, "completions/min_length": 239.0, "epoch": 3.8486352357320097, "grad_norm": 0.18775464594364166, "kl": 0.0377197265625, "learning_rate": 1.5304025798897521e-06, "loss": 0.00037825293838977814, "memory(GiB)": 38.1, "reward": 0.4575231969356537, "reward_std": 0.03282404690980911, "rewards/VisualizationJSONCombinedORM/mean": 0.4575231969356537, "rewards/VisualizationJSONCombinedORM/std": 0.05066579952836037, "step": 4653, "train_speed(iter/s)": 0.117412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 289.5625, "completions/min_length": 235.0, "epoch": 3.849462365591398, "grad_norm": 0.16795134544372559, "kl": 0.085296630859375, "learning_rate": 1.5283240115423914e-06, "loss": 0.0008535198867321014, "memory(GiB)": 38.1, "reward": 0.5565769672393799, "reward_std": 0.07473502308130264, "rewards/VisualizationJSONCombinedORM/mean": 0.5565769672393799, "rewards/VisualizationJSONCombinedORM/std": 0.17572972178459167, "step": 4654, "train_speed(iter/s)": 0.117371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 298.1875, "completions/min_length": 220.0, "epoch": 3.850289495450786, "grad_norm": 0.2355143129825592, "kl": 0.050048828125, "learning_rate": 1.52624660101522e-06, "loss": 0.0005007758736610413, "memory(GiB)": 38.1, "reward": 0.6875510215759277, "reward_std": 0.07399609684944153, "rewards/VisualizationJSONCombinedORM/mean": 0.6875510215759277, "rewards/VisualizationJSONCombinedORM/std": 0.11415601521730423, "step": 4655, "train_speed(iter/s)": 0.117328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 344.3125, "completions/min_length": 239.0, "epoch": 3.8511166253101736, "grad_norm": 0.1901288777589798, "kl": 0.0643310546875, "learning_rate": 1.5241703490010707e-06, "loss": 0.0006429627537727356, "memory(GiB)": 38.1, "reward": 0.47947776317596436, "reward_std": 0.04484570026397705, "rewards/VisualizationJSONCombinedORM/mean": 0.47947776317596436, "rewards/VisualizationJSONCombinedORM/std": 0.12235566973686218, "step": 4656, "train_speed(iter/s)": 0.117267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 306.5, "completions/min_length": 220.0, "epoch": 3.851943755169562, "grad_norm": 0.1929129660129547, "kl": 0.0496826171875, "learning_rate": 1.5220952561923791e-06, "loss": 0.0004966557025909424, "memory(GiB)": 38.1, "reward": 0.5238012671470642, "reward_std": 0.07903841882944107, "rewards/VisualizationJSONCombinedORM/mean": 0.5238012671470642, "rewards/VisualizationJSONCombinedORM/std": 0.2563183009624481, "step": 4657, "train_speed(iter/s)": 0.117215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 320.3125, "completions/min_length": 236.0, "epoch": 3.8527708850289497, "grad_norm": 0.19658106565475464, "kl": 0.0997314453125, "learning_rate": 1.5200213232811995e-06, "loss": 0.0009985752403736115, "memory(GiB)": 38.1, "reward": 0.6595914959907532, "reward_std": 0.05923953652381897, "rewards/VisualizationJSONCombinedORM/mean": 0.6595914959907532, "rewards/VisualizationJSONCombinedORM/std": 0.0690436065196991, "step": 4658, "train_speed(iter/s)": 0.117154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 304.375, "completions/min_length": 232.0, "epoch": 3.8535980148883375, "grad_norm": 0.22110190987586975, "kl": 0.05316162109375, "learning_rate": 1.517948550959198e-06, "loss": 0.0005307812243700027, "memory(GiB)": 38.1, "reward": 0.33178919553756714, "reward_std": 0.03475513309240341, "rewards/VisualizationJSONCombinedORM/mean": 0.33178919553756714, "rewards/VisualizationJSONCombinedORM/std": 0.03904148191213608, "step": 4659, "train_speed(iter/s)": 0.117109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 331.125, "completions/min_length": 261.0, "epoch": 3.8544251447477254, "grad_norm": 0.3158712685108185, "kl": 0.08203125, "learning_rate": 1.5158769399176559e-06, "loss": 0.0008181184530258179, "memory(GiB)": 38.1, "reward": 0.5743129849433899, "reward_std": 0.06062488257884979, "rewards/VisualizationJSONCombinedORM/mean": 0.5743129849433899, "rewards/VisualizationJSONCombinedORM/std": 0.0998816192150116, "step": 4660, "train_speed(iter/s)": 0.117063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 289.5625, "completions/min_length": 238.0, "epoch": 3.855252274607113, "grad_norm": 0.18153543770313263, "kl": 0.030120849609375, "learning_rate": 1.5138064908474603e-06, "loss": 0.000301215797662735, "memory(GiB)": 38.1, "reward": 0.3575966954231262, "reward_std": 0.042787108570337296, "rewards/VisualizationJSONCombinedORM/mean": 0.3575966954231262, "rewards/VisualizationJSONCombinedORM/std": 0.1121724545955658, "step": 4661, "train_speed(iter/s)": 0.117019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 343.0, "completions/min_length": 287.0, "epoch": 3.8560794044665014, "grad_norm": 0.18691644072532654, "kl": 0.043701171875, "learning_rate": 1.5117372044391221e-06, "loss": 0.0004376024007797241, "memory(GiB)": 38.1, "reward": 0.4165997803211212, "reward_std": 0.05840964615345001, "rewards/VisualizationJSONCombinedORM/mean": 0.4165997803211212, "rewards/VisualizationJSONCombinedORM/std": 0.14279332756996155, "step": 4662, "train_speed(iter/s)": 0.11697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 312.25, "completions/min_length": 220.0, "epoch": 3.8569065343258893, "grad_norm": 0.24325360357761383, "kl": 0.0860595703125, "learning_rate": 1.5096690813827525e-06, "loss": 0.0008598491549491882, "memory(GiB)": 38.1, "reward": 0.5102759599685669, "reward_std": 0.0745122879743576, "rewards/VisualizationJSONCombinedORM/mean": 0.5102759599685669, "rewards/VisualizationJSONCombinedORM/std": 0.07513067871332169, "step": 4663, "train_speed(iter/s)": 0.116921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 285.125, "completions/min_length": 237.0, "epoch": 3.857733664185277, "grad_norm": 0.17001110315322876, "kl": 0.07373046875, "learning_rate": 1.507602122368083e-06, "loss": 0.0007386729121208191, "memory(GiB)": 38.1, "reward": 0.5627503395080566, "reward_std": 0.0584123358130455, "rewards/VisualizationJSONCombinedORM/mean": 0.5627503395080566, "rewards/VisualizationJSONCombinedORM/std": 0.10609441995620728, "step": 4664, "train_speed(iter/s)": 0.116878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 302.625, "completions/min_length": 229.0, "epoch": 3.858560794044665, "grad_norm": 0.22137749195098877, "kl": 0.08782958984375, "learning_rate": 1.505536328084453e-06, "loss": 0.0008785109966993332, "memory(GiB)": 38.1, "reward": 0.46307340264320374, "reward_std": 0.04626530781388283, "rewards/VisualizationJSONCombinedORM/mean": 0.46307340264320374, "rewards/VisualizationJSONCombinedORM/std": 0.14416776597499847, "step": 4665, "train_speed(iter/s)": 0.116823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 298.125, "completions/min_length": 236.0, "epoch": 3.8593879239040527, "grad_norm": 0.23833604156970978, "kl": 0.09942626953125, "learning_rate": 1.503471699220817e-06, "loss": 0.000995200127363205, "memory(GiB)": 38.1, "reward": 0.7003905177116394, "reward_std": 0.0592532642185688, "rewards/VisualizationJSONCombinedORM/mean": 0.7003905177116394, "rewards/VisualizationJSONCombinedORM/std": 0.07163957506418228, "step": 4666, "train_speed(iter/s)": 0.116785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 306.125, "completions/min_length": 228.0, "epoch": 3.860215053763441, "grad_norm": 0.18075919151306152, "kl": 0.0809326171875, "learning_rate": 1.501408236465735e-06, "loss": 0.0008105337619781494, "memory(GiB)": 38.1, "reward": 0.5086288452148438, "reward_std": 0.0475258007645607, "rewards/VisualizationJSONCombinedORM/mean": 0.5086288452148438, "rewards/VisualizationJSONCombinedORM/std": 0.255534291267395, "step": 4667, "train_speed(iter/s)": 0.116716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 304.9375, "completions/min_length": 239.0, "epoch": 3.861042183622829, "grad_norm": 0.19429059326648712, "kl": 0.05511474609375, "learning_rate": 1.4993459405073825e-06, "loss": 0.0005509946495294571, "memory(GiB)": 38.1, "reward": 0.6094528436660767, "reward_std": 0.06138717010617256, "rewards/VisualizationJSONCombinedORM/mean": 0.6094528436660767, "rewards/VisualizationJSONCombinedORM/std": 0.22394007444381714, "step": 4668, "train_speed(iter/s)": 0.116665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 292.125, "completions/min_length": 223.0, "epoch": 3.8618693134822166, "grad_norm": 0.24445578455924988, "kl": 0.0850830078125, "learning_rate": 1.4972848120335453e-06, "loss": 0.0008519887924194336, "memory(GiB)": 38.1, "reward": 0.3230026364326477, "reward_std": 0.03578919917345047, "rewards/VisualizationJSONCombinedORM/mean": 0.3230026364326477, "rewards/VisualizationJSONCombinedORM/std": 0.10757848620414734, "step": 4669, "train_speed(iter/s)": 0.116608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 320.625, "completions/min_length": 253.0, "epoch": 3.862696443341605, "grad_norm": 0.17533361911773682, "kl": 0.04193115234375, "learning_rate": 1.4952248517316215e-06, "loss": 0.00041912496089935303, "memory(GiB)": 38.1, "reward": 0.391170859336853, "reward_std": 0.03985413536429405, "rewards/VisualizationJSONCombinedORM/mean": 0.391170859336853, "rewards/VisualizationJSONCombinedORM/std": 0.09531887620687485, "step": 4670, "train_speed(iter/s)": 0.116556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 306.25, "completions/min_length": 215.0, "epoch": 3.8635235732009927, "grad_norm": 0.19439896941184998, "kl": 0.128173828125, "learning_rate": 1.4931660602886122e-06, "loss": 0.0012812912464141846, "memory(GiB)": 38.1, "reward": 0.6098841428756714, "reward_std": 0.054558977484703064, "rewards/VisualizationJSONCombinedORM/mean": 0.6098841428756714, "rewards/VisualizationJSONCombinedORM/std": 0.19741643965244293, "step": 4671, "train_speed(iter/s)": 0.116519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 324.9375, "completions/min_length": 269.0, "epoch": 3.8643507030603805, "grad_norm": 0.18595410883426666, "kl": 0.07904052734375, "learning_rate": 1.49110843839114e-06, "loss": 0.0007896721363067627, "memory(GiB)": 38.1, "reward": 0.780789852142334, "reward_std": 0.05106941610574722, "rewards/VisualizationJSONCombinedORM/mean": 0.780789852142334, "rewards/VisualizationJSONCombinedORM/std": 0.08956985920667648, "step": 4672, "train_speed(iter/s)": 0.116476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 303.4375, "completions/min_length": 222.0, "epoch": 3.8651778329197684, "grad_norm": 0.25077104568481445, "kl": 0.0762939453125, "learning_rate": 1.4890519867254271e-06, "loss": 0.0007619671523571014, "memory(GiB)": 38.1, "reward": 0.4300917387008667, "reward_std": 0.05513101816177368, "rewards/VisualizationJSONCombinedORM/mean": 0.4300917387008667, "rewards/VisualizationJSONCombinedORM/std": 0.06491906195878983, "step": 4673, "train_speed(iter/s)": 0.116436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 308.3125, "completions/min_length": 233.0, "epoch": 3.866004962779156, "grad_norm": 0.1809486448764801, "kl": 0.08197021484375, "learning_rate": 1.486996705977311e-06, "loss": 0.0008180756121873856, "memory(GiB)": 38.1, "reward": 0.5794776678085327, "reward_std": 0.048942215740680695, "rewards/VisualizationJSONCombinedORM/mean": 0.5794776678085327, "rewards/VisualizationJSONCombinedORM/std": 0.06138504669070244, "step": 4674, "train_speed(iter/s)": 0.116393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 291.75, "completions/min_length": 218.0, "epoch": 3.8668320926385444, "grad_norm": 0.20673228800296783, "kl": 0.10546875, "learning_rate": 1.4849425968322384e-06, "loss": 0.0010541733354330063, "memory(GiB)": 38.1, "reward": 0.38876575231552124, "reward_std": 0.05021907389163971, "rewards/VisualizationJSONCombinedORM/mean": 0.38876575231552124, "rewards/VisualizationJSONCombinedORM/std": 0.1775607466697693, "step": 4675, "train_speed(iter/s)": 0.116339 }, { "epoch": 3.8668320926385444, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 375.4583333333333, "eval_completions/mean_length": 315.140625, "eval_completions/min_length": 264.7083333333333, "eval_kl": 0.08022562662760417, "eval_loss": 0.0008078441023826599, "eval_reward": 0.46542851502696675, "eval_reward_std": 0.056168072197275855, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46542851502696675, "eval_rewards/VisualizationJSONCombinedORM/std": 0.056168072391301394, "eval_runtime": 317.4771, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 4675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 321.5, "completions/min_length": 236.0, "epoch": 3.8676592224979323, "grad_norm": 0.17465047538280487, "kl": 0.1162109375, "learning_rate": 1.4828896599752645e-06, "loss": 0.0011593326926231384, "memory(GiB)": 38.1, "reward": 0.40459147095680237, "reward_std": 0.037586018443107605, "rewards/VisualizationJSONCombinedORM/mean": 0.40459147095680237, "rewards/VisualizationJSONCombinedORM/std": 0.23367133736610413, "step": 4676, "train_speed(iter/s)": 0.115374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 319.1875, "completions/min_length": 255.0, "epoch": 3.86848635235732, "grad_norm": 0.2075226604938507, "kl": 0.0738525390625, "learning_rate": 1.4808378960910502e-06, "loss": 0.0007381550967693329, "memory(GiB)": 38.1, "reward": 0.5134625434875488, "reward_std": 0.06236434727907181, "rewards/VisualizationJSONCombinedORM/mean": 0.5134625434875488, "rewards/VisualizationJSONCombinedORM/std": 0.08483196794986725, "step": 4677, "train_speed(iter/s)": 0.115327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 279.75, "completions/min_length": 234.0, "epoch": 3.869313482216708, "grad_norm": 0.29963502287864685, "kl": 0.281005859375, "learning_rate": 1.478787305863873e-06, "loss": 0.002809181809425354, "memory(GiB)": 38.1, "reward": 0.32943713665008545, "reward_std": 0.026283208280801773, "rewards/VisualizationJSONCombinedORM/mean": 0.32943713665008545, "rewards/VisualizationJSONCombinedORM/std": 0.026785770431160927, "step": 4678, "train_speed(iter/s)": 0.11527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 268.3125, "completions/min_length": 220.0, "epoch": 3.8701406120760957, "grad_norm": 0.2125045210123062, "kl": 0.06585693359375, "learning_rate": 1.4767378899776108e-06, "loss": 0.0006578788161277771, "memory(GiB)": 38.1, "reward": 0.49612775444984436, "reward_std": 0.06392642855644226, "rewards/VisualizationJSONCombinedORM/mean": 0.49612775444984436, "rewards/VisualizationJSONCombinedORM/std": 0.16067460179328918, "step": 4679, "train_speed(iter/s)": 0.115241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 271.8125, "completions/min_length": 223.0, "epoch": 3.870967741935484, "grad_norm": 0.19156166911125183, "kl": 0.11083984375, "learning_rate": 1.4746896491157541e-06, "loss": 0.0011120028793811798, "memory(GiB)": 38.1, "reward": 0.46156609058380127, "reward_std": 0.08034692704677582, "rewards/VisualizationJSONCombinedORM/mean": 0.46156609058380127, "rewards/VisualizationJSONCombinedORM/std": 0.0814899429678917, "step": 4680, "train_speed(iter/s)": 0.115189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 326.0, "completions/min_length": 239.0, "epoch": 3.871794871794872, "grad_norm": 0.21208788454532623, "kl": 0.0572509765625, "learning_rate": 1.4726425839614022e-06, "loss": 0.0005726665258407593, "memory(GiB)": 38.1, "reward": 0.4782596528530121, "reward_std": 0.030099231749773026, "rewards/VisualizationJSONCombinedORM/mean": 0.4782596528530121, "rewards/VisualizationJSONCombinedORM/std": 0.22752857208251953, "step": 4681, "train_speed(iter/s)": 0.115139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 305.8125, "completions/min_length": 230.0, "epoch": 3.8726220016542596, "grad_norm": 0.27096062898635864, "kl": 0.08740234375, "learning_rate": 1.4705966951972612e-06, "loss": 0.0008751526474952698, "memory(GiB)": 38.1, "reward": 0.5706694722175598, "reward_std": 0.06524638086557388, "rewards/VisualizationJSONCombinedORM/mean": 0.5706694722175598, "rewards/VisualizationJSONCombinedORM/std": 0.18336248397827148, "step": 4682, "train_speed(iter/s)": 0.115081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 317.5625, "completions/min_length": 243.0, "epoch": 3.873449131513648, "grad_norm": 0.25616511702537537, "kl": 0.05450439453125, "learning_rate": 1.4685519835056416e-06, "loss": 0.0005440264940261841, "memory(GiB)": 38.1, "reward": 0.595221996307373, "reward_std": 0.07995209842920303, "rewards/VisualizationJSONCombinedORM/mean": 0.595221996307373, "rewards/VisualizationJSONCombinedORM/std": 0.08393053710460663, "step": 4683, "train_speed(iter/s)": 0.115031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 323.5, "completions/min_length": 265.0, "epoch": 3.8742762613730357, "grad_norm": 0.22676649689674377, "kl": 0.03936767578125, "learning_rate": 1.4665084495684662e-06, "loss": 0.00039421021938323975, "memory(GiB)": 38.1, "reward": 0.5320812463760376, "reward_std": 0.04714445769786835, "rewards/VisualizationJSONCombinedORM/mean": 0.5320812463760376, "rewards/VisualizationJSONCombinedORM/std": 0.05474868789315224, "step": 4684, "train_speed(iter/s)": 0.114982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 337.875, "completions/min_length": 278.0, "epoch": 3.8751033912324235, "grad_norm": 0.21505019068717957, "kl": 0.09197998046875, "learning_rate": 1.4644660940672628e-06, "loss": 0.0009198971092700958, "memory(GiB)": 38.1, "reward": 0.5392158031463623, "reward_std": 0.03735767677426338, "rewards/VisualizationJSONCombinedORM/mean": 0.5392158031463623, "rewards/VisualizationJSONCombinedORM/std": 0.10584810376167297, "step": 4685, "train_speed(iter/s)": 0.114923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 328.4375, "completions/min_length": 250.0, "epoch": 3.8759305210918114, "grad_norm": 0.21185235679149628, "kl": 0.09503173828125, "learning_rate": 1.4624249176831668e-06, "loss": 0.0009538084268569946, "memory(GiB)": 38.1, "reward": 0.5645634531974792, "reward_std": 0.09251783043146133, "rewards/VisualizationJSONCombinedORM/mean": 0.5645634531974792, "rewards/VisualizationJSONCombinedORM/std": 0.13025552034378052, "step": 4686, "train_speed(iter/s)": 0.114878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 301.6875, "completions/min_length": 206.0, "epoch": 3.876757650951199, "grad_norm": 0.27681437134742737, "kl": 0.05548095703125, "learning_rate": 1.4603849210969208e-06, "loss": 0.0005543753504753113, "memory(GiB)": 38.1, "reward": 0.6057318449020386, "reward_std": 0.07862067222595215, "rewards/VisualizationJSONCombinedORM/mean": 0.6057318449020386, "rewards/VisualizationJSONCombinedORM/std": 0.09562526643276215, "step": 4687, "train_speed(iter/s)": 0.11483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 309.625, "completions/min_length": 249.0, "epoch": 3.8775847808105874, "grad_norm": 0.18715274333953857, "kl": 0.0721435546875, "learning_rate": 1.4583461049888714e-06, "loss": 0.0007220283150672913, "memory(GiB)": 38.1, "reward": 0.6630650758743286, "reward_std": 0.06886938214302063, "rewards/VisualizationJSONCombinedORM/mean": 0.6630650758743286, "rewards/VisualizationJSONCombinedORM/std": 0.12152983993291855, "step": 4688, "train_speed(iter/s)": 0.114784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 293.6875, "completions/min_length": 243.0, "epoch": 3.8784119106699753, "grad_norm": 0.21762169897556305, "kl": 0.05291748046875, "learning_rate": 1.4563084700389768e-06, "loss": 0.0005296990275382996, "memory(GiB)": 38.1, "reward": 0.25027036666870117, "reward_std": 0.017905795946717262, "rewards/VisualizationJSONCombinedORM/mean": 0.25027036666870117, "rewards/VisualizationJSONCombinedORM/std": 0.11497340351343155, "step": 4689, "train_speed(iter/s)": 0.114742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 295.8125, "completions/min_length": 242.0, "epoch": 3.879239040529363, "grad_norm": 0.2045716494321823, "kl": 0.03875732421875, "learning_rate": 1.4542720169267933e-06, "loss": 0.0003872215747833252, "memory(GiB)": 38.1, "reward": 0.5011919140815735, "reward_std": 0.03673557937145233, "rewards/VisualizationJSONCombinedORM/mean": 0.5011919140815735, "rewards/VisualizationJSONCombinedORM/std": 0.1740761548280716, "step": 4690, "train_speed(iter/s)": 0.114696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 353.8125, "completions/min_length": 283.0, "epoch": 3.880066170388751, "grad_norm": 0.23200972378253937, "kl": 0.057373046875, "learning_rate": 1.4522367463314897e-06, "loss": 0.0005734506994485855, "memory(GiB)": 38.1, "reward": 0.29371756315231323, "reward_std": 0.028060534968972206, "rewards/VisualizationJSONCombinedORM/mean": 0.29371756315231323, "rewards/VisualizationJSONCombinedORM/std": 0.08276641368865967, "step": 4691, "train_speed(iter/s)": 0.114641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 334.5, "completions/min_length": 259.0, "epoch": 3.8808933002481387, "grad_norm": 0.18097873032093048, "kl": 0.05853271484375, "learning_rate": 1.450202658931838e-06, "loss": 0.0005849748849868774, "memory(GiB)": 38.1, "reward": 0.6125720143318176, "reward_std": 0.10218814015388489, "rewards/VisualizationJSONCombinedORM/mean": 0.6125720143318176, "rewards/VisualizationJSONCombinedORM/std": 0.1146358922123909, "step": 4692, "train_speed(iter/s)": 0.114579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 302.1875, "completions/min_length": 215.0, "epoch": 3.881720430107527, "grad_norm": 0.24537555873394012, "kl": 0.05340576171875, "learning_rate": 1.448169755406218e-06, "loss": 0.0005335323512554169, "memory(GiB)": 38.1, "reward": 0.3092753291130066, "reward_std": 0.04075103625655174, "rewards/VisualizationJSONCombinedORM/mean": 0.3092753291130066, "rewards/VisualizationJSONCombinedORM/std": 0.08253160119056702, "step": 4693, "train_speed(iter/s)": 0.114534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 346.5625, "completions/min_length": 298.0, "epoch": 3.882547559966915, "grad_norm": 0.17781828343868256, "kl": 0.06884765625, "learning_rate": 1.4461380364326072e-06, "loss": 0.0006881244480609894, "memory(GiB)": 38.1, "reward": 0.405320942401886, "reward_std": 0.0401848703622818, "rewards/VisualizationJSONCombinedORM/mean": 0.405320942401886, "rewards/VisualizationJSONCombinedORM/std": 0.13093777000904083, "step": 4694, "train_speed(iter/s)": 0.114486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 311.5625, "completions/min_length": 242.0, "epoch": 3.8833746898263026, "grad_norm": 0.18715085089206696, "kl": 0.055908203125, "learning_rate": 1.4441075026885999e-06, "loss": 0.0005616173148155212, "memory(GiB)": 38.1, "reward": 0.7150284051895142, "reward_std": 0.034837715327739716, "rewards/VisualizationJSONCombinedORM/mean": 0.7150284051895142, "rewards/VisualizationJSONCombinedORM/std": 0.06456857919692993, "step": 4695, "train_speed(iter/s)": 0.114425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 312.5625, "completions/min_length": 257.0, "epoch": 3.884201819685691, "grad_norm": 0.16277329623699188, "kl": 0.04205322265625, "learning_rate": 1.442078154851384e-06, "loss": 0.00042079389095306396, "memory(GiB)": 38.1, "reward": 0.3288124203681946, "reward_std": 0.0270367581397295, "rewards/VisualizationJSONCombinedORM/mean": 0.3288124203681946, "rewards/VisualizationJSONCombinedORM/std": 0.1278962641954422, "step": 4696, "train_speed(iter/s)": 0.114377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 313.9375, "completions/min_length": 237.0, "epoch": 3.8850289495450787, "grad_norm": 0.24697910249233246, "kl": 0.079833984375, "learning_rate": 1.440049993597758e-06, "loss": 0.0008004214614629745, "memory(GiB)": 38.1, "reward": 0.5719873309135437, "reward_std": 0.040296632796525955, "rewards/VisualizationJSONCombinedORM/mean": 0.5719873309135437, "rewards/VisualizationJSONCombinedORM/std": 0.1419687569141388, "step": 4697, "train_speed(iter/s)": 0.114323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 294.0, "completions/min_length": 241.0, "epoch": 3.8858560794044665, "grad_norm": 0.1634363979101181, "kl": 0.11962890625, "learning_rate": 1.4380230196041234e-06, "loss": 0.001194886863231659, "memory(GiB)": 38.1, "reward": 0.5816417932510376, "reward_std": 0.08462849259376526, "rewards/VisualizationJSONCombinedORM/mean": 0.5816417932510376, "rewards/VisualizationJSONCombinedORM/std": 0.15588046610355377, "step": 4698, "train_speed(iter/s)": 0.114274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 323.75, "completions/min_length": 254.0, "epoch": 3.8866832092638544, "grad_norm": 0.20798584818840027, "kl": 0.2108154296875, "learning_rate": 1.435997233546486e-06, "loss": 0.0021092481911182404, "memory(GiB)": 38.1, "reward": 0.423412561416626, "reward_std": 0.07497868686914444, "rewards/VisualizationJSONCombinedORM/mean": 0.423412561416626, "rewards/VisualizationJSONCombinedORM/std": 0.16903148591518402, "step": 4699, "train_speed(iter/s)": 0.114227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 285.25, "completions/min_length": 224.0, "epoch": 3.887510339123242, "grad_norm": 0.20483162999153137, "kl": 0.056884765625, "learning_rate": 1.433972636100452e-06, "loss": 0.0005695372819900513, "memory(GiB)": 38.1, "reward": 0.652987003326416, "reward_std": 0.08086314052343369, "rewards/VisualizationJSONCombinedORM/mean": 0.652987003326416, "rewards/VisualizationJSONCombinedORM/std": 0.08711539953947067, "step": 4700, "train_speed(iter/s)": 0.114182 }, { "epoch": 3.887510339123242, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 375.7083333333333, "eval_completions/mean_length": 311.0260416666667, "eval_completions/min_length": 260.2083333333333, "eval_kl": 0.09250895182291667, "eval_loss": 0.0009266684646718204, "eval_reward": 0.4618430243184169, "eval_reward_std": 0.06570273106141637, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4618430243184169, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0657027317987134, "eval_runtime": 317.1738, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 4700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 324.6875, "completions/min_length": 241.0, "epoch": 3.8883374689826304, "grad_norm": 0.16783496737480164, "kl": 0.0980224609375, "learning_rate": 1.4319492279412388e-06, "loss": 0.0009778663516044617, "memory(GiB)": 38.1, "reward": 0.6348111033439636, "reward_std": 0.04778147116303444, "rewards/VisualizationJSONCombinedORM/mean": 0.6348111033439636, "rewards/VisualizationJSONCombinedORM/std": 0.07552061975002289, "step": 4701, "train_speed(iter/s)": 0.113265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 323.3125, "completions/min_length": 263.0, "epoch": 3.8891645988420183, "grad_norm": 0.5883932709693909, "kl": 0.70654296875, "learning_rate": 1.429927009743659e-06, "loss": 0.0070739127695560455, "memory(GiB)": 38.1, "reward": 0.49772191047668457, "reward_std": 0.0182932261377573, "rewards/VisualizationJSONCombinedORM/mean": 0.49772191047668457, "rewards/VisualizationJSONCombinedORM/std": 0.2824459969997406, "step": 4702, "train_speed(iter/s)": 0.113218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 317.9375, "completions/min_length": 235.0, "epoch": 3.889991728701406, "grad_norm": 0.2394610047340393, "kl": 0.103271484375, "learning_rate": 1.4279059821821329e-06, "loss": 0.0010317638516426086, "memory(GiB)": 38.1, "reward": 0.40694957971572876, "reward_std": 0.038317032158374786, "rewards/VisualizationJSONCombinedORM/mean": 0.40694957971572876, "rewards/VisualizationJSONCombinedORM/std": 0.10218210518360138, "step": 4703, "train_speed(iter/s)": 0.113176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 308.0625, "completions/min_length": 250.0, "epoch": 3.890818858560794, "grad_norm": 0.14880770444869995, "kl": 0.147705078125, "learning_rate": 1.4258861459306821e-06, "loss": 0.0014731506817042828, "memory(GiB)": 38.1, "reward": 0.4941599369049072, "reward_std": 0.03618931770324707, "rewards/VisualizationJSONCombinedORM/mean": 0.4941599369049072, "rewards/VisualizationJSONCombinedORM/std": 0.06032435968518257, "step": 4704, "train_speed(iter/s)": 0.11313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 299.4375, "completions/min_length": 250.0, "epoch": 3.8916459884201817, "grad_norm": 0.2550455331802368, "kl": 0.103271484375, "learning_rate": 1.423867501662934e-06, "loss": 0.0010306742042303085, "memory(GiB)": 38.1, "reward": 0.33226844668388367, "reward_std": 0.04665728285908699, "rewards/VisualizationJSONCombinedORM/mean": 0.33226844668388367, "rewards/VisualizationJSONCombinedORM/std": 0.0892125815153122, "step": 4705, "train_speed(iter/s)": 0.113077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 316.625, "completions/min_length": 257.0, "epoch": 3.89247311827957, "grad_norm": 0.18306489288806915, "kl": 0.1431884765625, "learning_rate": 1.4218500500521122e-06, "loss": 0.0014290250837802887, "memory(GiB)": 38.1, "reward": 0.36784881353378296, "reward_std": 0.0209110826253891, "rewards/VisualizationJSONCombinedORM/mean": 0.36784881353378296, "rewards/VisualizationJSONCombinedORM/std": 0.10728614032268524, "step": 4706, "train_speed(iter/s)": 0.113035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 296.1875, "completions/min_length": 225.0, "epoch": 3.893300248138958, "grad_norm": 0.2234833836555481, "kl": 0.0731201171875, "learning_rate": 1.4198337917710475e-06, "loss": 0.0007288046181201935, "memory(GiB)": 38.1, "reward": 0.5785616636276245, "reward_std": 0.053310930728912354, "rewards/VisualizationJSONCombinedORM/mean": 0.5785616636276245, "rewards/VisualizationJSONCombinedORM/std": 0.24509687721729279, "step": 4707, "train_speed(iter/s)": 0.112994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 320.6875, "completions/min_length": 257.0, "epoch": 3.8941273779983456, "grad_norm": 0.23454871773719788, "kl": 0.07147216796875, "learning_rate": 1.4178187274921724e-06, "loss": 0.0007145106792449951, "memory(GiB)": 38.1, "reward": 0.41450294852256775, "reward_std": 0.03888346999883652, "rewards/VisualizationJSONCombinedORM/mean": 0.41450294852256775, "rewards/VisualizationJSONCombinedORM/std": 0.03827695921063423, "step": 4708, "train_speed(iter/s)": 0.112937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 300.6875, "completions/min_length": 200.0, "epoch": 3.894954507857734, "grad_norm": 0.19411753118038177, "kl": 0.0634765625, "learning_rate": 1.4158048578875211e-06, "loss": 0.0006359964609146118, "memory(GiB)": 38.1, "reward": 0.4794695973396301, "reward_std": 0.06403513252735138, "rewards/VisualizationJSONCombinedORM/mean": 0.4794695973396301, "rewards/VisualizationJSONCombinedORM/std": 0.0796857625246048, "step": 4709, "train_speed(iter/s)": 0.112894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 309.75, "completions/min_length": 236.0, "epoch": 3.8957816377171217, "grad_norm": 0.1874319612979889, "kl": 0.03692626953125, "learning_rate": 1.4137921836287238e-06, "loss": 0.0003691576421260834, "memory(GiB)": 38.1, "reward": 0.6750446557998657, "reward_std": 0.06363072246313095, "rewards/VisualizationJSONCombinedORM/mean": 0.6750446557998657, "rewards/VisualizationJSONCombinedORM/std": 0.0646478533744812, "step": 4710, "train_speed(iter/s)": 0.112844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 306.8125, "completions/min_length": 248.0, "epoch": 3.8966087675765095, "grad_norm": 0.20907847583293915, "kl": 0.0556640625, "learning_rate": 1.4117807053870235e-06, "loss": 0.0005568098276853561, "memory(GiB)": 38.1, "reward": 0.5155345797538757, "reward_std": 0.039929091930389404, "rewards/VisualizationJSONCombinedORM/mean": 0.5155345797538757, "rewards/VisualizationJSONCombinedORM/std": 0.20978860557079315, "step": 4711, "train_speed(iter/s)": 0.112796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 312.125, "completions/min_length": 242.0, "epoch": 3.8974358974358974, "grad_norm": 0.20625948905944824, "kl": 0.0596923828125, "learning_rate": 1.4097704238332532e-06, "loss": 0.0005970001220703125, "memory(GiB)": 38.1, "reward": 0.6561503410339355, "reward_std": 0.08159630745649338, "rewards/VisualizationJSONCombinedORM/mean": 0.6561503410339355, "rewards/VisualizationJSONCombinedORM/std": 0.16561785340309143, "step": 4712, "train_speed(iter/s)": 0.112761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 324.125, "completions/min_length": 238.0, "epoch": 3.898263027295285, "grad_norm": 0.18386603891849518, "kl": 0.0460205078125, "learning_rate": 1.407761339637852e-06, "loss": 0.00045990198850631714, "memory(GiB)": 38.1, "reward": 0.4927438795566559, "reward_std": 0.05048191547393799, "rewards/VisualizationJSONCombinedORM/mean": 0.4927438795566559, "rewards/VisualizationJSONCombinedORM/std": 0.08582312613725662, "step": 4713, "train_speed(iter/s)": 0.112712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 322.1875, "completions/min_length": 264.0, "epoch": 3.8990901571546734, "grad_norm": 0.1970500648021698, "kl": 0.062744140625, "learning_rate": 1.4057534534708588e-06, "loss": 0.0006272569298744202, "memory(GiB)": 38.1, "reward": 0.6087684035301208, "reward_std": 0.03203611075878143, "rewards/VisualizationJSONCombinedORM/mean": 0.6087684035301208, "rewards/VisualizationJSONCombinedORM/std": 0.16141177713871002, "step": 4714, "train_speed(iter/s)": 0.112667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 311.0625, "completions/min_length": 225.0, "epoch": 3.8999172870140613, "grad_norm": 0.20645172894001007, "kl": 0.2197265625, "learning_rate": 1.4037467660019156e-06, "loss": 0.0021986160427331924, "memory(GiB)": 38.1, "reward": 0.4334757924079895, "reward_std": 0.03962232917547226, "rewards/VisualizationJSONCombinedORM/mean": 0.4334757924079895, "rewards/VisualizationJSONCombinedORM/std": 0.10170629620552063, "step": 4715, "train_speed(iter/s)": 0.112627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 317.0625, "completions/min_length": 244.0, "epoch": 3.900744416873449, "grad_norm": 0.23039793968200684, "kl": 0.08197021484375, "learning_rate": 1.4017412779002565e-06, "loss": 0.0008192919194698334, "memory(GiB)": 38.1, "reward": 0.3797783851623535, "reward_std": 0.05832210183143616, "rewards/VisualizationJSONCombinedORM/mean": 0.3797783851623535, "rewards/VisualizationJSONCombinedORM/std": 0.07157202064990997, "step": 4716, "train_speed(iter/s)": 0.112583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 327.625, "completions/min_length": 261.0, "epoch": 3.901571546732837, "grad_norm": 0.18154127895832062, "kl": 0.06317138671875, "learning_rate": 1.399736989834728e-06, "loss": 0.0006311703473329544, "memory(GiB)": 38.1, "reward": 0.3335292339324951, "reward_std": 0.018606994301080704, "rewards/VisualizationJSONCombinedORM/mean": 0.3335292339324951, "rewards/VisualizationJSONCombinedORM/std": 0.06123090535402298, "step": 4717, "train_speed(iter/s)": 0.112523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 289.25, "completions/min_length": 234.0, "epoch": 3.9023986765922247, "grad_norm": 0.17289331555366516, "kl": 0.05950927734375, "learning_rate": 1.397733902473764e-06, "loss": 0.0005972236394882202, "memory(GiB)": 38.1, "reward": 0.6180217266082764, "reward_std": 0.07615569978952408, "rewards/VisualizationJSONCombinedORM/mean": 0.6180217266082764, "rewards/VisualizationJSONCombinedORM/std": 0.08280839771032333, "step": 4718, "train_speed(iter/s)": 0.112486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 329.6875, "completions/min_length": 276.0, "epoch": 3.903225806451613, "grad_norm": 0.17567817866802216, "kl": 0.06756591796875, "learning_rate": 1.395732016485406e-06, "loss": 0.0006768032908439636, "memory(GiB)": 38.1, "reward": 0.5539503693580627, "reward_std": 0.024333883076906204, "rewards/VisualizationJSONCombinedORM/mean": 0.5539503693580627, "rewards/VisualizationJSONCombinedORM/std": 0.09095453470945358, "step": 4719, "train_speed(iter/s)": 0.112439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 304.875, "completions/min_length": 235.0, "epoch": 3.904052936311001, "grad_norm": 0.1738273650407791, "kl": 0.06939697265625, "learning_rate": 1.3937313325372919e-06, "loss": 0.00069388747215271, "memory(GiB)": 38.1, "reward": 0.6729884743690491, "reward_std": 0.0440814308822155, "rewards/VisualizationJSONCombinedORM/mean": 0.6729884743690491, "rewards/VisualizationJSONCombinedORM/std": 0.08671664446592331, "step": 4720, "train_speed(iter/s)": 0.112389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 303.8125, "completions/min_length": 240.0, "epoch": 3.9048800661703886, "grad_norm": 0.18875280022621155, "kl": 0.0836181640625, "learning_rate": 1.391731851296661e-06, "loss": 0.0008389651775360107, "memory(GiB)": 38.1, "reward": 0.42694780230522156, "reward_std": 0.04230229929089546, "rewards/VisualizationJSONCombinedORM/mean": 0.42694780230522156, "rewards/VisualizationJSONCombinedORM/std": 0.15846146643161774, "step": 4721, "train_speed(iter/s)": 0.112336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 380.5, "completions/min_length": 293.0, "epoch": 3.905707196029777, "grad_norm": 0.18817287683486938, "kl": 0.05096435546875, "learning_rate": 1.3897335734303458e-06, "loss": 0.0005098581314086914, "memory(GiB)": 38.1, "reward": 0.6701265573501587, "reward_std": 0.06968284398317337, "rewards/VisualizationJSONCombinedORM/mean": 0.6701265573501587, "rewards/VisualizationJSONCombinedORM/std": 0.07880318909883499, "step": 4722, "train_speed(iter/s)": 0.112272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 325.1875, "completions/min_length": 255.0, "epoch": 3.9065343258891647, "grad_norm": 0.18962162733078003, "kl": 0.076416015625, "learning_rate": 1.3877364996047837e-06, "loss": 0.0007641799747943878, "memory(GiB)": 38.1, "reward": 0.5738131999969482, "reward_std": 0.04880315437912941, "rewards/VisualizationJSONCombinedORM/mean": 0.5738131999969482, "rewards/VisualizationJSONCombinedORM/std": 0.1811881959438324, "step": 4723, "train_speed(iter/s)": 0.112228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 333.875, "completions/min_length": 237.0, "epoch": 3.9073614557485525, "grad_norm": 0.17231231927871704, "kl": 0.0628662109375, "learning_rate": 1.3857406304860084e-06, "loss": 0.0006280522793531418, "memory(GiB)": 38.1, "reward": 0.36272165179252625, "reward_std": 0.028095928952097893, "rewards/VisualizationJSONCombinedORM/mean": 0.36272165179252625, "rewards/VisualizationJSONCombinedORM/std": 0.09054575860500336, "step": 4724, "train_speed(iter/s)": 0.112178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 321.0, "completions/min_length": 244.0, "epoch": 3.9081885856079404, "grad_norm": 0.21546553075313568, "kl": 0.03912353515625, "learning_rate": 1.383745966739652e-06, "loss": 0.00039045512676239014, "memory(GiB)": 38.1, "reward": 0.42562055587768555, "reward_std": 0.03427576646208763, "rewards/VisualizationJSONCombinedORM/mean": 0.42562055587768555, "rewards/VisualizationJSONCombinedORM/std": 0.15709395706653595, "step": 4725, "train_speed(iter/s)": 0.112119 }, { "epoch": 3.9081885856079404, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 365.25, "eval_completions/mean_length": 306.515625, "eval_completions/min_length": 258.125, "eval_kl": 0.087677001953125, "eval_loss": 0.0008789698476903141, "eval_reward": 0.45774002621571225, "eval_reward_std": 0.058412726308840014, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45774002621571225, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05841272685211152, "eval_runtime": 311.8758, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 4725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 315.875, "completions/min_length": 236.0, "epoch": 3.909015715467328, "grad_norm": 0.16557735204696655, "kl": 0.076416015625, "learning_rate": 1.3817525090309403e-06, "loss": 0.0007643736898899078, "memory(GiB)": 38.1, "reward": 0.528624415397644, "reward_std": 0.034906964749097824, "rewards/VisualizationJSONCombinedORM/mean": 0.528624415397644, "rewards/VisualizationJSONCombinedORM/std": 0.1388409286737442, "step": 4726, "train_speed(iter/s)": 0.111236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 317.375, "completions/min_length": 246.0, "epoch": 3.9098428453267164, "grad_norm": 0.2256612628698349, "kl": 0.04071044921875, "learning_rate": 1.3797602580247066e-06, "loss": 0.00040646773413755, "memory(GiB)": 38.1, "reward": 0.5096665620803833, "reward_std": 0.022337492555379868, "rewards/VisualizationJSONCombinedORM/mean": 0.5096665620803833, "rewards/VisualizationJSONCombinedORM/std": 0.12837834656238556, "step": 4727, "train_speed(iter/s)": 0.111189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 289.375, "completions/min_length": 230.0, "epoch": 3.9106699751861043, "grad_norm": 0.16516579687595367, "kl": 0.04058837890625, "learning_rate": 1.3777692143853721e-06, "loss": 0.00040556490421295166, "memory(GiB)": 38.1, "reward": 0.5722909569740295, "reward_std": 0.035991597920656204, "rewards/VisualizationJSONCombinedORM/mean": 0.5722909569740295, "rewards/VisualizationJSONCombinedORM/std": 0.18300387263298035, "step": 4728, "train_speed(iter/s)": 0.111144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 319.625, "completions/min_length": 249.0, "epoch": 3.911497105045492, "grad_norm": 0.19435954093933105, "kl": 0.26171875, "learning_rate": 1.3757793787769596e-06, "loss": 0.0026122182607650757, "memory(GiB)": 38.1, "reward": 0.4668924808502197, "reward_std": 0.07059556245803833, "rewards/VisualizationJSONCombinedORM/mean": 0.4668924808502197, "rewards/VisualizationJSONCombinedORM/std": 0.16850513219833374, "step": 4729, "train_speed(iter/s)": 0.111097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 282.375, "completions/min_length": 240.0, "epoch": 3.91232423490488, "grad_norm": 0.19938763976097107, "kl": 0.03558349609375, "learning_rate": 1.37379075186309e-06, "loss": 0.00035646557807922363, "memory(GiB)": 38.1, "reward": 0.5843212604522705, "reward_std": 0.030562598258256912, "rewards/VisualizationJSONCombinedORM/mean": 0.5843212604522705, "rewards/VisualizationJSONCombinedORM/std": 0.2788937985897064, "step": 4730, "train_speed(iter/s)": 0.11106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 311.8125, "completions/min_length": 249.0, "epoch": 3.9131513647642677, "grad_norm": 0.18158505856990814, "kl": 0.04962158203125, "learning_rate": 1.3718033343069797e-06, "loss": 0.0004964396357536316, "memory(GiB)": 38.1, "reward": 0.5200926065444946, "reward_std": 0.03989339619874954, "rewards/VisualizationJSONCombinedORM/mean": 0.5200926065444946, "rewards/VisualizationJSONCombinedORM/std": 0.30580535531044006, "step": 4731, "train_speed(iter/s)": 0.111017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 323.6875, "completions/min_length": 236.0, "epoch": 3.913978494623656, "grad_norm": 0.15777671337127686, "kl": 0.0455322265625, "learning_rate": 1.3698171267714377e-06, "loss": 0.0004553627222776413, "memory(GiB)": 38.1, "reward": 0.5846035480499268, "reward_std": 0.06665076315402985, "rewards/VisualizationJSONCombinedORM/mean": 0.5846035480499268, "rewards/VisualizationJSONCombinedORM/std": 0.23832589387893677, "step": 4732, "train_speed(iter/s)": 0.110969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 309.0, "completions/min_length": 246.0, "epoch": 3.914805624483044, "grad_norm": 0.19752290844917297, "kl": 0.02972412109375, "learning_rate": 1.3678321299188802e-06, "loss": 0.0002970658242702484, "memory(GiB)": 38.1, "reward": 0.5376240015029907, "reward_std": 0.04745052009820938, "rewards/VisualizationJSONCombinedORM/mean": 0.5376240015029907, "rewards/VisualizationJSONCombinedORM/std": 0.08878808468580246, "step": 4733, "train_speed(iter/s)": 0.11092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 308.75, "completions/min_length": 242.0, "epoch": 3.9156327543424316, "grad_norm": 0.19259314239025116, "kl": 0.067138671875, "learning_rate": 1.3658483444113075e-06, "loss": 0.0006712609902024269, "memory(GiB)": 38.1, "reward": 0.43279334902763367, "reward_std": 0.030316319316625595, "rewards/VisualizationJSONCombinedORM/mean": 0.43279334902763367, "rewards/VisualizationJSONCombinedORM/std": 0.2554652988910675, "step": 4734, "train_speed(iter/s)": 0.110878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 311.875, "completions/min_length": 246.0, "epoch": 3.91645988420182, "grad_norm": 0.20651808381080627, "kl": 0.064208984375, "learning_rate": 1.3638657709103238e-06, "loss": 0.0006430386565625668, "memory(GiB)": 38.1, "reward": 0.4880141317844391, "reward_std": 0.06322119385004044, "rewards/VisualizationJSONCombinedORM/mean": 0.4880141317844391, "rewards/VisualizationJSONCombinedORM/std": 0.21859996020793915, "step": 4735, "train_speed(iter/s)": 0.110825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 283.75, "completions/min_length": 215.0, "epoch": 3.9172870140612077, "grad_norm": 0.2148381620645523, "kl": 0.09521484375, "learning_rate": 1.3618844100771256e-06, "loss": 0.0009520873427391052, "memory(GiB)": 38.1, "reward": 0.5488895177841187, "reward_std": 0.07007285952568054, "rewards/VisualizationJSONCombinedORM/mean": 0.5488895177841187, "rewards/VisualizationJSONCombinedORM/std": 0.14535455405712128, "step": 4736, "train_speed(iter/s)": 0.110782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 312.9375, "completions/min_length": 221.0, "epoch": 3.9181141439205955, "grad_norm": 0.18846039474010468, "kl": 0.05609130859375, "learning_rate": 1.3599042625725084e-06, "loss": 0.0005604270845651627, "memory(GiB)": 38.1, "reward": 0.5212663412094116, "reward_std": 0.06346150487661362, "rewards/VisualizationJSONCombinedORM/mean": 0.5212663412094116, "rewards/VisualizationJSONCombinedORM/std": 0.13794930279254913, "step": 4737, "train_speed(iter/s)": 0.110723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 307.8125, "completions/min_length": 259.0, "epoch": 3.9189412737799834, "grad_norm": 0.17241954803466797, "kl": 0.05712890625, "learning_rate": 1.3579253290568573e-06, "loss": 0.0005706734955310822, "memory(GiB)": 38.1, "reward": 0.7395756244659424, "reward_std": 0.05753645300865173, "rewards/VisualizationJSONCombinedORM/mean": 0.7395756244659424, "rewards/VisualizationJSONCombinedORM/std": 0.18772628903388977, "step": 4738, "train_speed(iter/s)": 0.110677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 300.375, "completions/min_length": 252.0, "epoch": 3.919768403639371, "grad_norm": 0.1938195377588272, "kl": 0.106689453125, "learning_rate": 1.3559476101901575e-06, "loss": 0.0010687559843063354, "memory(GiB)": 38.1, "reward": 0.4113818407058716, "reward_std": 0.05061725527048111, "rewards/VisualizationJSONCombinedORM/mean": 0.4113818407058716, "rewards/VisualizationJSONCombinedORM/std": 0.1540006697177887, "step": 4739, "train_speed(iter/s)": 0.110622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 346.375, "completions/min_length": 212.0, "epoch": 3.9205955334987594, "grad_norm": 0.17243748903274536, "kl": 0.05938720703125, "learning_rate": 1.3539711066319873e-06, "loss": 0.0005942583084106445, "memory(GiB)": 38.1, "reward": 0.6209786534309387, "reward_std": 0.07148481905460358, "rewards/VisualizationJSONCombinedORM/mean": 0.6209786534309387, "rewards/VisualizationJSONCombinedORM/std": 0.08812278509140015, "step": 4740, "train_speed(iter/s)": 0.110578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 312.375, "completions/min_length": 228.0, "epoch": 3.9214226633581473, "grad_norm": 0.18890345096588135, "kl": 0.1544189453125, "learning_rate": 1.351995819041521e-06, "loss": 0.0015498846769332886, "memory(GiB)": 38.1, "reward": 0.5615710020065308, "reward_std": 0.07070603221654892, "rewards/VisualizationJSONCombinedORM/mean": 0.5615710020065308, "rewards/VisualizationJSONCombinedORM/std": 0.11512897163629532, "step": 4741, "train_speed(iter/s)": 0.11054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 320.4375, "completions/min_length": 240.0, "epoch": 3.922249793217535, "grad_norm": 0.22422011196613312, "kl": 0.088134765625, "learning_rate": 1.3500217480775229e-06, "loss": 0.0008822977542877197, "memory(GiB)": 38.1, "reward": 0.5834943056106567, "reward_std": 0.06594406068325043, "rewards/VisualizationJSONCombinedORM/mean": 0.5834943056106567, "rewards/VisualizationJSONCombinedORM/std": 0.15333761274814606, "step": 4742, "train_speed(iter/s)": 0.110489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 331.375, "completions/min_length": 272.0, "epoch": 3.9230769230769234, "grad_norm": 0.17625530064105988, "kl": 0.08135986328125, "learning_rate": 1.34804889439836e-06, "loss": 0.0008128080517053604, "memory(GiB)": 38.1, "reward": 0.41639694571495056, "reward_std": 0.061980001628398895, "rewards/VisualizationJSONCombinedORM/mean": 0.41639694571495056, "rewards/VisualizationJSONCombinedORM/std": 0.08244439214468002, "step": 4743, "train_speed(iter/s)": 0.110446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 346.25, "completions/min_length": 287.0, "epoch": 3.9239040529363107, "grad_norm": 0.1738828867673874, "kl": 0.06866455078125, "learning_rate": 1.3460772586619842e-06, "loss": 0.0006878972053527832, "memory(GiB)": 38.1, "reward": 0.4173363745212555, "reward_std": 0.0312705896794796, "rewards/VisualizationJSONCombinedORM/mean": 0.4173363745212555, "rewards/VisualizationJSONCombinedORM/std": 0.1525121033191681, "step": 4744, "train_speed(iter/s)": 0.110403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 301.5625, "completions/min_length": 247.0, "epoch": 3.924731182795699, "grad_norm": 0.16822579503059387, "kl": 0.0760498046875, "learning_rate": 1.3441068415259462e-06, "loss": 0.0007613301277160645, "memory(GiB)": 38.1, "reward": 0.6594107151031494, "reward_std": 0.03960777074098587, "rewards/VisualizationJSONCombinedORM/mean": 0.6594107151031494, "rewards/VisualizationJSONCombinedORM/std": 0.06094556301832199, "step": 4745, "train_speed(iter/s)": 0.110347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 307.375, "completions/min_length": 245.0, "epoch": 3.925558312655087, "grad_norm": 0.237875297665596, "kl": 0.1826171875, "learning_rate": 1.34213764364739e-06, "loss": 0.0018318742513656616, "memory(GiB)": 38.1, "reward": 0.6551125049591064, "reward_std": 0.07375140488147736, "rewards/VisualizationJSONCombinedORM/mean": 0.6551125049591064, "rewards/VisualizationJSONCombinedORM/std": 0.11926766484975815, "step": 4746, "train_speed(iter/s)": 0.110302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 302.875, "completions/min_length": 243.0, "epoch": 3.9263854425144746, "grad_norm": 0.20081233978271484, "kl": 0.08544921875, "learning_rate": 1.3401696656830532e-06, "loss": 0.0008535198867321014, "memory(GiB)": 38.1, "reward": 0.3221404552459717, "reward_std": 0.02882649190723896, "rewards/VisualizationJSONCombinedORM/mean": 0.3221404552459717, "rewards/VisualizationJSONCombinedORM/std": 0.0756329894065857, "step": 4747, "train_speed(iter/s)": 0.110251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 301.9375, "completions/min_length": 247.0, "epoch": 3.927212572373863, "grad_norm": 0.1679694652557373, "kl": 0.099365234375, "learning_rate": 1.3382029082892617e-06, "loss": 0.0009931251406669617, "memory(GiB)": 38.1, "reward": 0.24117395281791687, "reward_std": 0.018326111137866974, "rewards/VisualizationJSONCombinedORM/mean": 0.24117395281791687, "rewards/VisualizationJSONCombinedORM/std": 0.04809826612472534, "step": 4748, "train_speed(iter/s)": 0.11021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 309.6875, "completions/min_length": 257.0, "epoch": 3.9280397022332507, "grad_norm": 0.2010691910982132, "kl": 0.08251953125, "learning_rate": 1.336237372121944e-06, "loss": 0.0008252635598182678, "memory(GiB)": 38.1, "reward": 0.7260968685150146, "reward_std": 0.045235905796289444, "rewards/VisualizationJSONCombinedORM/mean": 0.7260968685150146, "rewards/VisualizationJSONCombinedORM/std": 0.04543323814868927, "step": 4749, "train_speed(iter/s)": 0.110168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 308.3125, "completions/min_length": 254.0, "epoch": 3.9288668320926385, "grad_norm": 0.1745975762605667, "kl": 0.035614013671875, "learning_rate": 1.334273057836611e-06, "loss": 0.0003574937582015991, "memory(GiB)": 38.1, "reward": 0.7074093818664551, "reward_std": 0.038520701229572296, "rewards/VisualizationJSONCombinedORM/mean": 0.7074093818664551, "rewards/VisualizationJSONCombinedORM/std": 0.06827207654714584, "step": 4750, "train_speed(iter/s)": 0.110126 }, { "epoch": 3.9288668320926385, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 371.5416666666667, "eval_completions/mean_length": 308.0989583333333, "eval_completions/min_length": 263.125, "eval_kl": 0.09679158528645833, "eval_loss": 0.0009844700107350945, "eval_reward": 0.4571337563296159, "eval_reward_std": 0.05799439370942613, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4571337563296159, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05799439382584145, "eval_runtime": 315.3431, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 4750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 320.0625, "completions/min_length": 253.0, "epoch": 3.9296939619520264, "grad_norm": 0.2001895010471344, "kl": 0.05767822265625, "learning_rate": 1.3323099660883725e-06, "loss": 0.0005758032202720642, "memory(GiB)": 38.1, "reward": 0.4925553798675537, "reward_std": 0.05447224900126457, "rewards/VisualizationJSONCombinedORM/mean": 0.4925553798675537, "rewards/VisualizationJSONCombinedORM/std": 0.28766068816185, "step": 4751, "train_speed(iter/s)": 0.109286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 285.25, "completions/min_length": 239.0, "epoch": 3.930521091811414, "grad_norm": 0.1587388962507248, "kl": 0.0599365234375, "learning_rate": 1.330348097531929e-06, "loss": 0.0005990557256154716, "memory(GiB)": 38.1, "reward": 0.5067578554153442, "reward_std": 0.03961896896362305, "rewards/VisualizationJSONCombinedORM/mean": 0.5067578554153442, "rewards/VisualizationJSONCombinedORM/std": 0.09848369657993317, "step": 4752, "train_speed(iter/s)": 0.109245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 292.0625, "completions/min_length": 242.0, "epoch": 3.9313482216708024, "grad_norm": 0.1969853788614273, "kl": 0.199951171875, "learning_rate": 1.3283874528215735e-06, "loss": 0.001999134197831154, "memory(GiB)": 38.1, "reward": 0.5067659616470337, "reward_std": 0.0641312450170517, "rewards/VisualizationJSONCombinedORM/mean": 0.5067659616470337, "rewards/VisualizationJSONCombinedORM/std": 0.22357700765132904, "step": 4753, "train_speed(iter/s)": 0.109209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 289.6875, "completions/min_length": 221.0, "epoch": 3.9321753515301903, "grad_norm": 0.18441209197044373, "kl": 0.033447265625, "learning_rate": 1.3264280326111878e-06, "loss": 0.0003349296748638153, "memory(GiB)": 38.1, "reward": 0.5342227220535278, "reward_std": 0.06263985484838486, "rewards/VisualizationJSONCombinedORM/mean": 0.5342227220535278, "rewards/VisualizationJSONCombinedORM/std": 0.06722034513950348, "step": 4754, "train_speed(iter/s)": 0.109165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 311.1875, "completions/min_length": 260.0, "epoch": 3.933002481389578, "grad_norm": 0.18505677580833435, "kl": 0.08935546875, "learning_rate": 1.3244698375542492e-06, "loss": 0.000892292708158493, "memory(GiB)": 38.1, "reward": 0.37330362200737, "reward_std": 0.015495304018259048, "rewards/VisualizationJSONCombinedORM/mean": 0.37330362200737, "rewards/VisualizationJSONCombinedORM/std": 0.16400425136089325, "step": 4755, "train_speed(iter/s)": 0.109126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 318.0625, "completions/min_length": 267.0, "epoch": 3.9338296112489664, "grad_norm": 0.1900114119052887, "kl": 0.10009765625, "learning_rate": 1.3225128683038247e-06, "loss": 0.0010018227621912956, "memory(GiB)": 38.1, "reward": 0.5949521064758301, "reward_std": 0.06621169298887253, "rewards/VisualizationJSONCombinedORM/mean": 0.5949521064758301, "rewards/VisualizationJSONCombinedORM/std": 0.06479664891958237, "step": 4756, "train_speed(iter/s)": 0.109074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 312.8125, "completions/min_length": 261.0, "epoch": 3.934656741108354, "grad_norm": 0.17703819274902344, "kl": 0.0467529296875, "learning_rate": 1.320557125512575e-06, "loss": 0.00046778470277786255, "memory(GiB)": 38.1, "reward": 0.5271674990653992, "reward_std": 0.04745221138000488, "rewards/VisualizationJSONCombinedORM/mean": 0.5271674990653992, "rewards/VisualizationJSONCombinedORM/std": 0.12411143630743027, "step": 4757, "train_speed(iter/s)": 0.109031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 261.0625, "completions/min_length": 234.0, "epoch": 3.935483870967742, "grad_norm": 0.24461254477500916, "kl": 0.05621337890625, "learning_rate": 1.318602609832743e-06, "loss": 0.0005618780851364136, "memory(GiB)": 38.1, "reward": 0.43269455432891846, "reward_std": 0.09360489249229431, "rewards/VisualizationJSONCombinedORM/mean": 0.43269455432891846, "rewards/VisualizationJSONCombinedORM/std": 0.18440140783786774, "step": 4758, "train_speed(iter/s)": 0.109003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 261.5, "completions/min_length": 235.0, "epoch": 3.93631100082713, "grad_norm": 0.23150429129600525, "kl": 0.0650634765625, "learning_rate": 1.3166493219161774e-06, "loss": 0.0006506145000457764, "memory(GiB)": 38.1, "reward": 0.5217180252075195, "reward_std": 0.08019471168518066, "rewards/VisualizationJSONCombinedORM/mean": 0.5217180252075195, "rewards/VisualizationJSONCombinedORM/std": 0.24948491156101227, "step": 4759, "train_speed(iter/s)": 0.108965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 323.3125, "completions/min_length": 242.0, "epoch": 3.9371381306865176, "grad_norm": 0.1622840315103531, "kl": 0.0435791015625, "learning_rate": 1.3146972624143024e-06, "loss": 0.00043564289808273315, "memory(GiB)": 38.1, "reward": 0.6110013723373413, "reward_std": 0.03482156991958618, "rewards/VisualizationJSONCombinedORM/mean": 0.6110013723373413, "rewards/VisualizationJSONCombinedORM/std": 0.1918354332447052, "step": 4760, "train_speed(iter/s)": 0.108918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 311.8125, "completions/min_length": 234.0, "epoch": 3.937965260545906, "grad_norm": 0.18524262309074402, "kl": 0.03790283203125, "learning_rate": 1.3127464319781413e-06, "loss": 0.00038002803921699524, "memory(GiB)": 38.1, "reward": 0.5408288240432739, "reward_std": 0.03816744685173035, "rewards/VisualizationJSONCombinedORM/mean": 0.5408288240432739, "rewards/VisualizationJSONCombinedORM/std": 0.2729838490486145, "step": 4761, "train_speed(iter/s)": 0.10886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 317.5, "completions/min_length": 241.0, "epoch": 3.9387923904052937, "grad_norm": 0.18620721995830536, "kl": 0.05975341796875, "learning_rate": 1.3107968312583053e-06, "loss": 0.000597737729549408, "memory(GiB)": 38.1, "reward": 0.6224223375320435, "reward_std": 0.08727137744426727, "rewards/VisualizationJSONCombinedORM/mean": 0.6224223375320435, "rewards/VisualizationJSONCombinedORM/std": 0.08525309711694717, "step": 4762, "train_speed(iter/s)": 0.108806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 306.75, "completions/min_length": 219.0, "epoch": 3.9396195202646815, "grad_norm": 0.253856360912323, "kl": 0.06243896484375, "learning_rate": 1.3088484609049968e-06, "loss": 0.0006251037120819092, "memory(GiB)": 38.1, "reward": 0.3967704176902771, "reward_std": 0.03497789427638054, "rewards/VisualizationJSONCombinedORM/mean": 0.3967704176902771, "rewards/VisualizationJSONCombinedORM/std": 0.22345435619354248, "step": 4763, "train_speed(iter/s)": 0.108763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 337.9375, "completions/min_length": 286.0, "epoch": 3.9404466501240694, "grad_norm": 0.1755022257566452, "kl": 0.057373046875, "learning_rate": 1.306901321568001e-06, "loss": 0.0005742758512496948, "memory(GiB)": 38.1, "reward": 0.6821607351303101, "reward_std": 0.07845939695835114, "rewards/VisualizationJSONCombinedORM/mean": 0.6821607351303101, "rewards/VisualizationJSONCombinedORM/std": 0.08272828906774521, "step": 4764, "train_speed(iter/s)": 0.108721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 305.5625, "completions/min_length": 232.0, "epoch": 3.941273779983457, "grad_norm": 0.29708388447761536, "kl": 0.0526123046875, "learning_rate": 1.3049554138967052e-06, "loss": 0.0005260668694972992, "memory(GiB)": 38.1, "reward": 0.3394966423511505, "reward_std": 0.03492604196071625, "rewards/VisualizationJSONCombinedORM/mean": 0.3394966423511505, "rewards/VisualizationJSONCombinedORM/std": 0.1203187108039856, "step": 4765, "train_speed(iter/s)": 0.108665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 299.875, "completions/min_length": 244.0, "epoch": 3.9421009098428454, "grad_norm": 0.2139868587255478, "kl": 0.1375732421875, "learning_rate": 1.3030107385400736e-06, "loss": 0.0013790540397167206, "memory(GiB)": 38.1, "reward": 0.508152961730957, "reward_std": 0.040994904935359955, "rewards/VisualizationJSONCombinedORM/mean": 0.508152961730957, "rewards/VisualizationJSONCombinedORM/std": 0.15217122435569763, "step": 4766, "train_speed(iter/s)": 0.10863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 335.0625, "completions/min_length": 244.0, "epoch": 3.9429280397022333, "grad_norm": 0.18605594336986542, "kl": 0.10662841796875, "learning_rate": 1.3010672961466664e-06, "loss": 0.0010648220777511597, "memory(GiB)": 38.1, "reward": 0.580534040927887, "reward_std": 0.0400727316737175, "rewards/VisualizationJSONCombinedORM/mean": 0.580534040927887, "rewards/VisualizationJSONCombinedORM/std": 0.09853782504796982, "step": 4767, "train_speed(iter/s)": 0.108589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 278.3125, "completions/min_length": 228.0, "epoch": 3.943755169561621, "grad_norm": 0.23132425546646118, "kl": 0.169677734375, "learning_rate": 1.2991250873646306e-06, "loss": 0.0016926750540733337, "memory(GiB)": 38.1, "reward": 0.4893465042114258, "reward_std": 0.06674325466156006, "rewards/VisualizationJSONCombinedORM/mean": 0.4893465042114258, "rewards/VisualizationJSONCombinedORM/std": 0.09296181052923203, "step": 4768, "train_speed(iter/s)": 0.10854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 286.4375, "completions/min_length": 242.0, "epoch": 3.9445822994210094, "grad_norm": 0.21468184888362885, "kl": 0.11376953125, "learning_rate": 1.2971841128417034e-06, "loss": 0.0011375918984413147, "memory(GiB)": 38.1, "reward": 0.4612305462360382, "reward_std": 0.0973098948597908, "rewards/VisualizationJSONCombinedORM/mean": 0.4612305462360382, "rewards/VisualizationJSONCombinedORM/std": 0.22021475434303284, "step": 4769, "train_speed(iter/s)": 0.108503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 288.1875, "completions/min_length": 257.0, "epoch": 3.945409429280397, "grad_norm": 0.25422871112823486, "kl": 0.04638671875, "learning_rate": 1.2952443732252058e-06, "loss": 0.0004649851471185684, "memory(GiB)": 38.1, "reward": 0.5396365523338318, "reward_std": 0.05980158597230911, "rewards/VisualizationJSONCombinedORM/mean": 0.5396365523338318, "rewards/VisualizationJSONCombinedORM/std": 0.11645729094743729, "step": 4770, "train_speed(iter/s)": 0.10847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 318.5, "completions/min_length": 249.0, "epoch": 3.946236559139785, "grad_norm": 0.2044249176979065, "kl": 0.087646484375, "learning_rate": 1.2933058691620499e-06, "loss": 0.0008745305240154266, "memory(GiB)": 38.1, "reward": 0.5145097970962524, "reward_std": 0.06967554986476898, "rewards/VisualizationJSONCombinedORM/mean": 0.5145097970962524, "rewards/VisualizationJSONCombinedORM/std": 0.16914689540863037, "step": 4771, "train_speed(iter/s)": 0.108422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 307.875, "completions/min_length": 268.0, "epoch": 3.947063688999173, "grad_norm": 0.23172627389431, "kl": 0.0966796875, "learning_rate": 1.2913686012987402e-06, "loss": 0.0009678336791694164, "memory(GiB)": 38.1, "reward": 0.48475906252861023, "reward_std": 0.06352997571229935, "rewards/VisualizationJSONCombinedORM/mean": 0.48475906252861023, "rewards/VisualizationJSONCombinedORM/std": 0.16051502525806427, "step": 4772, "train_speed(iter/s)": 0.108383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 271.75, "completions/min_length": 228.0, "epoch": 3.9478908188585606, "grad_norm": 0.23364923894405365, "kl": 0.08758544921875, "learning_rate": 1.289432570281361e-06, "loss": 0.0008756518363952637, "memory(GiB)": 38.1, "reward": 0.39872461557388306, "reward_std": 0.04802002012729645, "rewards/VisualizationJSONCombinedORM/mean": 0.39872461557388306, "rewards/VisualizationJSONCombinedORM/std": 0.13997887074947357, "step": 4773, "train_speed(iter/s)": 0.108346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 325.6875, "completions/min_length": 266.0, "epoch": 3.948717948717949, "grad_norm": 0.23504965007305145, "kl": 0.0711669921875, "learning_rate": 1.2874977767555885e-06, "loss": 0.0007124319672584534, "memory(GiB)": 38.1, "reward": 0.654529333114624, "reward_std": 0.11248032748699188, "rewards/VisualizationJSONCombinedORM/mean": 0.654529333114624, "rewards/VisualizationJSONCombinedORM/std": 0.1471821814775467, "step": 4774, "train_speed(iter/s)": 0.108303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 298.625, "completions/min_length": 230.0, "epoch": 3.9495450785773367, "grad_norm": 0.1972358673810959, "kl": 0.0556640625, "learning_rate": 1.2855642213666858e-06, "loss": 0.0005565769970417023, "memory(GiB)": 38.1, "reward": 0.6078433990478516, "reward_std": 0.04121901094913483, "rewards/VisualizationJSONCombinedORM/mean": 0.6078433990478516, "rewards/VisualizationJSONCombinedORM/std": 0.21276985108852386, "step": 4775, "train_speed(iter/s)": 0.108266 }, { "epoch": 3.9495450785773367, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 369.1666666666667, "eval_completions/mean_length": 310.6927083333333, "eval_completions/min_length": 256.0416666666667, "eval_kl": 0.10883585611979167, "eval_loss": 0.001099959365092218, "eval_reward": 0.4587737446029981, "eval_reward_std": 0.05455417224826912, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4587737446029981, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05455417203484103, "eval_runtime": 313.3957, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 4775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 330.125, "completions/min_length": 243.0, "epoch": 3.9503722084367245, "grad_norm": 0.2056143432855606, "kl": 0.06512451171875, "learning_rate": 1.2836319047595035e-06, "loss": 0.0006511621177196503, "memory(GiB)": 38.1, "reward": 0.6759777069091797, "reward_std": 0.05330081656575203, "rewards/VisualizationJSONCombinedORM/mean": 0.6759777069091797, "rewards/VisualizationJSONCombinedORM/std": 0.18758541345596313, "step": 4776, "train_speed(iter/s)": 0.107462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 289.0625, "completions/min_length": 236.0, "epoch": 3.9511993382961124, "grad_norm": 0.20571410655975342, "kl": 0.087158203125, "learning_rate": 1.281700827578476e-06, "loss": 0.0008708890527486801, "memory(GiB)": 38.1, "reward": 0.41501203179359436, "reward_std": 0.05577489361166954, "rewards/VisualizationJSONCombinedORM/mean": 0.41501203179359436, "rewards/VisualizationJSONCombinedORM/std": 0.07913076132535934, "step": 4777, "train_speed(iter/s)": 0.10743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 295.0625, "completions/min_length": 225.0, "epoch": 3.9520264681555, "grad_norm": 0.23483948409557343, "kl": 0.1324462890625, "learning_rate": 1.279770990467628e-06, "loss": 0.0013271300122141838, "memory(GiB)": 38.1, "reward": 0.4878261089324951, "reward_std": 0.07036825269460678, "rewards/VisualizationJSONCombinedORM/mean": 0.4878261089324951, "rewards/VisualizationJSONCombinedORM/std": 0.10292622447013855, "step": 4778, "train_speed(iter/s)": 0.10739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 304.875, "completions/min_length": 237.0, "epoch": 3.9528535980148884, "grad_norm": 0.17979884147644043, "kl": 0.060546875, "learning_rate": 1.2778423940705686e-06, "loss": 0.0006063580513000488, "memory(GiB)": 38.1, "reward": 0.7222989201545715, "reward_std": 0.04089789092540741, "rewards/VisualizationJSONCombinedORM/mean": 0.7222989201545715, "rewards/VisualizationJSONCombinedORM/std": 0.05160916596651077, "step": 4779, "train_speed(iter/s)": 0.107352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 307.0625, "completions/min_length": 251.0, "epoch": 3.9536807278742763, "grad_norm": 0.19034752249717712, "kl": 0.06353759765625, "learning_rate": 1.2759150390304953e-06, "loss": 0.0006361622363328934, "memory(GiB)": 38.1, "reward": 0.44697728753089905, "reward_std": 0.03805213421583176, "rewards/VisualizationJSONCombinedORM/mean": 0.44697728753089905, "rewards/VisualizationJSONCombinedORM/std": 0.19214561581611633, "step": 4780, "train_speed(iter/s)": 0.107321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 288.6875, "completions/min_length": 236.0, "epoch": 3.954507857733664, "grad_norm": 0.18557125329971313, "kl": 0.10546875, "learning_rate": 1.2739889259901866e-06, "loss": 0.0010543763637542725, "memory(GiB)": 38.1, "reward": 0.4984559118747711, "reward_std": 0.07098761945962906, "rewards/VisualizationJSONCombinedORM/mean": 0.4984559118747711, "rewards/VisualizationJSONCombinedORM/std": 0.2129863053560257, "step": 4781, "train_speed(iter/s)": 0.107272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 298.9375, "completions/min_length": 236.0, "epoch": 3.9553349875930524, "grad_norm": 0.18448588252067566, "kl": 0.1053466796875, "learning_rate": 1.272064055592015e-06, "loss": 0.0010539405047893524, "memory(GiB)": 38.1, "reward": 0.49220770597457886, "reward_std": 0.035242702811956406, "rewards/VisualizationJSONCombinedORM/mean": 0.49220770597457886, "rewards/VisualizationJSONCombinedORM/std": 0.1483619660139084, "step": 4782, "train_speed(iter/s)": 0.107243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 329.375, "completions/min_length": 285.0, "epoch": 3.95616211745244, "grad_norm": 0.2724132835865021, "kl": 0.094482421875, "learning_rate": 1.2701404284779295e-06, "loss": 0.0009451583027839661, "memory(GiB)": 38.1, "reward": 0.6014081835746765, "reward_std": 0.0581965297460556, "rewards/VisualizationJSONCombinedORM/mean": 0.6014081835746765, "rewards/VisualizationJSONCombinedORM/std": 0.06581319868564606, "step": 4783, "train_speed(iter/s)": 0.107206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 296.125, "completions/min_length": 232.0, "epoch": 3.956989247311828, "grad_norm": 0.17995500564575195, "kl": 0.133544921875, "learning_rate": 1.2682180452894705e-06, "loss": 0.0013371743261814117, "memory(GiB)": 38.1, "reward": 0.5244784355163574, "reward_std": 0.025052227079868317, "rewards/VisualizationJSONCombinedORM/mean": 0.5244784355163574, "rewards/VisualizationJSONCombinedORM/std": 0.23566165566444397, "step": 4784, "train_speed(iter/s)": 0.107156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 314.5625, "completions/min_length": 234.0, "epoch": 3.957816377171216, "grad_norm": 0.17689882218837738, "kl": 0.1043701171875, "learning_rate": 1.266296906667762e-06, "loss": 0.0010453369468450546, "memory(GiB)": 38.1, "reward": 0.47900721430778503, "reward_std": 0.028400182723999023, "rewards/VisualizationJSONCombinedORM/mean": 0.47900721430778503, "rewards/VisualizationJSONCombinedORM/std": 0.19944900274276733, "step": 4785, "train_speed(iter/s)": 0.107114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 312.375, "completions/min_length": 239.0, "epoch": 3.9586435070306036, "grad_norm": 0.19389685988426208, "kl": 0.0667724609375, "learning_rate": 1.2643770132535139e-06, "loss": 0.0006680786609649658, "memory(GiB)": 38.1, "reward": 0.4741175174713135, "reward_std": 0.052292726933956146, "rewards/VisualizationJSONCombinedORM/mean": 0.4741175174713135, "rewards/VisualizationJSONCombinedORM/std": 0.07922124117612839, "step": 4786, "train_speed(iter/s)": 0.107076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 317.25, "completions/min_length": 267.0, "epoch": 3.959470636889992, "grad_norm": 0.2569209039211273, "kl": 0.1138916015625, "learning_rate": 1.2624583656870153e-06, "loss": 0.001139078289270401, "memory(GiB)": 38.1, "reward": 0.20348712801933289, "reward_std": 0.023415377363562584, "rewards/VisualizationJSONCombinedORM/mean": 0.20348712801933289, "rewards/VisualizationJSONCombinedORM/std": 0.031203068792819977, "step": 4787, "train_speed(iter/s)": 0.107032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 313.75, "completions/min_length": 243.0, "epoch": 3.9602977667493797, "grad_norm": 0.19968964159488678, "kl": 0.109375, "learning_rate": 1.2605409646081502e-06, "loss": 0.0010954812169075012, "memory(GiB)": 38.1, "reward": 0.6368570327758789, "reward_std": 0.07873187959194183, "rewards/VisualizationJSONCombinedORM/mean": 0.6368570327758789, "rewards/VisualizationJSONCombinedORM/std": 0.11544553190469742, "step": 4788, "train_speed(iter/s)": 0.106984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 352.0, "completions/min_length": 261.0, "epoch": 3.9611248966087675, "grad_norm": 0.2496100664138794, "kl": 0.056884765625, "learning_rate": 1.258624810656376e-06, "loss": 0.0005686953663825989, "memory(GiB)": 38.1, "reward": 0.4710072875022888, "reward_std": 0.037884872406721115, "rewards/VisualizationJSONCombinedORM/mean": 0.4710072875022888, "rewards/VisualizationJSONCombinedORM/std": 0.08707869052886963, "step": 4789, "train_speed(iter/s)": 0.106952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 302.1875, "completions/min_length": 229.0, "epoch": 3.9619520264681554, "grad_norm": 0.2192496508359909, "kl": 0.07208251953125, "learning_rate": 1.256709904470741e-06, "loss": 0.000721074640750885, "memory(GiB)": 38.1, "reward": 0.39713194966316223, "reward_std": 0.03552180528640747, "rewards/VisualizationJSONCombinedORM/mean": 0.39713194966316223, "rewards/VisualizationJSONCombinedORM/std": 0.03432004153728485, "step": 4790, "train_speed(iter/s)": 0.106908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 313.5, "completions/min_length": 245.0, "epoch": 3.962779156327543, "grad_norm": 0.2235998511314392, "kl": 0.04205322265625, "learning_rate": 1.2547962466898744e-06, "loss": 0.0004199370741844177, "memory(GiB)": 38.1, "reward": 0.45290425419807434, "reward_std": 0.049379609525203705, "rewards/VisualizationJSONCombinedORM/mean": 0.45290425419807434, "rewards/VisualizationJSONCombinedORM/std": 0.1732637733221054, "step": 4791, "train_speed(iter/s)": 0.106868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 315.8125, "completions/min_length": 260.0, "epoch": 3.9636062861869314, "grad_norm": 0.1679062843322754, "kl": 0.09918212890625, "learning_rate": 1.2528838379519924e-06, "loss": 0.000989466905593872, "memory(GiB)": 38.1, "reward": 0.4830736517906189, "reward_std": 0.047814205288887024, "rewards/VisualizationJSONCombinedORM/mean": 0.4830736517906189, "rewards/VisualizationJSONCombinedORM/std": 0.048714779317379, "step": 4792, "train_speed(iter/s)": 0.106817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 306.375, "completions/min_length": 265.0, "epoch": 3.9644334160463193, "grad_norm": 0.19829967617988586, "kl": 0.125244140625, "learning_rate": 1.2509726788948894e-06, "loss": 0.0012513212859630585, "memory(GiB)": 38.1, "reward": 0.6252990961074829, "reward_std": 0.05539349466562271, "rewards/VisualizationJSONCombinedORM/mean": 0.6252990961074829, "rewards/VisualizationJSONCombinedORM/std": 0.07706378400325775, "step": 4793, "train_speed(iter/s)": 0.106782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 302.1875, "completions/min_length": 249.0, "epoch": 3.965260545905707, "grad_norm": 0.19877690076828003, "kl": 0.06103515625, "learning_rate": 1.2490627701559471e-06, "loss": 0.00061025470495224, "memory(GiB)": 38.1, "reward": 0.4473031759262085, "reward_std": 0.056568026542663574, "rewards/VisualizationJSONCombinedORM/mean": 0.4473031759262085, "rewards/VisualizationJSONCombinedORM/std": 0.0912153348326683, "step": 4794, "train_speed(iter/s)": 0.106746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 308.1875, "completions/min_length": 231.0, "epoch": 3.9660876757650954, "grad_norm": 0.19350965321063995, "kl": 0.132080078125, "learning_rate": 1.2471541123721292e-06, "loss": 0.001324191689491272, "memory(GiB)": 38.1, "reward": 0.4213140606880188, "reward_std": 0.05617575719952583, "rewards/VisualizationJSONCombinedORM/mean": 0.4213140606880188, "rewards/VisualizationJSONCombinedORM/std": 0.09571631997823715, "step": 4795, "train_speed(iter/s)": 0.106694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 284.125, "completions/min_length": 228.0, "epoch": 3.966914805624483, "grad_norm": 0.25493124127388, "kl": 0.0748291015625, "learning_rate": 1.2452467061799828e-06, "loss": 0.000749148428440094, "memory(GiB)": 38.1, "reward": 0.43242165446281433, "reward_std": 0.07546097785234451, "rewards/VisualizationJSONCombinedORM/mean": 0.43242165446281433, "rewards/VisualizationJSONCombinedORM/std": 0.08600480109453201, "step": 4796, "train_speed(iter/s)": 0.106644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 328.125, "completions/min_length": 274.0, "epoch": 3.967741935483871, "grad_norm": 0.20844857394695282, "kl": 0.1168212890625, "learning_rate": 1.2433405522156334e-06, "loss": 0.0011692866683006287, "memory(GiB)": 38.1, "reward": 0.29538625478744507, "reward_std": 0.031410228461027145, "rewards/VisualizationJSONCombinedORM/mean": 0.29538625478744507, "rewards/VisualizationJSONCombinedORM/std": 0.12965892255306244, "step": 4797, "train_speed(iter/s)": 0.106605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 320.875, "completions/min_length": 251.0, "epoch": 3.968569065343259, "grad_norm": 0.18986399471759796, "kl": 0.03656005859375, "learning_rate": 1.2414356511147968e-06, "loss": 0.0003659836947917938, "memory(GiB)": 38.1, "reward": 0.7528883814811707, "reward_std": 0.0643964409828186, "rewards/VisualizationJSONCombinedORM/mean": 0.7528883814811707, "rewards/VisualizationJSONCombinedORM/std": 0.06718423962593079, "step": 4798, "train_speed(iter/s)": 0.106561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 278.375, "completions/min_length": 214.0, "epoch": 3.9693961952026466, "grad_norm": 0.19283530116081238, "kl": 0.081298828125, "learning_rate": 1.2395320035127634e-06, "loss": 0.0008136667311191559, "memory(GiB)": 38.1, "reward": 0.5444009304046631, "reward_std": 0.0642537847161293, "rewards/VisualizationJSONCombinedORM/mean": 0.5444009304046631, "rewards/VisualizationJSONCombinedORM/std": 0.09997779875993729, "step": 4799, "train_speed(iter/s)": 0.106525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 307.9375, "completions/min_length": 250.0, "epoch": 3.970223325062035, "grad_norm": 0.2156577855348587, "kl": 0.176025390625, "learning_rate": 1.2376296100444092e-06, "loss": 0.001763075590133667, "memory(GiB)": 38.1, "reward": 0.4225442111492157, "reward_std": 0.049764975905418396, "rewards/VisualizationJSONCombinedORM/mean": 0.4225442111492157, "rewards/VisualizationJSONCombinedORM/std": 0.17120693624019623, "step": 4800, "train_speed(iter/s)": 0.106485 }, { "epoch": 3.970223325062035, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 370.5833333333333, "eval_completions/mean_length": 306.6302083333333, "eval_completions/min_length": 257.3333333333333, "eval_kl": 0.1078033447265625, "eval_loss": 0.0010864163050428033, "eval_reward": 0.4686637446284294, "eval_reward_std": 0.054694996758674584, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4686637446284294, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05469499850490441, "eval_runtime": 314.3429, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 4800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 328.0, "completions/min_length": 246.0, "epoch": 3.9710504549214227, "grad_norm": 0.2462867796421051, "kl": 0.078857421875, "learning_rate": 1.235728471344192e-06, "loss": 0.0007880441844463348, "memory(GiB)": 38.1, "reward": 0.4275849759578705, "reward_std": 0.045063316822052, "rewards/VisualizationJSONCombinedORM/mean": 0.4275849759578705, "rewards/VisualizationJSONCombinedORM/std": 0.07075769454240799, "step": 4801, "train_speed(iter/s)": 0.105712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 334.5, "completions/min_length": 253.0, "epoch": 3.9718775847808105, "grad_norm": 0.303391695022583, "kl": 0.209716796875, "learning_rate": 1.233828588046151e-06, "loss": 0.0020915865898132324, "memory(GiB)": 38.1, "reward": 0.6491112112998962, "reward_std": 0.08331803977489471, "rewards/VisualizationJSONCombinedORM/mean": 0.6491112112998962, "rewards/VisualizationJSONCombinedORM/std": 0.13197103142738342, "step": 4802, "train_speed(iter/s)": 0.105671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 317.8125, "completions/min_length": 235.0, "epoch": 3.9727047146401984, "grad_norm": 0.2814302146434784, "kl": 0.0992431640625, "learning_rate": 1.2319299607839026e-06, "loss": 0.0009916462004184723, "memory(GiB)": 38.1, "reward": 0.5324372053146362, "reward_std": 0.05348047986626625, "rewards/VisualizationJSONCombinedORM/mean": 0.5324372053146362, "rewards/VisualizationJSONCombinedORM/std": 0.2253381460905075, "step": 4803, "train_speed(iter/s)": 0.105621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 309.25, "completions/min_length": 265.0, "epoch": 3.973531844499586, "grad_norm": 0.20785795152187347, "kl": 0.120361328125, "learning_rate": 1.2300325901906529e-06, "loss": 0.001201484352350235, "memory(GiB)": 38.1, "reward": 0.5173086524009705, "reward_std": 0.0673900619149208, "rewards/VisualizationJSONCombinedORM/mean": 0.5173086524009705, "rewards/VisualizationJSONCombinedORM/std": 0.07357562333345413, "step": 4804, "train_speed(iter/s)": 0.105583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 292.9375, "completions/min_length": 247.0, "epoch": 3.9743589743589745, "grad_norm": 0.2241605818271637, "kl": 0.06866455078125, "learning_rate": 1.2281364768991804e-06, "loss": 0.0006870776414871216, "memory(GiB)": 38.1, "reward": 0.46689918637275696, "reward_std": 0.049774184823036194, "rewards/VisualizationJSONCombinedORM/mean": 0.46689918637275696, "rewards/VisualizationJSONCombinedORM/std": 0.11530327051877975, "step": 4805, "train_speed(iter/s)": 0.105545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 310.0, "completions/min_length": 243.0, "epoch": 3.9751861042183623, "grad_norm": 0.20534688234329224, "kl": 0.1146240234375, "learning_rate": 1.2262416215418494e-06, "loss": 0.0011434629559516907, "memory(GiB)": 38.1, "reward": 0.3164746165275574, "reward_std": 0.031604208052158356, "rewards/VisualizationJSONCombinedORM/mean": 0.3164746165275574, "rewards/VisualizationJSONCombinedORM/std": 0.08989998698234558, "step": 4806, "train_speed(iter/s)": 0.105508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 330.625, "completions/min_length": 236.0, "epoch": 3.97601323407775, "grad_norm": 0.17943228781223297, "kl": 0.05157470703125, "learning_rate": 1.2243480247506019e-06, "loss": 0.0005161985754966736, "memory(GiB)": 38.1, "reward": 0.5181908011436462, "reward_std": 0.025007333606481552, "rewards/VisualizationJSONCombinedORM/mean": 0.5181908011436462, "rewards/VisualizationJSONCombinedORM/std": 0.1895124465227127, "step": 4807, "train_speed(iter/s)": 0.105466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 302.0, "completions/min_length": 216.0, "epoch": 3.9768403639371384, "grad_norm": 0.4074729382991791, "kl": 0.237060546875, "learning_rate": 1.222455687156963e-06, "loss": 0.002374976873397827, "memory(GiB)": 38.1, "reward": 0.7011098861694336, "reward_std": 0.10209481418132782, "rewards/VisualizationJSONCombinedORM/mean": 0.7011098861694336, "rewards/VisualizationJSONCombinedORM/std": 0.15262101590633392, "step": 4808, "train_speed(iter/s)": 0.105436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 320.875, "completions/min_length": 257.0, "epoch": 3.977667493796526, "grad_norm": 0.20190978050231934, "kl": 0.1119384765625, "learning_rate": 1.2205646093920342e-06, "loss": 0.0011186730116605759, "memory(GiB)": 38.1, "reward": 0.6296221017837524, "reward_std": 0.06520477682352066, "rewards/VisualizationJSONCombinedORM/mean": 0.6296221017837524, "rewards/VisualizationJSONCombinedORM/std": 0.1333732306957245, "step": 4809, "train_speed(iter/s)": 0.105395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 316.0, "completions/min_length": 270.0, "epoch": 3.978494623655914, "grad_norm": 0.24866321682929993, "kl": 0.10302734375, "learning_rate": 1.2186747920864993e-06, "loss": 0.001028057187795639, "memory(GiB)": 38.1, "reward": 0.44440120458602905, "reward_std": 0.07467005401849747, "rewards/VisualizationJSONCombinedORM/mean": 0.44440120458602905, "rewards/VisualizationJSONCombinedORM/std": 0.19869564473628998, "step": 4810, "train_speed(iter/s)": 0.105352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 301.3125, "completions/min_length": 247.0, "epoch": 3.979321753515302, "grad_norm": 0.2770906984806061, "kl": 0.072509765625, "learning_rate": 1.2167862358706216e-06, "loss": 0.0007244981825351715, "memory(GiB)": 38.1, "reward": 0.6447086930274963, "reward_std": 0.07525242865085602, "rewards/VisualizationJSONCombinedORM/mean": 0.6447086930274963, "rewards/VisualizationJSONCombinedORM/std": 0.10182695835828781, "step": 4811, "train_speed(iter/s)": 0.105315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 330.0625, "completions/min_length": 250.0, "epoch": 3.9801488833746896, "grad_norm": 0.1799318492412567, "kl": 0.08721923828125, "learning_rate": 1.2148989413742446e-06, "loss": 0.0008711554110050201, "memory(GiB)": 38.1, "reward": 0.48273399472236633, "reward_std": 0.06277085840702057, "rewards/VisualizationJSONCombinedORM/mean": 0.48273399472236633, "rewards/VisualizationJSONCombinedORM/std": 0.2715268135070801, "step": 4812, "train_speed(iter/s)": 0.105268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 318.9375, "completions/min_length": 235.0, "epoch": 3.980976013234078, "grad_norm": 0.26728031039237976, "kl": 0.0860595703125, "learning_rate": 1.213012909226786e-06, "loss": 0.0008572693914175034, "memory(GiB)": 38.1, "reward": 0.5410735607147217, "reward_std": 0.08616126328706741, "rewards/VisualizationJSONCombinedORM/mean": 0.5410735607147217, "rewards/VisualizationJSONCombinedORM/std": 0.08903837949037552, "step": 4813, "train_speed(iter/s)": 0.105225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 324.375, "completions/min_length": 264.0, "epoch": 3.9818031430934657, "grad_norm": 0.18031588196754456, "kl": 0.0963134765625, "learning_rate": 1.2111281400572517e-06, "loss": 0.0009620748460292816, "memory(GiB)": 38.1, "reward": 0.4019635319709778, "reward_std": 0.045790500938892365, "rewards/VisualizationJSONCombinedORM/mean": 0.4019635319709778, "rewards/VisualizationJSONCombinedORM/std": 0.11159752309322357, "step": 4814, "train_speed(iter/s)": 0.105192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 312.5625, "completions/min_length": 234.0, "epoch": 3.9826302729528535, "grad_norm": 0.1551927924156189, "kl": 0.152099609375, "learning_rate": 1.2092446344942165e-06, "loss": 0.0015251555014401674, "memory(GiB)": 38.1, "reward": 0.49043262004852295, "reward_std": 0.01196098793298006, "rewards/VisualizationJSONCombinedORM/mean": 0.49043262004852295, "rewards/VisualizationJSONCombinedORM/std": 0.18644016981124878, "step": 4815, "train_speed(iter/s)": 0.105161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 328.0625, "completions/min_length": 251.0, "epoch": 3.9834574028122414, "grad_norm": 0.18239358067512512, "kl": 0.07159423828125, "learning_rate": 1.2073623931658407e-06, "loss": 0.0007162727415561676, "memory(GiB)": 38.1, "reward": 0.46329742670059204, "reward_std": 0.03357616811990738, "rewards/VisualizationJSONCombinedORM/mean": 0.46329742670059204, "rewards/VisualizationJSONCombinedORM/std": 0.21235884726047516, "step": 4816, "train_speed(iter/s)": 0.105115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 320.4375, "completions/min_length": 274.0, "epoch": 3.984284532671629, "grad_norm": 0.20527514815330505, "kl": 0.04656982421875, "learning_rate": 1.2054814166998596e-06, "loss": 0.0004652440547943115, "memory(GiB)": 38.1, "reward": 0.45431405305862427, "reward_std": 0.05136715620756149, "rewards/VisualizationJSONCombinedORM/mean": 0.45431405305862427, "rewards/VisualizationJSONCombinedORM/std": 0.125327929854393, "step": 4817, "train_speed(iter/s)": 0.10507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 318.3125, "completions/min_length": 251.0, "epoch": 3.9851116625310175, "grad_norm": 0.19623331725597382, "kl": 0.1129150390625, "learning_rate": 1.20360170572359e-06, "loss": 0.001129094511270523, "memory(GiB)": 38.1, "reward": 0.36859557032585144, "reward_std": 0.031602539122104645, "rewards/VisualizationJSONCombinedORM/mean": 0.36859557032585144, "rewards/VisualizationJSONCombinedORM/std": 0.044993165880441666, "step": 4818, "train_speed(iter/s)": 0.105022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 291.5, "completions/min_length": 224.0, "epoch": 3.9859387923904053, "grad_norm": 0.18225859105587006, "kl": 0.130615234375, "learning_rate": 1.201723260863919e-06, "loss": 0.0013040602207183838, "memory(GiB)": 38.1, "reward": 0.4180518388748169, "reward_std": 0.027237992733716965, "rewards/VisualizationJSONCombinedORM/mean": 0.4180518388748169, "rewards/VisualizationJSONCombinedORM/std": 0.1376405954360962, "step": 4819, "train_speed(iter/s)": 0.104989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 308.875, "completions/min_length": 229.0, "epoch": 3.986765922249793, "grad_norm": 0.12670868635177612, "kl": 0.04718017578125, "learning_rate": 1.199846082747323e-06, "loss": 0.00047154538333415985, "memory(GiB)": 38.1, "reward": 0.4200017750263214, "reward_std": 0.025820521637797356, "rewards/VisualizationJSONCombinedORM/mean": 0.4200017750263214, "rewards/VisualizationJSONCombinedORM/std": 0.10737185925245285, "step": 4820, "train_speed(iter/s)": 0.104941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 332.4375, "completions/min_length": 267.0, "epoch": 3.9875930521091814, "grad_norm": 0.16948196291923523, "kl": 0.1005859375, "learning_rate": 1.1979701719998454e-06, "loss": 0.001007910817861557, "memory(GiB)": 38.1, "reward": 0.6990185976028442, "reward_std": 0.07334546744823456, "rewards/VisualizationJSONCombinedORM/mean": 0.6990185976028442, "rewards/VisualizationJSONCombinedORM/std": 0.1433531939983368, "step": 4821, "train_speed(iter/s)": 0.104902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 288.625, "completions/min_length": 234.0, "epoch": 3.988420181968569, "grad_norm": 0.18852464854717255, "kl": 0.096923828125, "learning_rate": 1.1960955292471132e-06, "loss": 0.0009711235761642456, "memory(GiB)": 38.1, "reward": 0.6066775918006897, "reward_std": 0.0980280190706253, "rewards/VisualizationJSONCombinedORM/mean": 0.6066775918006897, "rewards/VisualizationJSONCombinedORM/std": 0.09568636119365692, "step": 4822, "train_speed(iter/s)": 0.104862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 266.0, "completions/min_length": 218.0, "epoch": 3.989247311827957, "grad_norm": 0.2224515974521637, "kl": 0.05853271484375, "learning_rate": 1.1942221551143274e-06, "loss": 0.0005847327411174774, "memory(GiB)": 38.1, "reward": 0.5810251235961914, "reward_std": 0.09062550216913223, "rewards/VisualizationJSONCombinedORM/mean": 0.5810251235961914, "rewards/VisualizationJSONCombinedORM/std": 0.09390870481729507, "step": 4823, "train_speed(iter/s)": 0.104816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 299.8125, "completions/min_length": 231.0, "epoch": 3.990074441687345, "grad_norm": 0.16593529284000397, "kl": 0.040283203125, "learning_rate": 1.192350050226269e-06, "loss": 0.0004019811749458313, "memory(GiB)": 38.1, "reward": 0.5537447929382324, "reward_std": 0.033353835344314575, "rewards/VisualizationJSONCombinedORM/mean": 0.5537447929382324, "rewards/VisualizationJSONCombinedORM/std": 0.1818821132183075, "step": 4824, "train_speed(iter/s)": 0.104783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 346.1875, "completions/min_length": 282.0, "epoch": 3.9909015715467326, "grad_norm": 0.201004758477211, "kl": 0.13037109375, "learning_rate": 1.1904792152072914e-06, "loss": 0.0013050585985183716, "memory(GiB)": 38.1, "reward": 0.31757664680480957, "reward_std": 0.045765221118927, "rewards/VisualizationJSONCombinedORM/mean": 0.31757664680480957, "rewards/VisualizationJSONCombinedORM/std": 0.11669906228780746, "step": 4825, "train_speed(iter/s)": 0.104729 }, { "epoch": 3.9909015715467326, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 373.5416666666667, "eval_completions/mean_length": 309.265625, "eval_completions/min_length": 257.75, "eval_kl": 0.07056681315104167, "eval_loss": 0.0007159591768868268, "eval_reward": 0.4377974687765042, "eval_reward_std": 0.04927580215735361, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4377974687765042, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04927580429163451, "eval_runtime": 316.6994, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 4825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 290.1875, "completions/min_length": 219.0, "epoch": 3.991728701406121, "grad_norm": 0.20093488693237305, "kl": 0.05230712890625, "learning_rate": 1.1886096506813273e-06, "loss": 0.0005234405398368835, "memory(GiB)": 38.1, "reward": 0.7322273254394531, "reward_std": 0.06334567070007324, "rewards/VisualizationJSONCombinedORM/mean": 0.7322273254394531, "rewards/VisualizationJSONCombinedORM/std": 0.07047539949417114, "step": 4826, "train_speed(iter/s)": 0.103978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 304.1875, "completions/min_length": 229.0, "epoch": 3.9925558312655087, "grad_norm": 0.2295699119567871, "kl": 0.0731201171875, "learning_rate": 1.1867413572718861e-06, "loss": 0.0007302910089492798, "memory(GiB)": 38.1, "reward": 0.4543299674987793, "reward_std": 0.054480090737342834, "rewards/VisualizationJSONCombinedORM/mean": 0.4543299674987793, "rewards/VisualizationJSONCombinedORM/std": 0.12951433658599854, "step": 4827, "train_speed(iter/s)": 0.10394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 293.25, "completions/min_length": 236.0, "epoch": 3.9933829611248965, "grad_norm": 0.20237135887145996, "kl": 0.045654296875, "learning_rate": 1.184874335602053e-06, "loss": 0.0004573427140712738, "memory(GiB)": 38.1, "reward": 0.43582648038864136, "reward_std": 0.042700156569480896, "rewards/VisualizationJSONCombinedORM/mean": 0.43582648038864136, "rewards/VisualizationJSONCombinedORM/std": 0.1377682089805603, "step": 4828, "train_speed(iter/s)": 0.103908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 350.125, "completions/min_length": 233.0, "epoch": 3.9942100909842844, "grad_norm": 0.1781897246837616, "kl": 0.03948974609375, "learning_rate": 1.1830085862944851e-06, "loss": 0.0003945194184780121, "memory(GiB)": 38.1, "reward": 0.7030240893363953, "reward_std": 0.08408558368682861, "rewards/VisualizationJSONCombinedORM/mean": 0.7030240893363953, "rewards/VisualizationJSONCombinedORM/std": 0.08759481459856033, "step": 4829, "train_speed(iter/s)": 0.103863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 288.75, "completions/min_length": 236.0, "epoch": 3.995037220843672, "grad_norm": 0.24420824646949768, "kl": 0.06866455078125, "learning_rate": 1.1811441099714232e-06, "loss": 0.0006868317723274231, "memory(GiB)": 38.1, "reward": 0.4774645268917084, "reward_std": 0.03864217922091484, "rewards/VisualizationJSONCombinedORM/mean": 0.4774645268917084, "rewards/VisualizationJSONCombinedORM/std": 0.058169685304164886, "step": 4830, "train_speed(iter/s)": 0.103833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 310.5625, "completions/min_length": 254.0, "epoch": 3.9958643507030605, "grad_norm": 0.1568373441696167, "kl": 0.06097412109375, "learning_rate": 1.1792809072546757e-06, "loss": 0.0006090328097343445, "memory(GiB)": 38.1, "reward": 0.7535465359687805, "reward_std": 0.054548367857933044, "rewards/VisualizationJSONCombinedORM/mean": 0.7535465359687805, "rewards/VisualizationJSONCombinedORM/std": 0.1101878359913826, "step": 4831, "train_speed(iter/s)": 0.103798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 307.5625, "completions/min_length": 220.0, "epoch": 3.9966914805624483, "grad_norm": 0.17400360107421875, "kl": 0.1280517578125, "learning_rate": 1.1774189787656299e-06, "loss": 0.0012808330357074738, "memory(GiB)": 38.1, "reward": 0.5605466961860657, "reward_std": 0.06429813802242279, "rewards/VisualizationJSONCombinedORM/mean": 0.5605466961860657, "rewards/VisualizationJSONCombinedORM/std": 0.28285759687423706, "step": 4832, "train_speed(iter/s)": 0.103763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 300.25, "completions/min_length": 243.0, "epoch": 3.997518610421836, "grad_norm": 0.27212008833885193, "kl": 0.08880615234375, "learning_rate": 1.1755583251252484e-06, "loss": 0.0008869729936122894, "memory(GiB)": 38.1, "reward": 0.4848714768886566, "reward_std": 0.06558932363986969, "rewards/VisualizationJSONCombinedORM/mean": 0.4848714768886566, "rewards/VisualizationJSONCombinedORM/std": 0.08117730915546417, "step": 4833, "train_speed(iter/s)": 0.103726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 279.9375, "completions/min_length": 224.0, "epoch": 3.9983457402812244, "grad_norm": 0.18162518739700317, "kl": 0.05023193359375, "learning_rate": 1.1736989469540688e-06, "loss": 0.0005024448037147522, "memory(GiB)": 38.1, "reward": 0.279387503862381, "reward_std": 0.03285258263349533, "rewards/VisualizationJSONCombinedORM/mean": 0.279387503862381, "rewards/VisualizationJSONCombinedORM/std": 0.12639622390270233, "step": 4834, "train_speed(iter/s)": 0.103694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 323.0625, "completions/min_length": 228.0, "epoch": 3.999172870140612, "grad_norm": 0.16755464673042297, "kl": 0.0921630859375, "learning_rate": 1.171840844872198e-06, "loss": 0.0009221769869327545, "memory(GiB)": 38.1, "reward": 0.505821704864502, "reward_std": 0.03297398239374161, "rewards/VisualizationJSONCombinedORM/mean": 0.505821704864502, "rewards/VisualizationJSONCombinedORM/std": 0.09545361995697021, "step": 4835, "train_speed(iter/s)": 0.103664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 341.6875, "completions/min_length": 272.0, "epoch": 4.0, "grad_norm": 0.1803703010082245, "kl": 0.0673828125, "learning_rate": 1.1699840194993284e-06, "loss": 0.0006730668246746063, "memory(GiB)": 38.1, "reward": 0.5675685405731201, "reward_std": 0.016220899298787117, "rewards/VisualizationJSONCombinedORM/mean": 0.5675685405731201, "rewards/VisualizationJSONCombinedORM/std": 0.18770469725131989, "step": 4836, "train_speed(iter/s)": 0.103624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 317.3125, "completions/min_length": 250.0, "epoch": 4.000827129859388, "grad_norm": 0.20228390395641327, "kl": 0.2056884765625, "learning_rate": 1.1681284714547147e-06, "loss": 0.0020598135888576508, "memory(GiB)": 38.1, "reward": 0.43227943778038025, "reward_std": 0.050246257334947586, "rewards/VisualizationJSONCombinedORM/mean": 0.43227943778038025, "rewards/VisualizationJSONCombinedORM/std": 0.17507588863372803, "step": 4837, "train_speed(iter/s)": 0.103545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 308.1875, "completions/min_length": 233.0, "epoch": 4.001654259718776, "grad_norm": 0.11717789620161057, "kl": 0.074951171875, "learning_rate": 1.1662742013571926e-06, "loss": 0.0007500090869143605, "memory(GiB)": 38.1, "reward": 0.49716687202453613, "reward_std": 0.017000598832964897, "rewards/VisualizationJSONCombinedORM/mean": 0.49716687202453613, "rewards/VisualizationJSONCombinedORM/std": 0.09024586528539658, "step": 4838, "train_speed(iter/s)": 0.103501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 278.375, "completions/min_length": 216.0, "epoch": 4.002481389578164, "grad_norm": 0.17717957496643066, "kl": 0.0496826171875, "learning_rate": 1.16442120982517e-06, "loss": 0.0004963651299476624, "memory(GiB)": 38.1, "reward": 0.646589994430542, "reward_std": 0.0464351661503315, "rewards/VisualizationJSONCombinedORM/mean": 0.646589994430542, "rewards/VisualizationJSONCombinedORM/std": 0.06152946874499321, "step": 4839, "train_speed(iter/s)": 0.10347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 330.8125, "completions/min_length": 279.0, "epoch": 4.003308519437551, "grad_norm": 0.1984320878982544, "kl": 0.0975341796875, "learning_rate": 1.16256949747663e-06, "loss": 0.0009791404008865356, "memory(GiB)": 38.1, "reward": 0.4480525851249695, "reward_std": 0.03965966776013374, "rewards/VisualizationJSONCombinedORM/mean": 0.4480525851249695, "rewards/VisualizationJSONCombinedORM/std": 0.1692618578672409, "step": 4840, "train_speed(iter/s)": 0.103433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 322.25, "completions/min_length": 248.0, "epoch": 4.0041356492969395, "grad_norm": 0.19113081693649292, "kl": 0.0423583984375, "learning_rate": 1.1607190649291239e-06, "loss": 0.0004227086901664734, "memory(GiB)": 38.1, "reward": 0.3570111095905304, "reward_std": 0.02694176882505417, "rewards/VisualizationJSONCombinedORM/mean": 0.3570111095905304, "rewards/VisualizationJSONCombinedORM/std": 0.10959495604038239, "step": 4841, "train_speed(iter/s)": 0.103394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 331.9375, "completions/min_length": 263.0, "epoch": 4.004962779156328, "grad_norm": 0.17743805050849915, "kl": 0.07568359375, "learning_rate": 1.158869912799781e-06, "loss": 0.0007575489580631256, "memory(GiB)": 38.1, "reward": 0.291472852230072, "reward_std": 0.025551442056894302, "rewards/VisualizationJSONCombinedORM/mean": 0.291472852230072, "rewards/VisualizationJSONCombinedORM/std": 0.08389764279127121, "step": 4842, "train_speed(iter/s)": 0.103364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 327.0, "completions/min_length": 240.0, "epoch": 4.005789909015715, "grad_norm": 0.17784170806407928, "kl": 0.0657958984375, "learning_rate": 1.1570220417053024e-06, "loss": 0.0006579719483852386, "memory(GiB)": 38.1, "reward": 0.34970009326934814, "reward_std": 0.05429421365261078, "rewards/VisualizationJSONCombinedORM/mean": 0.34970009326934814, "rewards/VisualizationJSONCombinedORM/std": 0.10264915972948074, "step": 4843, "train_speed(iter/s)": 0.103325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 270.8125, "completions/min_length": 223.0, "epoch": 4.0066170388751035, "grad_norm": 0.21670806407928467, "kl": 0.05694580078125, "learning_rate": 1.155175452261963e-06, "loss": 0.0005695782601833344, "memory(GiB)": 38.1, "reward": 0.30870237946510315, "reward_std": 0.03398962691426277, "rewards/VisualizationJSONCombinedORM/mean": 0.30870237946510315, "rewards/VisualizationJSONCombinedORM/std": 0.03302697092294693, "step": 4844, "train_speed(iter/s)": 0.103297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 321.25, "completions/min_length": 245.0, "epoch": 4.007444168734492, "grad_norm": 0.2922859489917755, "kl": 0.06927490234375, "learning_rate": 1.1533301450856054e-06, "loss": 0.0006923601031303406, "memory(GiB)": 38.1, "reward": 0.5003506541252136, "reward_std": 0.06447675079107285, "rewards/VisualizationJSONCombinedORM/mean": 0.5003506541252136, "rewards/VisualizationJSONCombinedORM/std": 0.07692500203847885, "step": 4845, "train_speed(iter/s)": 0.10326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 299.5, "completions/min_length": 236.0, "epoch": 4.008271298593879, "grad_norm": 0.22794263064861298, "kl": 0.101806640625, "learning_rate": 1.1514861207916528e-06, "loss": 0.0010183081030845642, "memory(GiB)": 38.1, "reward": 0.5229895114898682, "reward_std": 0.03620409592986107, "rewards/VisualizationJSONCombinedORM/mean": 0.5229895114898682, "rewards/VisualizationJSONCombinedORM/std": 0.04814646393060684, "step": 4846, "train_speed(iter/s)": 0.103219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 319.0, "completions/min_length": 263.0, "epoch": 4.009098428453267, "grad_norm": 0.19106227159500122, "kl": 0.044189453125, "learning_rate": 1.149643379995093e-06, "loss": 0.0004413016140460968, "memory(GiB)": 38.1, "reward": 0.7498855590820312, "reward_std": 0.06249534338712692, "rewards/VisualizationJSONCombinedORM/mean": 0.7498855590820312, "rewards/VisualizationJSONCombinedORM/std": 0.1006607636809349, "step": 4847, "train_speed(iter/s)": 0.103184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 332.5625, "completions/min_length": 244.0, "epoch": 4.009925558312655, "grad_norm": 0.21544376015663147, "kl": 0.04534912109375, "learning_rate": 1.1478019233104887e-06, "loss": 0.00045286770910024643, "memory(GiB)": 38.1, "reward": 0.568733811378479, "reward_std": 0.04006294906139374, "rewards/VisualizationJSONCombinedORM/mean": 0.568733811378479, "rewards/VisualizationJSONCombinedORM/std": 0.06901919841766357, "step": 4848, "train_speed(iter/s)": 0.103149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 293.6875, "completions/min_length": 241.0, "epoch": 4.010752688172043, "grad_norm": 0.16357716917991638, "kl": 0.02813720703125, "learning_rate": 1.1459617513519756e-06, "loss": 0.0002814680337905884, "memory(GiB)": 38.1, "reward": 0.562240719795227, "reward_std": 0.04892103374004364, "rewards/VisualizationJSONCombinedORM/mean": 0.562240719795227, "rewards/VisualizationJSONCombinedORM/std": 0.050129640847444534, "step": 4849, "train_speed(iter/s)": 0.103103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 317.6875, "completions/min_length": 257.0, "epoch": 4.011579818031431, "grad_norm": 0.15845060348510742, "kl": 0.030548095703125, "learning_rate": 1.1441228647332602e-06, "loss": 0.00030592456459999084, "memory(GiB)": 38.1, "reward": 0.505155622959137, "reward_std": 0.03680488467216492, "rewards/VisualizationJSONCombinedORM/mean": 0.505155622959137, "rewards/VisualizationJSONCombinedORM/std": 0.07072310149669647, "step": 4850, "train_speed(iter/s)": 0.103063 }, { "epoch": 4.011579818031431, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 368.2916666666667, "eval_completions/mean_length": 306.65625, "eval_completions/min_length": 256.8333333333333, "eval_kl": 0.0747528076171875, "eval_loss": 0.0007607576553709805, "eval_reward": 0.44392074396212894, "eval_reward_std": 0.05165366494717697, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44392074396212894, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05165366750831405, "eval_runtime": 313.6831, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 4850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 307.75, "completions/min_length": 254.0, "epoch": 4.012406947890819, "grad_norm": 0.20858308672904968, "kl": 0.09033203125, "learning_rate": 1.1422852640676159e-06, "loss": 0.0009035728871822357, "memory(GiB)": 38.1, "reward": 0.6231356263160706, "reward_std": 0.07083654403686523, "rewards/VisualizationJSONCombinedORM/mean": 0.6231356263160706, "rewards/VisualizationJSONCombinedORM/std": 0.07361166924238205, "step": 4851, "train_speed(iter/s)": 0.102344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 300.9375, "completions/min_length": 231.0, "epoch": 4.013234077750207, "grad_norm": 0.2323118895292282, "kl": 0.035308837890625, "learning_rate": 1.1404489499678966e-06, "loss": 0.00035312771797180176, "memory(GiB)": 38.1, "reward": 0.5115988254547119, "reward_std": 0.03464756906032562, "rewards/VisualizationJSONCombinedORM/mean": 0.5115988254547119, "rewards/VisualizationJSONCombinedORM/std": 0.265153169631958, "step": 4852, "train_speed(iter/s)": 0.102311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 284.8125, "completions/min_length": 225.0, "epoch": 4.014061207609594, "grad_norm": 0.19583012163639069, "kl": 0.0509033203125, "learning_rate": 1.1386139230465176e-06, "loss": 0.0005087628960609436, "memory(GiB)": 38.1, "reward": 0.5681397914886475, "reward_std": 0.055986613035202026, "rewards/VisualizationJSONCombinedORM/mean": 0.5681397914886475, "rewards/VisualizationJSONCombinedORM/std": 0.15986119210720062, "step": 4853, "train_speed(iter/s)": 0.102278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 318.9375, "completions/min_length": 254.0, "epoch": 4.0148883374689825, "grad_norm": 0.1736672818660736, "kl": 0.0648193359375, "learning_rate": 1.1367801839154701e-06, "loss": 0.0006472766399383545, "memory(GiB)": 38.1, "reward": 0.39277052879333496, "reward_std": 0.04649658501148224, "rewards/VisualizationJSONCombinedORM/mean": 0.39277052879333496, "rewards/VisualizationJSONCombinedORM/std": 0.23829728364944458, "step": 4854, "train_speed(iter/s)": 0.102235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 333.9375, "completions/min_length": 282.0, "epoch": 4.015715467328371, "grad_norm": 0.19175440073013306, "kl": 0.04168701171875, "learning_rate": 1.134947733186315e-06, "loss": 0.00041685253381729126, "memory(GiB)": 38.1, "reward": 0.6173653602600098, "reward_std": 0.07983475923538208, "rewards/VisualizationJSONCombinedORM/mean": 0.6173653602600098, "rewards/VisualizationJSONCombinedORM/std": 0.07914029061794281, "step": 4855, "train_speed(iter/s)": 0.102196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 316.125, "completions/min_length": 249.0, "epoch": 4.016542597187758, "grad_norm": 0.19082212448120117, "kl": 0.03558349609375, "learning_rate": 1.1331165714701836e-06, "loss": 0.0003557950258255005, "memory(GiB)": 38.1, "reward": 0.5619879961013794, "reward_std": 0.07420910894870758, "rewards/VisualizationJSONCombinedORM/mean": 0.5619879961013794, "rewards/VisualizationJSONCombinedORM/std": 0.07791368663311005, "step": 4856, "train_speed(iter/s)": 0.102152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 309.4375, "completions/min_length": 233.0, "epoch": 4.0173697270471465, "grad_norm": 0.17029057443141937, "kl": 0.06781005859375, "learning_rate": 1.1312866993777771e-06, "loss": 0.0006792023777961731, "memory(GiB)": 38.1, "reward": 0.7684964537620544, "reward_std": 0.07617750763893127, "rewards/VisualizationJSONCombinedORM/mean": 0.7684964537620544, "rewards/VisualizationJSONCombinedORM/std": 0.08845633268356323, "step": 4857, "train_speed(iter/s)": 0.102123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 326.3125, "completions/min_length": 259.0, "epoch": 4.018196856906535, "grad_norm": 0.20140068233013153, "kl": 0.05145263671875, "learning_rate": 1.129458117519363e-06, "loss": 0.0005162432789802551, "memory(GiB)": 38.1, "reward": 0.2894306778907776, "reward_std": 0.03030848503112793, "rewards/VisualizationJSONCombinedORM/mean": 0.2894306778907776, "rewards/VisualizationJSONCombinedORM/std": 0.1288011223077774, "step": 4858, "train_speed(iter/s)": 0.102088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 291.0625, "completions/min_length": 211.0, "epoch": 4.019023986765922, "grad_norm": 0.25281277298927307, "kl": 0.1177978515625, "learning_rate": 1.1276308265047874e-06, "loss": 0.001177951693534851, "memory(GiB)": 38.1, "reward": 0.5588363409042358, "reward_std": 0.07512776553630829, "rewards/VisualizationJSONCombinedORM/mean": 0.5588363409042358, "rewards/VisualizationJSONCombinedORM/std": 0.14169423282146454, "step": 4859, "train_speed(iter/s)": 0.102052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 299.4375, "completions/min_length": 238.0, "epoch": 4.01985111662531, "grad_norm": 0.26781269907951355, "kl": 0.0386962890625, "learning_rate": 1.1258048269434569e-06, "loss": 0.0003865845501422882, "memory(GiB)": 38.1, "reward": 0.4469698369503021, "reward_std": 0.08105193823575974, "rewards/VisualizationJSONCombinedORM/mean": 0.4469698369503021, "rewards/VisualizationJSONCombinedORM/std": 0.32097911834716797, "step": 4860, "train_speed(iter/s)": 0.102015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 279.875, "completions/min_length": 237.0, "epoch": 4.020678246484698, "grad_norm": 0.21922209858894348, "kl": 0.09765625, "learning_rate": 1.1239801194443507e-06, "loss": 0.0009787827730178833, "memory(GiB)": 38.1, "reward": 0.5953839421272278, "reward_std": 0.11484740674495697, "rewards/VisualizationJSONCombinedORM/mean": 0.5953839421272278, "rewards/VisualizationJSONCombinedORM/std": 0.15710408985614777, "step": 4861, "train_speed(iter/s)": 0.101985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 338.75, "completions/min_length": 246.0, "epoch": 4.021505376344086, "grad_norm": 0.1528429090976715, "kl": 0.03814697265625, "learning_rate": 1.1221567046160186e-06, "loss": 0.0003823135048151016, "memory(GiB)": 38.1, "reward": 0.3576764762401581, "reward_std": 0.02409444935619831, "rewards/VisualizationJSONCombinedORM/mean": 0.3576764762401581, "rewards/VisualizationJSONCombinedORM/std": 0.03295287489891052, "step": 4862, "train_speed(iter/s)": 0.101946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 296.5, "completions/min_length": 227.0, "epoch": 4.022332506203474, "grad_norm": 0.24035997688770294, "kl": 0.04559326171875, "learning_rate": 1.120334583066579e-06, "loss": 0.0004566460847854614, "memory(GiB)": 38.1, "reward": 0.5174094438552856, "reward_std": 0.06770564615726471, "rewards/VisualizationJSONCombinedORM/mean": 0.5174094438552856, "rewards/VisualizationJSONCombinedORM/std": 0.08147489279508591, "step": 4863, "train_speed(iter/s)": 0.101904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 340.0625, "completions/min_length": 248.0, "epoch": 4.023159636062862, "grad_norm": 0.13578185439109802, "kl": 0.04705810546875, "learning_rate": 1.1185137554037152e-06, "loss": 0.0004700865247286856, "memory(GiB)": 38.1, "reward": 0.5304033756256104, "reward_std": 0.043163854628801346, "rewards/VisualizationJSONCombinedORM/mean": 0.5304033756256104, "rewards/VisualizationJSONCombinedORM/std": 0.28234267234802246, "step": 4864, "train_speed(iter/s)": 0.101856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 318.6875, "completions/min_length": 250.0, "epoch": 4.02398676592225, "grad_norm": 0.30907803773880005, "kl": 0.10308837890625, "learning_rate": 1.1166942222346828e-06, "loss": 0.0010306313633918762, "memory(GiB)": 38.1, "reward": 0.657088041305542, "reward_std": 0.09159775823354721, "rewards/VisualizationJSONCombinedORM/mean": 0.657088041305542, "rewards/VisualizationJSONCombinedORM/std": 0.08889377862215042, "step": 4865, "train_speed(iter/s)": 0.101821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 311.5, "completions/min_length": 233.0, "epoch": 4.024813895781637, "grad_norm": 0.159550741314888, "kl": 0.033447265625, "learning_rate": 1.1148759841663054e-06, "loss": 0.00033478811383247375, "memory(GiB)": 38.1, "reward": 0.46459925174713135, "reward_std": 0.027223465964198112, "rewards/VisualizationJSONCombinedORM/mean": 0.46459925174713135, "rewards/VisualizationJSONCombinedORM/std": 0.09764283150434494, "step": 4866, "train_speed(iter/s)": 0.101783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 278.5, "completions/min_length": 211.0, "epoch": 4.0256410256410255, "grad_norm": 0.21637886762619019, "kl": 0.07806396484375, "learning_rate": 1.1130590418049741e-06, "loss": 0.0007803365588188171, "memory(GiB)": 38.1, "reward": 0.6520795822143555, "reward_std": 0.08977776765823364, "rewards/VisualizationJSONCombinedORM/mean": 0.6520795822143555, "rewards/VisualizationJSONCombinedORM/std": 0.10035532712936401, "step": 4867, "train_speed(iter/s)": 0.101749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 312.125, "completions/min_length": 241.0, "epoch": 4.026468155500414, "grad_norm": 0.20237159729003906, "kl": 0.052490234375, "learning_rate": 1.1112433957566448e-06, "loss": 0.0005245916545391083, "memory(GiB)": 38.1, "reward": 0.5152443051338196, "reward_std": 0.04046449437737465, "rewards/VisualizationJSONCombinedORM/mean": 0.5152443051338196, "rewards/VisualizationJSONCombinedORM/std": 0.23540064692497253, "step": 4868, "train_speed(iter/s)": 0.101719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 290.3125, "completions/min_length": 237.0, "epoch": 4.027295285359801, "grad_norm": 0.20343272387981415, "kl": 0.0789794921875, "learning_rate": 1.1094290466268493e-06, "loss": 0.0007897019386291504, "memory(GiB)": 38.1, "reward": 0.5121896266937256, "reward_std": 0.03357105702161789, "rewards/VisualizationJSONCombinedORM/mean": 0.5121896266937256, "rewards/VisualizationJSONCombinedORM/std": 0.2460980862379074, "step": 4869, "train_speed(iter/s)": 0.101689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 303.1875, "completions/min_length": 229.0, "epoch": 4.0281224152191895, "grad_norm": 0.18022924661636353, "kl": 0.06854248046875, "learning_rate": 1.1076159950206762e-06, "loss": 0.0006849169731140137, "memory(GiB)": 38.1, "reward": 0.4640825390815735, "reward_std": 0.03910594806075096, "rewards/VisualizationJSONCombinedORM/mean": 0.4640825390815735, "rewards/VisualizationJSONCombinedORM/std": 0.10469073057174683, "step": 4870, "train_speed(iter/s)": 0.101659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 278.625, "completions/min_length": 213.0, "epoch": 4.028949545078578, "grad_norm": 0.17598597705364227, "kl": 0.085693359375, "learning_rate": 1.1058042415427894e-06, "loss": 0.0008568353950977325, "memory(GiB)": 38.1, "reward": 0.4754599332809448, "reward_std": 0.04109592363238335, "rewards/VisualizationJSONCombinedORM/mean": 0.4754599332809448, "rewards/VisualizationJSONCombinedORM/std": 0.06328410655260086, "step": 4871, "train_speed(iter/s)": 0.101623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 317.1875, "completions/min_length": 237.0, "epoch": 4.029776674937965, "grad_norm": 0.15642023086547852, "kl": 0.0328369140625, "learning_rate": 1.1039937867974166e-06, "loss": 0.00032784417271614075, "memory(GiB)": 38.1, "reward": 0.4673650562763214, "reward_std": 0.03835436701774597, "rewards/VisualizationJSONCombinedORM/mean": 0.4673650562763214, "rewards/VisualizationJSONCombinedORM/std": 0.03723658248782158, "step": 4872, "train_speed(iter/s)": 0.101585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 305.3125, "completions/min_length": 250.0, "epoch": 4.030603804797353, "grad_norm": 0.18058443069458008, "kl": 0.1197509765625, "learning_rate": 1.1021846313883539e-06, "loss": 0.0011970046907663345, "memory(GiB)": 38.1, "reward": 0.6512311100959778, "reward_std": 0.07635832577943802, "rewards/VisualizationJSONCombinedORM/mean": 0.6512311100959778, "rewards/VisualizationJSONCombinedORM/std": 0.08215735852718353, "step": 4873, "train_speed(iter/s)": 0.101554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 286.875, "completions/min_length": 233.0, "epoch": 4.031430934656741, "grad_norm": 0.2431267648935318, "kl": 0.032745361328125, "learning_rate": 1.1003767759189598e-06, "loss": 0.0003283396363258362, "memory(GiB)": 38.1, "reward": 0.5297341346740723, "reward_std": 0.03664681315422058, "rewards/VisualizationJSONCombinedORM/mean": 0.5297341346740723, "rewards/VisualizationJSONCombinedORM/std": 0.040686748921871185, "step": 4874, "train_speed(iter/s)": 0.101518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 297.0, "completions/min_length": 234.0, "epoch": 4.032258064516129, "grad_norm": 0.3175750970840454, "kl": 0.09136962890625, "learning_rate": 1.0985702209921677e-06, "loss": 0.0009148083627223969, "memory(GiB)": 38.1, "reward": 0.5176563262939453, "reward_std": 0.06875818967819214, "rewards/VisualizationJSONCombinedORM/mean": 0.5176563262939453, "rewards/VisualizationJSONCombinedORM/std": 0.07124555855989456, "step": 4875, "train_speed(iter/s)": 0.101488 }, { "epoch": 4.032258064516129, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 366.0416666666667, "eval_completions/mean_length": 306.296875, "eval_completions/min_length": 259.6666666666667, "eval_kl": 0.078460693359375, "eval_loss": 0.0007924164528958499, "eval_reward": 0.43073870552082855, "eval_reward_std": 0.04899338625061015, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.43073870552082855, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04899338605658462, "eval_runtime": 311.9139, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 4875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 294.8125, "completions/min_length": 233.0, "epoch": 4.033085194375517, "grad_norm": 0.18183836340904236, "kl": 0.064453125, "learning_rate": 1.0967649672104685e-06, "loss": 0.0006444640457630157, "memory(GiB)": 38.1, "reward": 0.5524880290031433, "reward_std": 0.03923122584819794, "rewards/VisualizationJSONCombinedORM/mean": 0.5524880290031433, "rewards/VisualizationJSONCombinedORM/std": 0.1518128514289856, "step": 4876, "train_speed(iter/s)": 0.100805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 310.0, "completions/min_length": 264.0, "epoch": 4.033912324234905, "grad_norm": 0.24608944356441498, "kl": 0.07122802734375, "learning_rate": 1.0949610151759233e-06, "loss": 0.0007130205631256104, "memory(GiB)": 38.1, "reward": 0.5175785422325134, "reward_std": 0.04130959510803223, "rewards/VisualizationJSONCombinedORM/mean": 0.5175785422325134, "rewards/VisualizationJSONCombinedORM/std": 0.04317202791571617, "step": 4877, "train_speed(iter/s)": 0.10077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 330.625, "completions/min_length": 286.0, "epoch": 4.034739454094293, "grad_norm": 0.2007245272397995, "kl": 0.0518798828125, "learning_rate": 1.0931583654901585e-06, "loss": 0.0005185939371585846, "memory(GiB)": 38.1, "reward": 0.41240382194519043, "reward_std": 0.040597304701805115, "rewards/VisualizationJSONCombinedORM/mean": 0.41240382194519043, "rewards/VisualizationJSONCombinedORM/std": 0.06415098905563354, "step": 4878, "train_speed(iter/s)": 0.100733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 302.8125, "completions/min_length": 235.0, "epoch": 4.035566583953681, "grad_norm": 0.1792757362127304, "kl": 0.030975341796875, "learning_rate": 1.0913570187543682e-06, "loss": 0.0003098808228969574, "memory(GiB)": 38.1, "reward": 0.5070605874061584, "reward_std": 0.0724567323923111, "rewards/VisualizationJSONCombinedORM/mean": 0.5070605874061584, "rewards/VisualizationJSONCombinedORM/std": 0.1510988026857376, "step": 4879, "train_speed(iter/s)": 0.100705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 301.25, "completions/min_length": 238.0, "epoch": 4.0363937138130686, "grad_norm": 0.2980961799621582, "kl": 0.427734375, "learning_rate": 1.0895569755693076e-06, "loss": 0.004273697733879089, "memory(GiB)": 38.1, "reward": 0.5593422651290894, "reward_std": 0.05977556109428406, "rewards/VisualizationJSONCombinedORM/mean": 0.5593422651290894, "rewards/VisualizationJSONCombinedORM/std": 0.1607016772031784, "step": 4880, "train_speed(iter/s)": 0.100667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 282.8125, "completions/min_length": 231.0, "epoch": 4.037220843672457, "grad_norm": 0.18471524119377136, "kl": 0.1051025390625, "learning_rate": 1.0877582365352995e-06, "loss": 0.0010493695735931396, "memory(GiB)": 38.1, "reward": 0.656327486038208, "reward_std": 0.11505124717950821, "rewards/VisualizationJSONCombinedORM/mean": 0.656327486038208, "rewards/VisualizationJSONCombinedORM/std": 0.12487225234508514, "step": 4881, "train_speed(iter/s)": 0.100635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 311.625, "completions/min_length": 244.0, "epoch": 4.038047973531844, "grad_norm": 0.19838781654834747, "kl": 0.0960693359375, "learning_rate": 1.0859608022522328e-06, "loss": 0.0009631961584091187, "memory(GiB)": 38.1, "reward": 0.6226778030395508, "reward_std": 0.08422938734292984, "rewards/VisualizationJSONCombinedORM/mean": 0.6226778030395508, "rewards/VisualizationJSONCombinedORM/std": 0.09794570505619049, "step": 4882, "train_speed(iter/s)": 0.100603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 322.5625, "completions/min_length": 247.0, "epoch": 4.0388751033912325, "grad_norm": 0.209730863571167, "kl": 0.030364990234375, "learning_rate": 1.0841646733195616e-06, "loss": 0.00030494295060634613, "memory(GiB)": 38.1, "reward": 0.7750124931335449, "reward_std": 0.04877394437789917, "rewards/VisualizationJSONCombinedORM/mean": 0.7750124931335449, "rewards/VisualizationJSONCombinedORM/std": 0.0611681304872036, "step": 4883, "train_speed(iter/s)": 0.100568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 285.25, "completions/min_length": 212.0, "epoch": 4.039702233250621, "grad_norm": 0.22010594606399536, "kl": 0.08282470703125, "learning_rate": 1.0823698503362989e-06, "loss": 0.0008263736963272095, "memory(GiB)": 38.1, "reward": 0.447098046541214, "reward_std": 0.02373400703072548, "rewards/VisualizationJSONCombinedORM/mean": 0.447098046541214, "rewards/VisualizationJSONCombinedORM/std": 0.16882005333900452, "step": 4884, "train_speed(iter/s)": 0.10053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 313.0, "completions/min_length": 235.0, "epoch": 4.040529363110008, "grad_norm": 0.1879481077194214, "kl": 0.05401611328125, "learning_rate": 1.0805763339010329e-06, "loss": 0.0005407258868217468, "memory(GiB)": 38.1, "reward": 0.5402262210845947, "reward_std": 0.02664494886994362, "rewards/VisualizationJSONCombinedORM/mean": 0.5402262210845947, "rewards/VisualizationJSONCombinedORM/std": 0.09418494254350662, "step": 4885, "train_speed(iter/s)": 0.100498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 315.625, "completions/min_length": 240.0, "epoch": 4.041356492969396, "grad_norm": 0.20192813873291016, "kl": 0.0533447265625, "learning_rate": 1.078784124611904e-06, "loss": 0.000533662736415863, "memory(GiB)": 38.1, "reward": 0.5428922176361084, "reward_std": 0.04848413169384003, "rewards/VisualizationJSONCombinedORM/mean": 0.5428922176361084, "rewards/VisualizationJSONCombinedORM/std": 0.22009533643722534, "step": 4886, "train_speed(iter/s)": 0.100471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 318.0625, "completions/min_length": 253.0, "epoch": 4.042183622828784, "grad_norm": 0.5056394338607788, "kl": 0.076904296875, "learning_rate": 1.0769932230666248e-06, "loss": 0.0007678885012865067, "memory(GiB)": 38.1, "reward": 0.7047903537750244, "reward_std": 0.07352294027805328, "rewards/VisualizationJSONCombinedORM/mean": 0.7047903537750244, "rewards/VisualizationJSONCombinedORM/std": 0.07831370085477829, "step": 4887, "train_speed(iter/s)": 0.100439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 302.5, "completions/min_length": 230.0, "epoch": 4.043010752688172, "grad_norm": 0.192593514919281, "kl": 0.0751953125, "learning_rate": 1.075203629862469e-06, "loss": 0.0007513053715229034, "memory(GiB)": 38.1, "reward": 0.7029837369918823, "reward_std": 0.08252665400505066, "rewards/VisualizationJSONCombinedORM/mean": 0.7029837369918823, "rewards/VisualizationJSONCombinedORM/std": 0.1112237498164177, "step": 4888, "train_speed(iter/s)": 0.100405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 309.4375, "completions/min_length": 224.0, "epoch": 4.04383788254756, "grad_norm": 0.2113843411207199, "kl": 0.032562255859375, "learning_rate": 1.0734153455962765e-06, "loss": 0.0003255307674407959, "memory(GiB)": 38.1, "reward": 0.429947167634964, "reward_std": 0.04453960061073303, "rewards/VisualizationJSONCombinedORM/mean": 0.429947167634964, "rewards/VisualizationJSONCombinedORM/std": 0.0886295959353447, "step": 4889, "train_speed(iter/s)": 0.100372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 333.875, "completions/min_length": 233.0, "epoch": 4.044665012406948, "grad_norm": 0.1964099407196045, "kl": 0.10638427734375, "learning_rate": 1.0716283708644431e-06, "loss": 0.0010652318596839905, "memory(GiB)": 38.1, "reward": 0.3612084686756134, "reward_std": 0.030177561566233635, "rewards/VisualizationJSONCombinedORM/mean": 0.3612084686756134, "rewards/VisualizationJSONCombinedORM/std": 0.22101546823978424, "step": 4890, "train_speed(iter/s)": 0.100336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 288.125, "completions/min_length": 246.0, "epoch": 4.045492142266336, "grad_norm": 0.2280145287513733, "kl": 0.0728759765625, "learning_rate": 1.0698427062629396e-06, "loss": 0.000729050487279892, "memory(GiB)": 38.1, "reward": 0.4983533024787903, "reward_std": 0.08487877994775772, "rewards/VisualizationJSONCombinedORM/mean": 0.4983533024787903, "rewards/VisualizationJSONCombinedORM/std": 0.1883847564458847, "step": 4891, "train_speed(iter/s)": 0.10031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 295.375, "completions/min_length": 242.0, "epoch": 4.046319272125724, "grad_norm": 0.24699953198432922, "kl": 0.03997802734375, "learning_rate": 1.0680583523872894e-06, "loss": 0.00039994344115257263, "memory(GiB)": 38.1, "reward": 0.8211046457290649, "reward_std": 0.09505107998847961, "rewards/VisualizationJSONCombinedORM/mean": 0.8211046457290649, "rewards/VisualizationJSONCombinedORM/std": 0.09306058287620544, "step": 4892, "train_speed(iter/s)": 0.100277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 303.75, "completions/min_length": 200.0, "epoch": 4.0471464019851116, "grad_norm": 0.15476536750793457, "kl": 0.042449951171875, "learning_rate": 1.066275309832584e-06, "loss": 0.0004243701696395874, "memory(GiB)": 38.1, "reward": 0.6260474920272827, "reward_std": 0.03513770550489426, "rewards/VisualizationJSONCombinedORM/mean": 0.6260474920272827, "rewards/VisualizationJSONCombinedORM/std": 0.21543699502944946, "step": 4893, "train_speed(iter/s)": 0.100242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 297.875, "completions/min_length": 243.0, "epoch": 4.0479735318445, "grad_norm": 0.2503628134727478, "kl": 0.120849609375, "learning_rate": 1.0644935791934763e-06, "loss": 0.0012074392288923264, "memory(GiB)": 38.1, "reward": 0.5087215900421143, "reward_std": 0.06377262622117996, "rewards/VisualizationJSONCombinedORM/mean": 0.5087215900421143, "rewards/VisualizationJSONCombinedORM/std": 0.07996430993080139, "step": 4894, "train_speed(iter/s)": 0.100208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 321.25, "completions/min_length": 236.0, "epoch": 4.048800661703887, "grad_norm": 0.16304916143417358, "kl": 0.0467529296875, "learning_rate": 1.0627131610641829e-06, "loss": 0.0004688650369644165, "memory(GiB)": 38.1, "reward": 0.4484400451183319, "reward_std": 0.021715689450502396, "rewards/VisualizationJSONCombinedORM/mean": 0.4484400451183319, "rewards/VisualizationJSONCombinedORM/std": 0.04504403844475746, "step": 4895, "train_speed(iter/s)": 0.100162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 333.5, "completions/min_length": 255.0, "epoch": 4.0496277915632755, "grad_norm": 0.21589110791683197, "kl": 0.03887939453125, "learning_rate": 1.0609340560384796e-06, "loss": 0.0003878399729728699, "memory(GiB)": 38.1, "reward": 0.6156769394874573, "reward_std": 0.056618496775627136, "rewards/VisualizationJSONCombinedORM/mean": 0.6156769394874573, "rewards/VisualizationJSONCombinedORM/std": 0.15611782670021057, "step": 4896, "train_speed(iter/s)": 0.100134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 306.0625, "completions/min_length": 243.0, "epoch": 4.050454921422664, "grad_norm": 0.1734752357006073, "kl": 0.032867431640625, "learning_rate": 1.059156264709707e-06, "loss": 0.00032830797135829926, "memory(GiB)": 38.1, "reward": 0.6570914387702942, "reward_std": 0.05738517642021179, "rewards/VisualizationJSONCombinedORM/mean": 0.6570914387702942, "rewards/VisualizationJSONCombinedORM/std": 0.05888935923576355, "step": 4897, "train_speed(iter/s)": 0.100105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 329.0, "completions/min_length": 256.0, "epoch": 4.051282051282051, "grad_norm": 0.2041478157043457, "kl": 0.09326171875, "learning_rate": 1.0573797876707676e-06, "loss": 0.0009320899844169617, "memory(GiB)": 38.1, "reward": 0.5182421803474426, "reward_std": 0.052619025111198425, "rewards/VisualizationJSONCombinedORM/mean": 0.5182421803474426, "rewards/VisualizationJSONCombinedORM/std": 0.07083238661289215, "step": 4898, "train_speed(iter/s)": 0.10006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 317.625, "completions/min_length": 259.0, "epoch": 4.052109181141439, "grad_norm": 0.17154169082641602, "kl": 0.0677490234375, "learning_rate": 1.055604625514125e-06, "loss": 0.0006791092455387115, "memory(GiB)": 38.1, "reward": 0.2558547258377075, "reward_std": 0.021445654332637787, "rewards/VisualizationJSONCombinedORM/mean": 0.2558547258377075, "rewards/VisualizationJSONCombinedORM/std": 0.07068075984716415, "step": 4899, "train_speed(iter/s)": 0.100025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 318.4375, "completions/min_length": 234.0, "epoch": 4.052936311000827, "grad_norm": 0.18116848170757294, "kl": 0.0726318359375, "learning_rate": 1.0538307788318014e-06, "loss": 0.000725962221622467, "memory(GiB)": 38.1, "reward": 0.5135566592216492, "reward_std": 0.04936311021447182, "rewards/VisualizationJSONCombinedORM/mean": 0.5135566592216492, "rewards/VisualizationJSONCombinedORM/std": 0.15317998826503754, "step": 4900, "train_speed(iter/s)": 0.099993 }, { "epoch": 4.052936311000827, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 369.4166666666667, "eval_completions/mean_length": 308.4270833333333, "eval_completions/min_length": 255.875, "eval_kl": 0.06408182779947917, "eval_loss": 0.0006460497970692813, "eval_reward": 0.42462821925679844, "eval_reward_std": 0.04409016081141696, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.42462821925679844, "eval_rewards/VisualizationJSONCombinedORM/std": 0.044090161005442496, "eval_runtime": 314.4113, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 4900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 273.6875, "completions/min_length": 233.0, "epoch": 4.053763440860215, "grad_norm": 0.16963164508342743, "kl": 0.0303955078125, "learning_rate": 1.0520582482153874e-06, "loss": 0.0003043077886104584, "memory(GiB)": 38.1, "reward": 0.6335272192955017, "reward_std": 0.08391699194908142, "rewards/VisualizationJSONCombinedORM/mean": 0.6335272192955017, "rewards/VisualizationJSONCombinedORM/std": 0.15717031061649323, "step": 4901, "train_speed(iter/s)": 0.099323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 313.3125, "completions/min_length": 249.0, "epoch": 4.054590570719603, "grad_norm": 0.15320929884910583, "kl": 0.033905029296875, "learning_rate": 1.0502870342560262e-06, "loss": 0.0003396347165107727, "memory(GiB)": 38.1, "reward": 0.47950541973114014, "reward_std": 0.05393289029598236, "rewards/VisualizationJSONCombinedORM/mean": 0.47950541973114014, "rewards/VisualizationJSONCombinedORM/std": 0.07628650963306427, "step": 4902, "train_speed(iter/s)": 0.099292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 323.4375, "completions/min_length": 270.0, "epoch": 4.055417700578991, "grad_norm": 0.17771361768245697, "kl": 0.0665283203125, "learning_rate": 1.0485171375444275e-06, "loss": 0.0006636828184127808, "memory(GiB)": 38.1, "reward": 0.6779524683952332, "reward_std": 0.05183468014001846, "rewards/VisualizationJSONCombinedORM/mean": 0.6779524683952332, "rewards/VisualizationJSONCombinedORM/std": 0.05894952267408371, "step": 4903, "train_speed(iter/s)": 0.099263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 295.1875, "completions/min_length": 231.0, "epoch": 4.056244830438379, "grad_norm": 0.2735309600830078, "kl": 0.0611572265625, "learning_rate": 1.046748558670861e-06, "loss": 0.0006128139793872833, "memory(GiB)": 38.1, "reward": 0.44646745920181274, "reward_std": 0.04876402020454407, "rewards/VisualizationJSONCombinedORM/mean": 0.44646745920181274, "rewards/VisualizationJSONCombinedORM/std": 0.08422403037548065, "step": 4904, "train_speed(iter/s)": 0.09923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 303.4375, "completions/min_length": 243.0, "epoch": 4.057071960297767, "grad_norm": 0.1693170815706253, "kl": 0.05902099609375, "learning_rate": 1.0449812982251556e-06, "loss": 0.000592036172747612, "memory(GiB)": 38.1, "reward": 0.28230148553848267, "reward_std": 0.016766980290412903, "rewards/VisualizationJSONCombinedORM/mean": 0.28230148553848267, "rewards/VisualizationJSONCombinedORM/std": 0.04591245949268341, "step": 4905, "train_speed(iter/s)": 0.0992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 314.875, "completions/min_length": 252.0, "epoch": 4.0578990901571546, "grad_norm": 0.21249055862426758, "kl": 0.054443359375, "learning_rate": 1.0432153567966985e-06, "loss": 0.0005445331335067749, "memory(GiB)": 38.1, "reward": 0.4752870798110962, "reward_std": 0.052193425595760345, "rewards/VisualizationJSONCombinedORM/mean": 0.4752870798110962, "rewards/VisualizationJSONCombinedORM/std": 0.06126519292593002, "step": 4906, "train_speed(iter/s)": 0.099175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 311.3125, "completions/min_length": 237.0, "epoch": 4.058726220016543, "grad_norm": 0.20349062979221344, "kl": 0.04473876953125, "learning_rate": 1.041450734974444e-06, "loss": 0.0004462301731109619, "memory(GiB)": 38.1, "reward": 0.5869297385215759, "reward_std": 0.050757162272930145, "rewards/VisualizationJSONCombinedORM/mean": 0.5869297385215759, "rewards/VisualizationJSONCombinedORM/std": 0.14205993711948395, "step": 4907, "train_speed(iter/s)": 0.099135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 331.75, "completions/min_length": 263.0, "epoch": 4.05955334987593, "grad_norm": 0.2382700890302658, "kl": 0.1507568359375, "learning_rate": 1.0396874333468981e-06, "loss": 0.001508796587586403, "memory(GiB)": 38.1, "reward": 0.3467154800891876, "reward_std": 0.04321581870317459, "rewards/VisualizationJSONCombinedORM/mean": 0.3467154800891876, "rewards/VisualizationJSONCombinedORM/std": 0.1477627009153366, "step": 4908, "train_speed(iter/s)": 0.099107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 292.8125, "completions/min_length": 233.0, "epoch": 4.0603804797353185, "grad_norm": 0.18220970034599304, "kl": 0.07904052734375, "learning_rate": 1.037925452502131e-06, "loss": 0.0007910635322332382, "memory(GiB)": 38.1, "reward": 0.48278287053108215, "reward_std": 0.08559437096118927, "rewards/VisualizationJSONCombinedORM/mean": 0.48278287053108215, "rewards/VisualizationJSONCombinedORM/std": 0.08755913376808167, "step": 4909, "train_speed(iter/s)": 0.099077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 301.8125, "completions/min_length": 248.0, "epoch": 4.061207609594707, "grad_norm": 0.19505736231803894, "kl": 0.10400390625, "learning_rate": 1.0361647930277719e-06, "loss": 0.0010391250252723694, "memory(GiB)": 38.1, "reward": 0.46600717306137085, "reward_std": 0.05924099311232567, "rewards/VisualizationJSONCombinedORM/mean": 0.46600717306137085, "rewards/VisualizationJSONCombinedORM/std": 0.14811956882476807, "step": 4910, "train_speed(iter/s)": 0.099046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 282.1875, "completions/min_length": 213.0, "epoch": 4.062034739454094, "grad_norm": 0.18751949071884155, "kl": 0.06097412109375, "learning_rate": 1.0344054555110105e-06, "loss": 0.0006087366491556168, "memory(GiB)": 38.1, "reward": 0.5047430992126465, "reward_std": 0.07908543944358826, "rewards/VisualizationJSONCombinedORM/mean": 0.5047430992126465, "rewards/VisualizationJSONCombinedORM/std": 0.09758654981851578, "step": 4911, "train_speed(iter/s)": 0.099024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 304.5625, "completions/min_length": 236.0, "epoch": 4.062861869313482, "grad_norm": 0.21321897208690643, "kl": 0.05072021484375, "learning_rate": 1.0326474405385906e-06, "loss": 0.0005059656105004251, "memory(GiB)": 38.1, "reward": 0.30239951610565186, "reward_std": 0.02647225186228752, "rewards/VisualizationJSONCombinedORM/mean": 0.30239951610565186, "rewards/VisualizationJSONCombinedORM/std": 0.047748956829309464, "step": 4912, "train_speed(iter/s)": 0.098989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 279.25, "completions/min_length": 228.0, "epoch": 4.06368899917287, "grad_norm": 0.21630451083183289, "kl": 0.073974609375, "learning_rate": 1.0308907486968212e-06, "loss": 0.000741422176361084, "memory(GiB)": 38.1, "reward": 0.4786374568939209, "reward_std": 0.05192501097917557, "rewards/VisualizationJSONCombinedORM/mean": 0.4786374568939209, "rewards/VisualizationJSONCombinedORM/std": 0.1558094024658203, "step": 4913, "train_speed(iter/s)": 0.098957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 290.625, "completions/min_length": 235.0, "epoch": 4.064516129032258, "grad_norm": 0.22954405844211578, "kl": 0.1298828125, "learning_rate": 1.029135380571566e-06, "loss": 0.001294657588005066, "memory(GiB)": 38.1, "reward": 0.589368462562561, "reward_std": 0.07894362509250641, "rewards/VisualizationJSONCombinedORM/mean": 0.589368462562561, "rewards/VisualizationJSONCombinedORM/std": 0.15962225198745728, "step": 4914, "train_speed(iter/s)": 0.098934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 318.0625, "completions/min_length": 202.0, "epoch": 4.065343258891646, "grad_norm": 0.18848329782485962, "kl": 0.1654052734375, "learning_rate": 1.02738133674825e-06, "loss": 0.001654200255870819, "memory(GiB)": 38.1, "reward": 0.32044804096221924, "reward_std": 0.03546445816755295, "rewards/VisualizationJSONCombinedORM/mean": 0.32044804096221924, "rewards/VisualizationJSONCombinedORM/std": 0.10898903757333755, "step": 4915, "train_speed(iter/s)": 0.098898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 298.5625, "completions/min_length": 234.0, "epoch": 4.066170388751034, "grad_norm": 0.20426751673221588, "kl": 0.05340576171875, "learning_rate": 1.0256286178118513e-06, "loss": 0.0005331635475158691, "memory(GiB)": 38.1, "reward": 0.4220966696739197, "reward_std": 0.04162311553955078, "rewards/VisualizationJSONCombinedORM/mean": 0.4220966696739197, "rewards/VisualizationJSONCombinedORM/std": 0.1342955380678177, "step": 4916, "train_speed(iter/s)": 0.098865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 310.0625, "completions/min_length": 245.0, "epoch": 4.066997518610422, "grad_norm": 0.2285909503698349, "kl": 0.06976318359375, "learning_rate": 1.0238772243469153e-06, "loss": 0.0006971918046474457, "memory(GiB)": 38.1, "reward": 0.40573087334632874, "reward_std": 0.05899197235703468, "rewards/VisualizationJSONCombinedORM/mean": 0.40573087334632874, "rewards/VisualizationJSONCombinedORM/std": 0.07938210666179657, "step": 4917, "train_speed(iter/s)": 0.098836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 302.0625, "completions/min_length": 233.0, "epoch": 4.06782464846981, "grad_norm": 0.17604653537273407, "kl": 0.07122802734375, "learning_rate": 1.0221271569375356e-06, "loss": 0.0007112850435078144, "memory(GiB)": 38.1, "reward": 0.5997824668884277, "reward_std": 0.04577039182186127, "rewards/VisualizationJSONCombinedORM/mean": 0.5997824668884277, "rewards/VisualizationJSONCombinedORM/std": 0.10447447001934052, "step": 4918, "train_speed(iter/s)": 0.098798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 294.9375, "completions/min_length": 246.0, "epoch": 4.0686517783291976, "grad_norm": 0.20305787026882172, "kl": 0.0887451171875, "learning_rate": 1.0203784161673697e-06, "loss": 0.0008877553045749664, "memory(GiB)": 38.1, "reward": 0.43767425417900085, "reward_std": 0.07450978457927704, "rewards/VisualizationJSONCombinedORM/mean": 0.43767425417900085, "rewards/VisualizationJSONCombinedORM/std": 0.08650338649749756, "step": 4919, "train_speed(iter/s)": 0.098773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 324.125, "completions/min_length": 258.0, "epoch": 4.069478908188586, "grad_norm": 0.22568075358867645, "kl": 0.03485107421875, "learning_rate": 1.01863100261963e-06, "loss": 0.00034915655851364136, "memory(GiB)": 38.1, "reward": 0.44133391976356506, "reward_std": 0.05339115485548973, "rewards/VisualizationJSONCombinedORM/mean": 0.44133391976356506, "rewards/VisualizationJSONCombinedORM/std": 0.16450347006320953, "step": 4920, "train_speed(iter/s)": 0.098742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 305.0, "completions/min_length": 248.0, "epoch": 4.070306038047973, "grad_norm": 0.19017061591148376, "kl": 0.06982421875, "learning_rate": 1.0168849168770894e-06, "loss": 0.000699356198310852, "memory(GiB)": 38.1, "reward": 0.36512717604637146, "reward_std": 0.021276336163282394, "rewards/VisualizationJSONCombinedORM/mean": 0.36512717604637146, "rewards/VisualizationJSONCombinedORM/std": 0.08033697307109833, "step": 4921, "train_speed(iter/s)": 0.098719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 317.375, "completions/min_length": 253.0, "epoch": 4.0711331679073615, "grad_norm": 0.22914589941501617, "kl": 0.0732421875, "learning_rate": 1.0151401595220712e-06, "loss": 0.0007329583168029785, "memory(GiB)": 38.1, "reward": 0.5544202327728271, "reward_std": 0.07504265010356903, "rewards/VisualizationJSONCombinedORM/mean": 0.5544202327728271, "rewards/VisualizationJSONCombinedORM/std": 0.07725272327661514, "step": 4922, "train_speed(iter/s)": 0.098688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 320.0, "completions/min_length": 257.0, "epoch": 4.07196029776675, "grad_norm": 0.18734951317310333, "kl": 0.06536865234375, "learning_rate": 1.013396731136465e-06, "loss": 0.0006537046283483505, "memory(GiB)": 38.1, "reward": 0.6458930969238281, "reward_std": 0.06134053319692612, "rewards/VisualizationJSONCombinedORM/mean": 0.6458930969238281, "rewards/VisualizationJSONCombinedORM/std": 0.09882756322622299, "step": 4923, "train_speed(iter/s)": 0.098648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 306.875, "completions/min_length": 246.0, "epoch": 4.072787427626137, "grad_norm": 0.154156893491745, "kl": 0.0772705078125, "learning_rate": 1.011654632301709e-06, "loss": 0.0007738727144896984, "memory(GiB)": 38.1, "reward": 0.5980187654495239, "reward_std": 0.07856409251689911, "rewards/VisualizationJSONCombinedORM/mean": 0.5980187654495239, "rewards/VisualizationJSONCombinedORM/std": 0.187578484416008, "step": 4924, "train_speed(iter/s)": 0.098614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 308.8125, "completions/min_length": 235.0, "epoch": 4.073614557485525, "grad_norm": 0.2553057074546814, "kl": 0.04974365234375, "learning_rate": 1.0099138635988026e-06, "loss": 0.0004979558289051056, "memory(GiB)": 38.1, "reward": 0.5115492343902588, "reward_std": 0.05734837055206299, "rewards/VisualizationJSONCombinedORM/mean": 0.5115492343902588, "rewards/VisualizationJSONCombinedORM/std": 0.12689058482646942, "step": 4925, "train_speed(iter/s)": 0.098584 }, { "epoch": 4.073614557485525, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 369.2083333333333, "eval_completions/mean_length": 303.1354166666667, "eval_completions/min_length": 250.5, "eval_kl": 0.0720062255859375, "eval_loss": 0.0007178199593909085, "eval_reward": 0.4305222301433484, "eval_reward_std": 0.04550531487135837, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4305222301433484, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04550531231022129, "eval_runtime": 314.1801, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 4925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 333.9375, "completions/min_length": 224.0, "epoch": 4.074441687344913, "grad_norm": 0.19624845683574677, "kl": 0.064453125, "learning_rate": 1.0081744256083003e-06, "loss": 0.0006447508931159973, "memory(GiB)": 38.1, "reward": 0.21169963479042053, "reward_std": 0.020427964627742767, "rewards/VisualizationJSONCombinedORM/mean": 0.21169963479042053, "rewards/VisualizationJSONCombinedORM/std": 0.038172535598278046, "step": 4926, "train_speed(iter/s)": 0.097931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 327.5, "completions/min_length": 260.0, "epoch": 4.075268817204301, "grad_norm": 0.19081050157546997, "kl": 0.0802001953125, "learning_rate": 1.0064363189103138e-06, "loss": 0.0008030235767364502, "memory(GiB)": 38.1, "reward": 0.333588182926178, "reward_std": 0.03359319642186165, "rewards/VisualizationJSONCombinedORM/mean": 0.333588182926178, "rewards/VisualizationJSONCombinedORM/std": 0.052895355969667435, "step": 4927, "train_speed(iter/s)": 0.0979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 316.5625, "completions/min_length": 243.0, "epoch": 4.076095947063689, "grad_norm": 0.18478301167488098, "kl": 0.030853271484375, "learning_rate": 1.0046995440845076e-06, "loss": 0.0003089606761932373, "memory(GiB)": 38.1, "reward": 0.5523840188980103, "reward_std": 0.03687269613146782, "rewards/VisualizationJSONCombinedORM/mean": 0.5523840188980103, "rewards/VisualizationJSONCombinedORM/std": 0.1398262083530426, "step": 4928, "train_speed(iter/s)": 0.097861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 348.1875, "completions/min_length": 259.0, "epoch": 4.076923076923077, "grad_norm": 0.19416478276252747, "kl": 0.0352783203125, "learning_rate": 1.0029641017101049e-06, "loss": 0.00035257264971733093, "memory(GiB)": 38.1, "reward": 0.5099250674247742, "reward_std": 0.029788941144943237, "rewards/VisualizationJSONCombinedORM/mean": 0.5099250674247742, "rewards/VisualizationJSONCombinedORM/std": 0.28457245230674744, "step": 4929, "train_speed(iter/s)": 0.097827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 278.5, "completions/min_length": 201.0, "epoch": 4.077750206782465, "grad_norm": 0.24985814094543457, "kl": 0.1044921875, "learning_rate": 1.0012299923658848e-06, "loss": 0.0010440461337566376, "memory(GiB)": 38.1, "reward": 0.48076000809669495, "reward_std": 0.07102532684803009, "rewards/VisualizationJSONCombinedORM/mean": 0.48076000809669495, "rewards/VisualizationJSONCombinedORM/std": 0.17904140055179596, "step": 4930, "train_speed(iter/s)": 0.097797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 334.3125, "completions/min_length": 268.0, "epoch": 4.078577336641853, "grad_norm": 0.19997814297676086, "kl": 0.06512451171875, "learning_rate": 9.994972166301815e-07, "loss": 0.0006506219506263733, "memory(GiB)": 38.1, "reward": 0.43885207176208496, "reward_std": 0.04687177389860153, "rewards/VisualizationJSONCombinedORM/mean": 0.43885207176208496, "rewards/VisualizationJSONCombinedORM/std": 0.04711536690592766, "step": 4931, "train_speed(iter/s)": 0.097771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 284.125, "completions/min_length": 236.0, "epoch": 4.079404466501241, "grad_norm": 0.18312948942184448, "kl": 0.04296875, "learning_rate": 9.977657750808794e-07, "loss": 0.00042901188135147095, "memory(GiB)": 38.1, "reward": 0.7444895505905151, "reward_std": 0.08500669151544571, "rewards/VisualizationJSONCombinedORM/mean": 0.7444895505905151, "rewards/VisualizationJSONCombinedORM/std": 0.08260779082775116, "step": 4932, "train_speed(iter/s)": 0.097742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 301.4375, "completions/min_length": 227.0, "epoch": 4.080231596360629, "grad_norm": 0.1892334520816803, "kl": 0.0875244140625, "learning_rate": 9.960356682954293e-07, "loss": 0.0008732136338949203, "memory(GiB)": 38.1, "reward": 0.618268609046936, "reward_std": 0.07891236245632172, "rewards/VisualizationJSONCombinedORM/mean": 0.618268609046936, "rewards/VisualizationJSONCombinedORM/std": 0.19440007209777832, "step": 4933, "train_speed(iter/s)": 0.097716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 304.25, "completions/min_length": 241.0, "epoch": 4.081058726220016, "grad_norm": 0.18542692065238953, "kl": 0.0943603515625, "learning_rate": 9.943068968508247e-07, "loss": 0.0009418018162250519, "memory(GiB)": 38.1, "reward": 0.4695553183555603, "reward_std": 0.053328849375247955, "rewards/VisualizationJSONCombinedORM/mean": 0.4695553183555603, "rewards/VisualizationJSONCombinedORM/std": 0.07350150495767593, "step": 4934, "train_speed(iter/s)": 0.097688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 288.875, "completions/min_length": 248.0, "epoch": 4.0818858560794045, "grad_norm": 0.23198239505290985, "kl": 0.06591796875, "learning_rate": 9.925794613236201e-07, "loss": 0.0006600134074687958, "memory(GiB)": 38.1, "reward": 0.7137503623962402, "reward_std": 0.11380620300769806, "rewards/VisualizationJSONCombinedORM/mean": 0.7137503623962402, "rewards/VisualizationJSONCombinedORM/std": 0.1548888087272644, "step": 4935, "train_speed(iter/s)": 0.097658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 289.75, "completions/min_length": 212.0, "epoch": 4.082712985938793, "grad_norm": 0.23535649478435516, "kl": 0.05169677734375, "learning_rate": 9.90853362289924e-07, "loss": 0.0005178637802600861, "memory(GiB)": 38.1, "reward": 0.3079710006713867, "reward_std": 0.038321178406476974, "rewards/VisualizationJSONCombinedORM/mean": 0.3079710006713867, "rewards/VisualizationJSONCombinedORM/std": 0.06294204294681549, "step": 4936, "train_speed(iter/s)": 0.097625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 297.4375, "completions/min_length": 240.0, "epoch": 4.08354011579818, "grad_norm": 0.21740002930164337, "kl": 0.064208984375, "learning_rate": 9.891286003253997e-07, "loss": 0.0006429683417081833, "memory(GiB)": 38.1, "reward": 0.3678009510040283, "reward_std": 0.04878785461187363, "rewards/VisualizationJSONCombinedORM/mean": 0.3678009510040283, "rewards/VisualizationJSONCombinedORM/std": 0.1291830986738205, "step": 4937, "train_speed(iter/s)": 0.097588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/mean_length": 314.4375, "completions/min_length": 231.0, "epoch": 4.084367245657568, "grad_norm": 0.19643521308898926, "kl": 0.0546875, "learning_rate": 9.874051760052594e-07, "loss": 0.0005468204617500305, "memory(GiB)": 38.1, "reward": 0.5242640972137451, "reward_std": 0.055304549634456635, "rewards/VisualizationJSONCombinedORM/mean": 0.5242640972137451, "rewards/VisualizationJSONCombinedORM/std": 0.06214836612343788, "step": 4938, "train_speed(iter/s)": 0.097558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 300.5625, "completions/min_length": 215.0, "epoch": 4.085194375516956, "grad_norm": 0.2264510691165924, "kl": 0.05120849609375, "learning_rate": 9.856830899042779e-07, "loss": 0.0005124099552631378, "memory(GiB)": 38.1, "reward": 0.6202274560928345, "reward_std": 0.07183543592691422, "rewards/VisualizationJSONCombinedORM/mean": 0.6202274560928345, "rewards/VisualizationJSONCombinedORM/std": 0.19015049934387207, "step": 4939, "train_speed(iter/s)": 0.097534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 329.3125, "completions/min_length": 247.0, "epoch": 4.086021505376344, "grad_norm": 0.16500847041606903, "kl": 0.059326171875, "learning_rate": 9.83962342596776e-07, "loss": 0.000594213604927063, "memory(GiB)": 38.1, "reward": 0.6516923904418945, "reward_std": 0.03603651374578476, "rewards/VisualizationJSONCombinedORM/mean": 0.6516923904418945, "rewards/VisualizationJSONCombinedORM/std": 0.10417322814464569, "step": 4940, "train_speed(iter/s)": 0.097499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 288.75, "completions/min_length": 217.0, "epoch": 4.086848635235732, "grad_norm": 0.18943384289741516, "kl": 0.05029296875, "learning_rate": 9.822429346566314e-07, "loss": 0.0005020573735237122, "memory(GiB)": 38.1, "reward": 0.48666635155677795, "reward_std": 0.029835566878318787, "rewards/VisualizationJSONCombinedORM/mean": 0.48666635155677795, "rewards/VisualizationJSONCombinedORM/std": 0.13356135785579681, "step": 4941, "train_speed(iter/s)": 0.097466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 274.625, "completions/min_length": 229.0, "epoch": 4.08767576509512, "grad_norm": 0.16757771372795105, "kl": 0.049072265625, "learning_rate": 9.805248666572753e-07, "loss": 0.0004909783601760864, "memory(GiB)": 38.1, "reward": 0.7529796361923218, "reward_std": 0.07315582782030106, "rewards/VisualizationJSONCombinedORM/mean": 0.7529796361923218, "rewards/VisualizationJSONCombinedORM/std": 0.1001843512058258, "step": 4942, "train_speed(iter/s)": 0.097433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 308.625, "completions/min_length": 259.0, "epoch": 4.088502894954508, "grad_norm": 0.1579296737909317, "kl": 0.0439453125, "learning_rate": 9.788081391716913e-07, "loss": 0.0004386976361274719, "memory(GiB)": 38.1, "reward": 0.597072958946228, "reward_std": 0.05004621297121048, "rewards/VisualizationJSONCombinedORM/mean": 0.597072958946228, "rewards/VisualizationJSONCombinedORM/std": 0.05094580724835396, "step": 4943, "train_speed(iter/s)": 0.097404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 338.4375, "completions/min_length": 257.0, "epoch": 4.089330024813896, "grad_norm": 0.18574944138526917, "kl": 0.0919189453125, "learning_rate": 9.770927527724173e-07, "loss": 0.00091929966583848, "memory(GiB)": 38.1, "reward": 0.5834255218505859, "reward_std": 0.06773027032613754, "rewards/VisualizationJSONCombinedORM/mean": 0.5834255218505859, "rewards/VisualizationJSONCombinedORM/std": 0.09638885408639908, "step": 4944, "train_speed(iter/s)": 0.097369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 333.3125, "completions/min_length": 247.0, "epoch": 4.090157154673284, "grad_norm": 0.19630013406276703, "kl": 0.09521484375, "learning_rate": 9.753787080315385e-07, "loss": 0.0009527206420898438, "memory(GiB)": 38.1, "reward": 0.6711946725845337, "reward_std": 0.06103289872407913, "rewards/VisualizationJSONCombinedORM/mean": 0.6711946725845337, "rewards/VisualizationJSONCombinedORM/std": 0.19368183612823486, "step": 4945, "train_speed(iter/s)": 0.097324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 289.4375, "completions/min_length": 230.0, "epoch": 4.090984284532672, "grad_norm": 0.1944998949766159, "kl": 0.1893310546875, "learning_rate": 9.73666005520703e-07, "loss": 0.001897267997264862, "memory(GiB)": 38.1, "reward": 0.5856744050979614, "reward_std": 0.07318969070911407, "rewards/VisualizationJSONCombinedORM/mean": 0.5856744050979614, "rewards/VisualizationJSONCombinedORM/std": 0.12566716969013214, "step": 4946, "train_speed(iter/s)": 0.097286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 314.8125, "completions/min_length": 244.0, "epoch": 4.091811414392059, "grad_norm": 0.24414652585983276, "kl": 0.1009521484375, "learning_rate": 9.719546458111002e-07, "loss": 0.0010084807872772217, "memory(GiB)": 38.1, "reward": 0.3427681028842926, "reward_std": 0.04916710779070854, "rewards/VisualizationJSONCombinedORM/mean": 0.3427681028842926, "rewards/VisualizationJSONCombinedORM/std": 0.10817200690507889, "step": 4947, "train_speed(iter/s)": 0.097253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 299.5625, "completions/min_length": 235.0, "epoch": 4.0926385442514475, "grad_norm": 0.22843901813030243, "kl": 0.0498046875, "learning_rate": 9.702446294734775e-07, "loss": 0.0004985406994819641, "memory(GiB)": 38.1, "reward": 0.5693666934967041, "reward_std": 0.07775674760341644, "rewards/VisualizationJSONCombinedORM/mean": 0.5693666934967041, "rewards/VisualizationJSONCombinedORM/std": 0.19784590601921082, "step": 4948, "train_speed(iter/s)": 0.097233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 318.75, "completions/min_length": 243.0, "epoch": 4.093465674110836, "grad_norm": 0.21512259542942047, "kl": 0.04730224609375, "learning_rate": 9.685359570781344e-07, "loss": 0.0004728250205516815, "memory(GiB)": 38.1, "reward": 0.6314871311187744, "reward_std": 0.06741210073232651, "rewards/VisualizationJSONCombinedORM/mean": 0.6314871311187744, "rewards/VisualizationJSONCombinedORM/std": 0.13532587885856628, "step": 4949, "train_speed(iter/s)": 0.097203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 293.5625, "completions/min_length": 236.0, "epoch": 4.094292803970223, "grad_norm": 0.24026037752628326, "kl": 0.03857421875, "learning_rate": 9.668286291949224e-07, "loss": 0.00038587674498558044, "memory(GiB)": 38.1, "reward": 0.6582702398300171, "reward_std": 0.11318324506282806, "rewards/VisualizationJSONCombinedORM/mean": 0.6582702398300171, "rewards/VisualizationJSONCombinedORM/std": 0.11529666930437088, "step": 4950, "train_speed(iter/s)": 0.097165 }, { "epoch": 4.094292803970223, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 370.5, "eval_completions/mean_length": 308.9739583333333, "eval_completions/min_length": 261.75, "eval_kl": 0.087127685546875, "eval_loss": 0.0008757002651691437, "eval_reward": 0.4349748690923055, "eval_reward_std": 0.054278976594408355, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4349748690923055, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05427897810780754, "eval_runtime": 314.8407, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 4950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 310.4375, "completions/min_length": 251.0, "epoch": 4.095119933829611, "grad_norm": 0.19580641388893127, "kl": 0.0906982421875, "learning_rate": 9.651226463932406e-07, "loss": 0.0009066332131624222, "memory(GiB)": 38.1, "reward": 0.6572884321212769, "reward_std": 0.03038639761507511, "rewards/VisualizationJSONCombinedORM/mean": 0.6572884321212769, "rewards/VisualizationJSONCombinedORM/std": 0.1950058490037918, "step": 4951, "train_speed(iter/s)": 0.096542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 265.9375, "completions/min_length": 225.0, "epoch": 4.095947063688999, "grad_norm": 0.1874237060546875, "kl": 0.03216552734375, "learning_rate": 9.634180092420426e-07, "loss": 0.00032136961817741394, "memory(GiB)": 38.1, "reward": 0.7074539661407471, "reward_std": 0.04526811093091965, "rewards/VisualizationJSONCombinedORM/mean": 0.7074539661407471, "rewards/VisualizationJSONCombinedORM/std": 0.11548483371734619, "step": 4952, "train_speed(iter/s)": 0.096512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 310.5625, "completions/min_length": 250.0, "epoch": 4.096774193548387, "grad_norm": 0.2686711549758911, "kl": 0.0496826171875, "learning_rate": 9.617147183098341e-07, "loss": 0.000497911125421524, "memory(GiB)": 38.1, "reward": 0.6601823568344116, "reward_std": 0.055572524666786194, "rewards/VisualizationJSONCombinedORM/mean": 0.6601823568344116, "rewards/VisualizationJSONCombinedORM/std": 0.060574743896722794, "step": 4953, "train_speed(iter/s)": 0.096488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 294.875, "completions/min_length": 236.0, "epoch": 4.097601323407775, "grad_norm": 0.24014292657375336, "kl": 0.03375244140625, "learning_rate": 9.600127741646713e-07, "loss": 0.0003371387720108032, "memory(GiB)": 38.1, "reward": 0.6972440481185913, "reward_std": 0.11671364307403564, "rewards/VisualizationJSONCombinedORM/mean": 0.6972440481185913, "rewards/VisualizationJSONCombinedORM/std": 0.1290184110403061, "step": 4954, "train_speed(iter/s)": 0.09646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 281.125, "completions/min_length": 235.0, "epoch": 4.098428453267163, "grad_norm": 0.18047745525836945, "kl": 0.05364990234375, "learning_rate": 9.583121773741571e-07, "loss": 0.0005365535616874695, "memory(GiB)": 38.1, "reward": 0.5047858953475952, "reward_std": 0.06928391009569168, "rewards/VisualizationJSONCombinedORM/mean": 0.5047858953475952, "rewards/VisualizationJSONCombinedORM/std": 0.22876495122909546, "step": 4955, "train_speed(iter/s)": 0.096427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 296.875, "completions/min_length": 247.0, "epoch": 4.099255583126551, "grad_norm": 0.1987265646457672, "kl": 0.0516357421875, "learning_rate": 9.566129285054531e-07, "loss": 0.0005165226757526398, "memory(GiB)": 38.1, "reward": 0.5439637899398804, "reward_std": 0.059995152056217194, "rewards/VisualizationJSONCombinedORM/mean": 0.5439637899398804, "rewards/VisualizationJSONCombinedORM/std": 0.21902737021446228, "step": 4956, "train_speed(iter/s)": 0.096396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 312.9375, "completions/min_length": 229.0, "epoch": 4.100082712985939, "grad_norm": 0.16471698880195618, "kl": 0.0623779296875, "learning_rate": 9.549150281252633e-07, "loss": 0.0006257109344005585, "memory(GiB)": 38.1, "reward": 0.6321605443954468, "reward_std": 0.05655314773321152, "rewards/VisualizationJSONCombinedORM/mean": 0.6321605443954468, "rewards/VisualizationJSONCombinedORM/std": 0.06722650676965714, "step": 4957, "train_speed(iter/s)": 0.096366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 327.6875, "completions/min_length": 237.0, "epoch": 4.100909842845327, "grad_norm": 0.2146427184343338, "kl": 0.0745849609375, "learning_rate": 9.532184767998465e-07, "loss": 0.0007455721497535706, "memory(GiB)": 38.1, "reward": 0.6850210428237915, "reward_std": 0.09177254140377045, "rewards/VisualizationJSONCombinedORM/mean": 0.6850210428237915, "rewards/VisualizationJSONCombinedORM/std": 0.09049361199140549, "step": 4958, "train_speed(iter/s)": 0.096337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 309.625, "completions/min_length": 247.0, "epoch": 4.101736972704715, "grad_norm": 0.19315344095230103, "kl": 0.0484619140625, "learning_rate": 9.515232750950104e-07, "loss": 0.0004848204553127289, "memory(GiB)": 38.1, "reward": 0.5447782278060913, "reward_std": 0.07233095914125443, "rewards/VisualizationJSONCombinedORM/mean": 0.5447782278060913, "rewards/VisualizationJSONCombinedORM/std": 0.2026495635509491, "step": 4959, "train_speed(iter/s)": 0.096317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 290.75, "completions/min_length": 224.0, "epoch": 4.102564102564102, "grad_norm": 0.2090546190738678, "kl": 0.088134765625, "learning_rate": 9.498294235761141e-07, "loss": 0.000880897045135498, "memory(GiB)": 38.1, "reward": 0.351967990398407, "reward_std": 0.034280069172382355, "rewards/VisualizationJSONCombinedORM/mean": 0.351967990398407, "rewards/VisualizationJSONCombinedORM/std": 0.13596229255199432, "step": 4960, "train_speed(iter/s)": 0.096293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 298.0625, "completions/min_length": 232.0, "epoch": 4.1033912324234905, "grad_norm": 0.30531367659568787, "kl": 0.0982666015625, "learning_rate": 9.481369228080611e-07, "loss": 0.000982377678155899, "memory(GiB)": 38.1, "reward": 0.3786122798919678, "reward_std": 0.07351380586624146, "rewards/VisualizationJSONCombinedORM/mean": 0.3786122798919678, "rewards/VisualizationJSONCombinedORM/std": 0.18373481929302216, "step": 4961, "train_speed(iter/s)": 0.096254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 307.9375, "completions/min_length": 228.0, "epoch": 4.104218362282879, "grad_norm": 0.1959134191274643, "kl": 0.03851318359375, "learning_rate": 9.46445773355314e-07, "loss": 0.00038528069853782654, "memory(GiB)": 38.1, "reward": 0.5800746083259583, "reward_std": 0.03144500404596329, "rewards/VisualizationJSONCombinedORM/mean": 0.5800746083259583, "rewards/VisualizationJSONCombinedORM/std": 0.18512369692325592, "step": 4962, "train_speed(iter/s)": 0.09622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 303.0625, "completions/min_length": 226.0, "epoch": 4.105045492142266, "grad_norm": 0.13053438067436218, "kl": 0.027587890625, "learning_rate": 9.447559757818747e-07, "loss": 0.0002762038493528962, "memory(GiB)": 38.1, "reward": 0.4536387324333191, "reward_std": 0.015610987320542336, "rewards/VisualizationJSONCombinedORM/mean": 0.4536387324333191, "rewards/VisualizationJSONCombinedORM/std": 0.23733633756637573, "step": 4963, "train_speed(iter/s)": 0.09619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 334.875, "completions/min_length": 217.0, "epoch": 4.105872622001654, "grad_norm": 0.16698813438415527, "kl": 0.05755615234375, "learning_rate": 9.430675306512998e-07, "loss": 0.0005743764340877533, "memory(GiB)": 38.1, "reward": 0.33666759729385376, "reward_std": 0.01949942670762539, "rewards/VisualizationJSONCombinedORM/mean": 0.33666759729385376, "rewards/VisualizationJSONCombinedORM/std": 0.1654493659734726, "step": 4964, "train_speed(iter/s)": 0.096162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 306.5, "completions/min_length": 254.0, "epoch": 4.106699751861042, "grad_norm": 0.25106731057167053, "kl": 0.05914306640625, "learning_rate": 9.41380438526694e-07, "loss": 0.0005910396575927734, "memory(GiB)": 38.1, "reward": 0.7249909043312073, "reward_std": 0.06899944692850113, "rewards/VisualizationJSONCombinedORM/mean": 0.7249909043312073, "rewards/VisualizationJSONCombinedORM/std": 0.09459372609853745, "step": 4965, "train_speed(iter/s)": 0.096138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 296.8125, "completions/min_length": 249.0, "epoch": 4.10752688172043, "grad_norm": 0.22589240968227386, "kl": 0.1197509765625, "learning_rate": 9.396946999707113e-07, "loss": 0.001196049153804779, "memory(GiB)": 38.1, "reward": 0.5526609420776367, "reward_std": 0.05838033929467201, "rewards/VisualizationJSONCombinedORM/mean": 0.5526609420776367, "rewards/VisualizationJSONCombinedORM/std": 0.22045165300369263, "step": 4966, "train_speed(iter/s)": 0.096111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 309.125, "completions/min_length": 220.0, "epoch": 4.108354011579818, "grad_norm": 0.21078293025493622, "kl": 0.09246826171875, "learning_rate": 9.380103155455512e-07, "loss": 0.0009236931800842285, "memory(GiB)": 38.1, "reward": 0.3790707588195801, "reward_std": 0.04774898663163185, "rewards/VisualizationJSONCombinedORM/mean": 0.3790707588195801, "rewards/VisualizationJSONCombinedORM/std": 0.048278968781232834, "step": 4967, "train_speed(iter/s)": 0.096085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 301.1875, "completions/min_length": 239.0, "epoch": 4.109181141439206, "grad_norm": 0.1809423267841339, "kl": 0.057861328125, "learning_rate": 9.363272858129647e-07, "loss": 0.0005787201225757599, "memory(GiB)": 38.1, "reward": 0.5622223615646362, "reward_std": 0.052005741745233536, "rewards/VisualizationJSONCombinedORM/mean": 0.5622223615646362, "rewards/VisualizationJSONCombinedORM/std": 0.14430417120456696, "step": 4968, "train_speed(iter/s)": 0.096058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 313.375, "completions/min_length": 244.0, "epoch": 4.110008271298594, "grad_norm": 0.19919738173484802, "kl": 0.14990234375, "learning_rate": 9.346456113342506e-07, "loss": 0.0015013031661510468, "memory(GiB)": 38.1, "reward": 0.3437531888484955, "reward_std": 0.04945233091711998, "rewards/VisualizationJSONCombinedORM/mean": 0.3437531888484955, "rewards/VisualizationJSONCombinedORM/std": 0.05302085727453232, "step": 4969, "train_speed(iter/s)": 0.096017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 322.4375, "completions/min_length": 276.0, "epoch": 4.110835401157982, "grad_norm": 0.1637866050004959, "kl": 0.098876953125, "learning_rate": 9.329652926702559e-07, "loss": 0.000987473875284195, "memory(GiB)": 38.1, "reward": 0.41374003887176514, "reward_std": 0.014751216396689415, "rewards/VisualizationJSONCombinedORM/mean": 0.41374003887176514, "rewards/VisualizationJSONCombinedORM/std": 0.2226308286190033, "step": 4970, "train_speed(iter/s)": 0.095985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 306.75, "completions/min_length": 245.0, "epoch": 4.11166253101737, "grad_norm": 0.22103579342365265, "kl": 0.0987548828125, "learning_rate": 9.312863303813712e-07, "loss": 0.000987038016319275, "memory(GiB)": 38.1, "reward": 0.31818193197250366, "reward_std": 0.030324924737215042, "rewards/VisualizationJSONCombinedORM/mean": 0.31818193197250366, "rewards/VisualizationJSONCombinedORM/std": 0.12568417191505432, "step": 4971, "train_speed(iter/s)": 0.095952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 334.8125, "completions/min_length": 280.0, "epoch": 4.112489660876758, "grad_norm": 0.25147444009780884, "kl": 0.12646484375, "learning_rate": 9.296087250275438e-07, "loss": 0.001264931634068489, "memory(GiB)": 38.1, "reward": 0.52178955078125, "reward_std": 0.03785073012113571, "rewards/VisualizationJSONCombinedORM/mean": 0.52178955078125, "rewards/VisualizationJSONCombinedORM/std": 0.15815472602844238, "step": 4972, "train_speed(iter/s)": 0.095923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 308.9375, "completions/min_length": 252.0, "epoch": 4.113316790736145, "grad_norm": 0.16856469213962555, "kl": 0.0662841796875, "learning_rate": 9.279324771682586e-07, "loss": 0.0006614318117499352, "memory(GiB)": 38.1, "reward": 0.6791625022888184, "reward_std": 0.04936349019408226, "rewards/VisualizationJSONCombinedORM/mean": 0.6791625022888184, "rewards/VisualizationJSONCombinedORM/std": 0.07956241071224213, "step": 4973, "train_speed(iter/s)": 0.095891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 309.125, "completions/min_length": 245.0, "epoch": 4.1141439205955335, "grad_norm": 0.192617267370224, "kl": 0.0477294921875, "learning_rate": 9.262575873625529e-07, "loss": 0.0004771370440721512, "memory(GiB)": 38.1, "reward": 0.6206268668174744, "reward_std": 0.05223765969276428, "rewards/VisualizationJSONCombinedORM/mean": 0.6206268668174744, "rewards/VisualizationJSONCombinedORM/std": 0.16468146443367004, "step": 4974, "train_speed(iter/s)": 0.095858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 286.625, "completions/min_length": 239.0, "epoch": 4.114971050454922, "grad_norm": 0.231044203042984, "kl": 0.126220703125, "learning_rate": 9.245840561690117e-07, "loss": 0.0012605506926774979, "memory(GiB)": 38.1, "reward": 0.4522160589694977, "reward_std": 0.04163284972310066, "rewards/VisualizationJSONCombinedORM/mean": 0.4522160589694977, "rewards/VisualizationJSONCombinedORM/std": 0.20694999396800995, "step": 4975, "train_speed(iter/s)": 0.095825 }, { "epoch": 4.114971050454922, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 363.25, "eval_completions/mean_length": 308.6197916666667, "eval_completions/min_length": 254.66666666666666, "eval_kl": 0.060760498046875, "eval_loss": 0.0006086304783821106, "eval_reward": 0.4325321037322283, "eval_reward_std": 0.04745832861711582, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4325321037322283, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04745832954843839, "eval_runtime": 311.4745, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 4975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 286.3125, "completions/min_length": 243.0, "epoch": 4.115798180314309, "grad_norm": 0.27390798926353455, "kl": 0.1025390625, "learning_rate": 9.229118841457652e-07, "loss": 0.0010235309600830078, "memory(GiB)": 38.1, "reward": 0.5589261054992676, "reward_std": 0.06718094646930695, "rewards/VisualizationJSONCombinedORM/mean": 0.5589261054992676, "rewards/VisualizationJSONCombinedORM/std": 0.1327243149280548, "step": 4976, "train_speed(iter/s)": 0.095221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 319.375, "completions/min_length": 264.0, "epoch": 4.116625310173697, "grad_norm": 0.18929865956306458, "kl": 0.084228515625, "learning_rate": 9.212410718504883e-07, "loss": 0.0008439179509878159, "memory(GiB)": 38.1, "reward": 0.6552025675773621, "reward_std": 0.0624673031270504, "rewards/VisualizationJSONCombinedORM/mean": 0.6552025675773621, "rewards/VisualizationJSONCombinedORM/std": 0.12540176510810852, "step": 4977, "train_speed(iter/s)": 0.095185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 295.75, "completions/min_length": 228.0, "epoch": 4.117452440033086, "grad_norm": 0.24444209039211273, "kl": 0.06109619140625, "learning_rate": 9.195716198404086e-07, "loss": 0.0006112866103649139, "memory(GiB)": 38.1, "reward": 0.5495909452438354, "reward_std": 0.075139619410038, "rewards/VisualizationJSONCombinedORM/mean": 0.5495909452438354, "rewards/VisualizationJSONCombinedORM/std": 0.16063857078552246, "step": 4978, "train_speed(iter/s)": 0.095167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 300.0, "completions/min_length": 254.0, "epoch": 4.118279569892473, "grad_norm": 0.1777247041463852, "kl": 0.072509765625, "learning_rate": 9.179035286722926e-07, "loss": 0.0007255533710122108, "memory(GiB)": 38.1, "reward": 0.7283782958984375, "reward_std": 0.03663794323801994, "rewards/VisualizationJSONCombinedORM/mean": 0.7283782958984375, "rewards/VisualizationJSONCombinedORM/std": 0.12770545482635498, "step": 4979, "train_speed(iter/s)": 0.095138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 294.9375, "completions/min_length": 210.0, "epoch": 4.119106699751861, "grad_norm": 0.16266131401062012, "kl": 0.0537109375, "learning_rate": 9.162367989024584e-07, "loss": 0.000537484884262085, "memory(GiB)": 38.1, "reward": 0.4155709445476532, "reward_std": 0.033020660281181335, "rewards/VisualizationJSONCombinedORM/mean": 0.4155709445476532, "rewards/VisualizationJSONCombinedORM/std": 0.23315469920635223, "step": 4980, "train_speed(iter/s)": 0.095114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 298.5625, "completions/min_length": 244.0, "epoch": 4.119933829611249, "grad_norm": 0.23788513243198395, "kl": 0.188720703125, "learning_rate": 9.145714310867676e-07, "loss": 0.001884501427412033, "memory(GiB)": 38.1, "reward": 0.4183517098426819, "reward_std": 0.03655996918678284, "rewards/VisualizationJSONCombinedORM/mean": 0.4183517098426819, "rewards/VisualizationJSONCombinedORM/std": 0.1489548236131668, "step": 4981, "train_speed(iter/s)": 0.095086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 343.9375, "completions/min_length": 280.0, "epoch": 4.120760959470637, "grad_norm": 0.16572482883930206, "kl": 0.04034423828125, "learning_rate": 9.129074257806292e-07, "loss": 0.0004031956195831299, "memory(GiB)": 38.1, "reward": 0.6183879971504211, "reward_std": 0.040342364460229874, "rewards/VisualizationJSONCombinedORM/mean": 0.6183879971504211, "rewards/VisualizationJSONCombinedORM/std": 0.12047964334487915, "step": 4982, "train_speed(iter/s)": 0.095057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 327.0625, "completions/min_length": 256.0, "epoch": 4.121588089330025, "grad_norm": 0.175192728638649, "kl": 0.03741455078125, "learning_rate": 9.112447835389953e-07, "loss": 0.00037359632551670074, "memory(GiB)": 38.1, "reward": 0.5802466869354248, "reward_std": 0.019926808774471283, "rewards/VisualizationJSONCombinedORM/mean": 0.5802466869354248, "rewards/VisualizationJSONCombinedORM/std": 0.2939314544200897, "step": 4983, "train_speed(iter/s)": 0.09502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 321.875, "completions/min_length": 244.0, "epoch": 4.122415219189413, "grad_norm": 0.19126567244529724, "kl": 0.08197021484375, "learning_rate": 9.095835049163648e-07, "loss": 0.0008184686303138733, "memory(GiB)": 38.1, "reward": 0.5641236305236816, "reward_std": 0.05811868980526924, "rewards/VisualizationJSONCombinedORM/mean": 0.5641236305236816, "rewards/VisualizationJSONCombinedORM/std": 0.12625400722026825, "step": 4984, "train_speed(iter/s)": 0.094993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 307.9375, "completions/min_length": 256.0, "epoch": 4.123242349048801, "grad_norm": 0.2545083165168762, "kl": 0.083251953125, "learning_rate": 9.079235904667826e-07, "loss": 0.0008302964270114899, "memory(GiB)": 38.1, "reward": 0.34177857637405396, "reward_std": 0.032389916479587555, "rewards/VisualizationJSONCombinedORM/mean": 0.34177857637405396, "rewards/VisualizationJSONCombinedORM/std": 0.12144686281681061, "step": 4985, "train_speed(iter/s)": 0.094957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 320.0625, "completions/min_length": 270.0, "epoch": 4.124069478908188, "grad_norm": 0.19797463715076447, "kl": 0.03900146484375, "learning_rate": 9.062650407438395e-07, "loss": 0.00039025023579597473, "memory(GiB)": 38.1, "reward": 0.5585812330245972, "reward_std": 0.03981488198041916, "rewards/VisualizationJSONCombinedORM/mean": 0.5585812330245972, "rewards/VisualizationJSONCombinedORM/std": 0.040103841572999954, "step": 4986, "train_speed(iter/s)": 0.09492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 311.25, "completions/min_length": 243.0, "epoch": 4.1248966087675765, "grad_norm": 0.19375324249267578, "kl": 0.080322265625, "learning_rate": 9.046078563006655e-07, "loss": 0.0008050687611103058, "memory(GiB)": 38.1, "reward": 0.5632039904594421, "reward_std": 0.03726246953010559, "rewards/VisualizationJSONCombinedORM/mean": 0.5632039904594421, "rewards/VisualizationJSONCombinedORM/std": 0.1849001944065094, "step": 4987, "train_speed(iter/s)": 0.094893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 331.0625, "completions/min_length": 275.0, "epoch": 4.125723738626965, "grad_norm": 0.1781376302242279, "kl": 0.0343017578125, "learning_rate": 9.029520376899442e-07, "loss": 0.000342637300491333, "memory(GiB)": 38.1, "reward": 0.8633683323860168, "reward_std": 0.039761222898960114, "rewards/VisualizationJSONCombinedORM/mean": 0.8633683323860168, "rewards/VisualizationJSONCombinedORM/std": 0.045321203768253326, "step": 4988, "train_speed(iter/s)": 0.094863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 317.75, "completions/min_length": 218.0, "epoch": 4.126550868486352, "grad_norm": 0.2218363881111145, "kl": 0.055419921875, "learning_rate": 9.01297585463895e-07, "loss": 0.0005560070276260376, "memory(GiB)": 38.1, "reward": 0.4252510070800781, "reward_std": 0.028725624084472656, "rewards/VisualizationJSONCombinedORM/mean": 0.4252510070800781, "rewards/VisualizationJSONCombinedORM/std": 0.16539804637432098, "step": 4989, "train_speed(iter/s)": 0.094833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 313.0, "completions/min_length": 265.0, "epoch": 4.12737799834574, "grad_norm": 0.1900589019060135, "kl": 0.1148681640625, "learning_rate": 8.996445001742871e-07, "loss": 0.0011515021324157715, "memory(GiB)": 38.1, "reward": 0.3882025480270386, "reward_std": 0.03345280513167381, "rewards/VisualizationJSONCombinedORM/mean": 0.3882025480270386, "rewards/VisualizationJSONCombinedORM/std": 0.17080773413181305, "step": 4990, "train_speed(iter/s)": 0.094805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 301.4375, "completions/min_length": 220.0, "epoch": 4.128205128205128, "grad_norm": 0.17968608438968658, "kl": 0.0465087890625, "learning_rate": 8.979927823724321e-07, "loss": 0.000464642042061314, "memory(GiB)": 38.1, "reward": 0.6892510652542114, "reward_std": 0.049488894641399384, "rewards/VisualizationJSONCombinedORM/mean": 0.6892510652542114, "rewards/VisualizationJSONCombinedORM/std": 0.09449947625398636, "step": 4991, "train_speed(iter/s)": 0.094776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 289.375, "completions/min_length": 231.0, "epoch": 4.129032258064516, "grad_norm": 0.22793035209178925, "kl": 0.040863037109375, "learning_rate": 8.963424326091868e-07, "loss": 0.00040862709283828735, "memory(GiB)": 38.1, "reward": 0.6227725744247437, "reward_std": 0.07395344972610474, "rewards/VisualizationJSONCombinedORM/mean": 0.6227725744247437, "rewards/VisualizationJSONCombinedORM/std": 0.19223785400390625, "step": 4992, "train_speed(iter/s)": 0.094755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 318.3125, "completions/min_length": 227.0, "epoch": 4.129859387923904, "grad_norm": 0.20309601724147797, "kl": 0.06353759765625, "learning_rate": 8.946934514349465e-07, "loss": 0.0006361603736877441, "memory(GiB)": 38.1, "reward": 0.39359229803085327, "reward_std": 0.03909078240394592, "rewards/VisualizationJSONCombinedORM/mean": 0.39359229803085327, "rewards/VisualizationJSONCombinedORM/std": 0.08504656702280045, "step": 4993, "train_speed(iter/s)": 0.094725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 305.75, "completions/min_length": 262.0, "epoch": 4.130686517783292, "grad_norm": 0.18923069536685944, "kl": 0.0604248046875, "learning_rate": 8.930458393996599e-07, "loss": 0.0006043724715709686, "memory(GiB)": 38.1, "reward": 0.5162556171417236, "reward_std": 0.03714080527424812, "rewards/VisualizationJSONCombinedORM/mean": 0.5162556171417236, "rewards/VisualizationJSONCombinedORM/std": 0.17582331597805023, "step": 4994, "train_speed(iter/s)": 0.094691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 333.3125, "completions/min_length": 269.0, "epoch": 4.13151364764268, "grad_norm": 0.17083248496055603, "kl": 0.08349609375, "learning_rate": 8.913995970528089e-07, "loss": 0.0008347034454345703, "memory(GiB)": 38.1, "reward": 0.47104954719543457, "reward_std": 0.03962697833776474, "rewards/VisualizationJSONCombinedORM/mean": 0.47104954719543457, "rewards/VisualizationJSONCombinedORM/std": 0.06827375292778015, "step": 4995, "train_speed(iter/s)": 0.09465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 284.1875, "completions/min_length": 217.0, "epoch": 4.132340777502068, "grad_norm": 0.19348762929439545, "kl": 0.06402587890625, "learning_rate": 8.897547249434247e-07, "loss": 0.0006414763629436493, "memory(GiB)": 38.1, "reward": 0.4530490040779114, "reward_std": 0.042417436838150024, "rewards/VisualizationJSONCombinedORM/mean": 0.4530490040779114, "rewards/VisualizationJSONCombinedORM/std": 0.30914661288261414, "step": 4996, "train_speed(iter/s)": 0.094621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 336.5, "completions/min_length": 256.0, "epoch": 4.133167907361456, "grad_norm": 0.26647475361824036, "kl": 0.09375, "learning_rate": 8.881112236200795e-07, "loss": 0.0009384676814079285, "memory(GiB)": 38.1, "reward": 0.433322936296463, "reward_std": 0.10103486478328705, "rewards/VisualizationJSONCombinedORM/mean": 0.433322936296463, "rewards/VisualizationJSONCombinedORM/std": 0.12414734810590744, "step": 4997, "train_speed(iter/s)": 0.09459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 308.6875, "completions/min_length": 247.0, "epoch": 4.133995037220844, "grad_norm": 0.19414527714252472, "kl": 0.0726318359375, "learning_rate": 8.864690936308906e-07, "loss": 0.0007269866764545441, "memory(GiB)": 38.1, "reward": 0.3984091877937317, "reward_std": 0.03227947652339935, "rewards/VisualizationJSONCombinedORM/mean": 0.3984091877937317, "rewards/VisualizationJSONCombinedORM/std": 0.03241370618343353, "step": 4998, "train_speed(iter/s)": 0.094568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 297.4375, "completions/min_length": 229.0, "epoch": 4.134822167080231, "grad_norm": 0.27710750699043274, "kl": 0.04888916015625, "learning_rate": 8.848283355235127e-07, "loss": 0.0004883091896772385, "memory(GiB)": 38.1, "reward": 0.6202588081359863, "reward_std": 0.10419851541519165, "rewards/VisualizationJSONCombinedORM/mean": 0.6202588081359863, "rewards/VisualizationJSONCombinedORM/std": 0.1012226790189743, "step": 4999, "train_speed(iter/s)": 0.094535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 295.75, "completions/min_length": 251.0, "epoch": 4.1356492969396195, "grad_norm": 0.20795321464538574, "kl": 0.076171875, "learning_rate": 8.831889498451474e-07, "loss": 0.000762108713388443, "memory(GiB)": 38.1, "reward": 0.387764036655426, "reward_std": 0.058285702019929886, "rewards/VisualizationJSONCombinedORM/mean": 0.387764036655426, "rewards/VisualizationJSONCombinedORM/std": 0.09414885193109512, "step": 5000, "train_speed(iter/s)": 0.094501 }, { "epoch": 4.1356492969396195, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 362.875, "eval_completions/mean_length": 304.4270833333333, "eval_completions/min_length": 258.75, "eval_kl": 0.08794148763020833, "eval_loss": 0.000886179506778717, "eval_reward": 0.4333296312640111, "eval_reward_std": 0.04764809789291272, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4333296312640111, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04764809894065062, "eval_runtime": 310.3623, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 5000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 332.3125, "completions/min_length": 281.0, "epoch": 4.136476426799008, "grad_norm": 0.20074336230754852, "kl": 0.087158203125, "learning_rate": 8.815509371425385e-07, "loss": 0.0008731111884117126, "memory(GiB)": 38.1, "reward": 0.5130344033241272, "reward_std": 0.027931123971939087, "rewards/VisualizationJSONCombinedORM/mean": 0.5130344033241272, "rewards/VisualizationJSONCombinedORM/std": 0.20896127820014954, "step": 5001, "train_speed(iter/s)": 0.093927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 312.0, "completions/min_length": 229.0, "epoch": 4.137303556658395, "grad_norm": 0.1945084035396576, "kl": 0.07733154296875, "learning_rate": 8.799142979619718e-07, "loss": 0.0007718605920672417, "memory(GiB)": 38.1, "reward": 0.42958346009254456, "reward_std": 0.022771358489990234, "rewards/VisualizationJSONCombinedORM/mean": 0.42958346009254456, "rewards/VisualizationJSONCombinedORM/std": 0.09670323133468628, "step": 5002, "train_speed(iter/s)": 0.093898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 330.875, "completions/min_length": 272.0, "epoch": 4.138130686517783, "grad_norm": 0.19640061259269714, "kl": 0.06011962890625, "learning_rate": 8.782790328492702e-07, "loss": 0.0006014183163642883, "memory(GiB)": 38.13, "reward": 0.47685182094573975, "reward_std": 0.03551442176103592, "rewards/VisualizationJSONCombinedORM/mean": 0.47685182094573975, "rewards/VisualizationJSONCombinedORM/std": 0.060233231633901596, "step": 5003, "train_speed(iter/s)": 0.093858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 297.375, "completions/min_length": 225.0, "epoch": 4.138957816377172, "grad_norm": 0.24141189455986023, "kl": 0.04400634765625, "learning_rate": 8.766451423498068e-07, "loss": 0.0004392266273498535, "memory(GiB)": 38.13, "reward": 0.6942646503448486, "reward_std": 0.07239652425050735, "rewards/VisualizationJSONCombinedORM/mean": 0.6942646503448486, "rewards/VisualizationJSONCombinedORM/std": 0.14478951692581177, "step": 5004, "train_speed(iter/s)": 0.093833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 314.8125, "completions/min_length": 224.0, "epoch": 4.139784946236559, "grad_norm": 0.12457022070884705, "kl": 0.028289794921875, "learning_rate": 8.750126270084891e-07, "loss": 0.0002829402219504118, "memory(GiB)": 38.13, "reward": 0.5469894409179688, "reward_std": 0.025813261047005653, "rewards/VisualizationJSONCombinedORM/mean": 0.5469894409179688, "rewards/VisualizationJSONCombinedORM/std": 0.08773212134838104, "step": 5005, "train_speed(iter/s)": 0.093803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 289.125, "completions/min_length": 227.0, "epoch": 4.140612076095947, "grad_norm": 0.20314663648605347, "kl": 0.046142578125, "learning_rate": 8.733814873697688e-07, "loss": 0.0004610791802406311, "memory(GiB)": 38.13, "reward": 0.6003422737121582, "reward_std": 0.10682602226734161, "rewards/VisualizationJSONCombinedORM/mean": 0.6003422737121582, "rewards/VisualizationJSONCombinedORM/std": 0.13808582723140717, "step": 5006, "train_speed(iter/s)": 0.093768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 320.0, "completions/min_length": 262.0, "epoch": 4.141439205955335, "grad_norm": 0.23521557450294495, "kl": 0.1036376953125, "learning_rate": 8.71751723977639e-07, "loss": 0.001036219298839569, "memory(GiB)": 38.13, "reward": 0.5581584572792053, "reward_std": 0.05237698554992676, "rewards/VisualizationJSONCombinedORM/mean": 0.5581584572792053, "rewards/VisualizationJSONCombinedORM/std": 0.17163126170635223, "step": 5007, "train_speed(iter/s)": 0.093741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 327.625, "completions/min_length": 228.0, "epoch": 4.142266335814723, "grad_norm": 0.18419890105724335, "kl": 0.0531005859375, "learning_rate": 8.701233373756352e-07, "loss": 0.0005306825041770935, "memory(GiB)": 38.13, "reward": 0.5479129552841187, "reward_std": 0.04144100472331047, "rewards/VisualizationJSONCombinedORM/mean": 0.5479129552841187, "rewards/VisualizationJSONCombinedORM/std": 0.04775620996952057, "step": 5008, "train_speed(iter/s)": 0.093714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 276.6875, "completions/min_length": 222.0, "epoch": 4.143093465674111, "grad_norm": 0.20924252271652222, "kl": 0.059600830078125, "learning_rate": 8.684963281068276e-07, "loss": 0.0005949214100837708, "memory(GiB)": 38.13, "reward": 0.3417685329914093, "reward_std": 0.041746750473976135, "rewards/VisualizationJSONCombinedORM/mean": 0.3417685329914093, "rewards/VisualizationJSONCombinedORM/std": 0.09905995428562164, "step": 5009, "train_speed(iter/s)": 0.093688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 318.5625, "completions/min_length": 264.0, "epoch": 4.143920595533499, "grad_norm": 0.19724519550800323, "kl": 0.1209716796875, "learning_rate": 8.668706967138363e-07, "loss": 0.0012089977972209454, "memory(GiB)": 38.13, "reward": 0.46103334426879883, "reward_std": 0.04453011602163315, "rewards/VisualizationJSONCombinedORM/mean": 0.46103334426879883, "rewards/VisualizationJSONCombinedORM/std": 0.047439273446798325, "step": 5010, "train_speed(iter/s)": 0.093667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 289.6875, "completions/min_length": 211.0, "epoch": 4.144747725392887, "grad_norm": 0.21385498344898224, "kl": 0.099853515625, "learning_rate": 8.652464437388136e-07, "loss": 0.0009987149387598038, "memory(GiB)": 38.13, "reward": 0.42329418659210205, "reward_std": 0.04691446200013161, "rewards/VisualizationJSONCombinedORM/mean": 0.42329418659210205, "rewards/VisualizationJSONCombinedORM/std": 0.15395362675189972, "step": 5011, "train_speed(iter/s)": 0.093634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 307.0, "completions/min_length": 244.0, "epoch": 4.145574855252274, "grad_norm": 0.1666226089000702, "kl": 0.05029296875, "learning_rate": 8.636235697234563e-07, "loss": 0.0005009174346923828, "memory(GiB)": 38.13, "reward": 0.6839883327484131, "reward_std": 0.054991692304611206, "rewards/VisualizationJSONCombinedORM/mean": 0.6839883327484131, "rewards/VisualizationJSONCombinedORM/std": 0.11321356147527695, "step": 5012, "train_speed(iter/s)": 0.093606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/mean_length": 297.625, "completions/min_length": 228.0, "epoch": 4.1464019851116625, "grad_norm": 0.1908513456583023, "kl": 0.035064697265625, "learning_rate": 8.620020752090008e-07, "loss": 0.00035068392753601074, "memory(GiB)": 38.13, "reward": 0.5602490901947021, "reward_std": 0.06451957672834396, "rewards/VisualizationJSONCombinedORM/mean": 0.5602490901947021, "rewards/VisualizationJSONCombinedORM/std": 0.07977854460477829, "step": 5013, "train_speed(iter/s)": 0.093567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 304.375, "completions/min_length": 232.0, "epoch": 4.147229114971051, "grad_norm": 0.20193348824977875, "kl": 0.06744384765625, "learning_rate": 8.603819607362246e-07, "loss": 0.0006748568266630173, "memory(GiB)": 38.13, "reward": 0.5891944169998169, "reward_std": 0.05402550846338272, "rewards/VisualizationJSONCombinedORM/mean": 0.5891944169998169, "rewards/VisualizationJSONCombinedORM/std": 0.15526746213436127, "step": 5014, "train_speed(iter/s)": 0.093542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 315.0625, "completions/min_length": 236.0, "epoch": 4.148056244830438, "grad_norm": 0.20086602866649628, "kl": 0.0858154296875, "learning_rate": 8.587632268454405e-07, "loss": 0.0008590941433794796, "memory(GiB)": 38.13, "reward": 0.5590687990188599, "reward_std": 0.021526724100112915, "rewards/VisualizationJSONCombinedORM/mean": 0.5590687990188599, "rewards/VisualizationJSONCombinedORM/std": 0.2072397768497467, "step": 5015, "train_speed(iter/s)": 0.093521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 304.75, "completions/min_length": 228.0, "epoch": 4.148883374689826, "grad_norm": 0.18909983336925507, "kl": 0.04522705078125, "learning_rate": 8.571458740765054e-07, "loss": 0.00045235268771648407, "memory(GiB)": 38.13, "reward": 0.5065350532531738, "reward_std": 0.04260621219873428, "rewards/VisualizationJSONCombinedORM/mean": 0.5065350532531738, "rewards/VisualizationJSONCombinedORM/std": 0.13137713074684143, "step": 5016, "train_speed(iter/s)": 0.093495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 332.0625, "completions/min_length": 268.0, "epoch": 4.149710504549215, "grad_norm": 0.15165916085243225, "kl": 0.032684326171875, "learning_rate": 8.555299029688141e-07, "loss": 0.0003259517252445221, "memory(GiB)": 38.13, "reward": 0.7230139970779419, "reward_std": 0.047096338123083115, "rewards/VisualizationJSONCombinedORM/mean": 0.7230139970779419, "rewards/VisualizationJSONCombinedORM/std": 0.09354355186223984, "step": 5017, "train_speed(iter/s)": 0.093466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 332.3125, "completions/min_length": 238.0, "epoch": 4.150537634408602, "grad_norm": 0.24458524584770203, "kl": 0.05426025390625, "learning_rate": 8.53915314061301e-07, "loss": 0.0005424655973911285, "memory(GiB)": 38.13, "reward": 0.49751603603363037, "reward_std": 0.06233870983123779, "rewards/VisualizationJSONCombinedORM/mean": 0.49751603603363037, "rewards/VisualizationJSONCombinedORM/std": 0.21535231173038483, "step": 5018, "train_speed(iter/s)": 0.093438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 310.0, "completions/min_length": 251.0, "epoch": 4.15136476426799, "grad_norm": 0.20626255869865417, "kl": 0.072265625, "learning_rate": 8.523021078924359e-07, "loss": 0.0007221028208732605, "memory(GiB)": 38.13, "reward": 0.3693513870239258, "reward_std": 0.0470128208398819, "rewards/VisualizationJSONCombinedORM/mean": 0.3693513870239258, "rewards/VisualizationJSONCombinedORM/std": 0.14827357232570648, "step": 5019, "train_speed(iter/s)": 0.093419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 292.8125, "completions/min_length": 229.0, "epoch": 4.152191894127378, "grad_norm": 0.20195844769477844, "kl": 0.0628662109375, "learning_rate": 8.506902850002358e-07, "loss": 0.0006277486681938171, "memory(GiB)": 38.13, "reward": 0.5154445171356201, "reward_std": 0.061776913702487946, "rewards/VisualizationJSONCombinedORM/mean": 0.5154445171356201, "rewards/VisualizationJSONCombinedORM/std": 0.24160626530647278, "step": 5020, "train_speed(iter/s)": 0.093392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 330.125, "completions/min_length": 277.0, "epoch": 4.153019023986766, "grad_norm": 0.1891898661851883, "kl": 0.07977294921875, "learning_rate": 8.490798459222477e-07, "loss": 0.0007981173694133759, "memory(GiB)": 38.13, "reward": 0.5205003619194031, "reward_std": 0.035186849534511566, "rewards/VisualizationJSONCombinedORM/mean": 0.5205003619194031, "rewards/VisualizationJSONCombinedORM/std": 0.038679592311382294, "step": 5021, "train_speed(iter/s)": 0.093366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 340.875, "completions/min_length": 250.0, "epoch": 4.153846153846154, "grad_norm": 0.24167191982269287, "kl": 0.1773681640625, "learning_rate": 8.474707911955604e-07, "loss": 0.0017710719257593155, "memory(GiB)": 38.13, "reward": 0.5430781245231628, "reward_std": 0.07490341365337372, "rewards/VisualizationJSONCombinedORM/mean": 0.5430781245231628, "rewards/VisualizationJSONCombinedORM/std": 0.15703913569450378, "step": 5022, "train_speed(iter/s)": 0.093335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 297.25, "completions/min_length": 224.0, "epoch": 4.154673283705542, "grad_norm": 0.18138541281223297, "kl": 0.024078369140625, "learning_rate": 8.458631213568024e-07, "loss": 0.00024561211466789246, "memory(GiB)": 38.13, "reward": 0.7020269632339478, "reward_std": 0.01831589639186859, "rewards/VisualizationJSONCombinedORM/mean": 0.7020269632339478, "rewards/VisualizationJSONCombinedORM/std": 0.18236315250396729, "step": 5023, "train_speed(iter/s)": 0.093307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 295.3125, "completions/min_length": 237.0, "epoch": 4.15550041356493, "grad_norm": 0.26793161034584045, "kl": 0.07501220703125, "learning_rate": 8.442568369421406e-07, "loss": 0.0007494427263736725, "memory(GiB)": 38.13, "reward": 0.6313809156417847, "reward_std": 0.05042077600955963, "rewards/VisualizationJSONCombinedORM/mean": 0.6313809156417847, "rewards/VisualizationJSONCombinedORM/std": 0.06841806322336197, "step": 5024, "train_speed(iter/s)": 0.093281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 324.3125, "completions/min_length": 260.0, "epoch": 4.156327543424317, "grad_norm": 0.18515129387378693, "kl": 0.093505859375, "learning_rate": 8.426519384872733e-07, "loss": 0.0009328406304121017, "memory(GiB)": 38.13, "reward": 0.6364047527313232, "reward_std": 0.07418908178806305, "rewards/VisualizationJSONCombinedORM/mean": 0.6364047527313232, "rewards/VisualizationJSONCombinedORM/std": 0.10663497447967529, "step": 5025, "train_speed(iter/s)": 0.093249 }, { "epoch": 4.156327543424317, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 369.875, "eval_completions/mean_length": 309.7916666666667, "eval_completions/min_length": 258.0833333333333, "eval_kl": 0.08406575520833333, "eval_loss": 0.0008465573191642761, "eval_reward": 0.44275100839634735, "eval_reward_std": 0.05019304559876522, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44275100839634735, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05019304598681629, "eval_runtime": 315.3229, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 5025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 331.125, "completions/min_length": 272.0, "epoch": 4.1571546732837055, "grad_norm": 0.17595018446445465, "kl": 0.05633544921875, "learning_rate": 8.410484265274466e-07, "loss": 0.0005634352564811707, "memory(GiB)": 38.13, "reward": 0.4320530295372009, "reward_std": 0.1189093068242073, "rewards/VisualizationJSONCombinedORM/mean": 0.4320530295372009, "rewards/VisualizationJSONCombinedORM/std": 0.18422462046146393, "step": 5026, "train_speed(iter/s)": 0.092681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 308.875, "completions/min_length": 235.0, "epoch": 4.157981803143094, "grad_norm": 0.17472678422927856, "kl": 0.123779296875, "learning_rate": 8.394463015974391e-07, "loss": 0.0012358799576759338, "memory(GiB)": 38.13, "reward": 0.4337007999420166, "reward_std": 0.04668477922677994, "rewards/VisualizationJSONCombinedORM/mean": 0.4337007999420166, "rewards/VisualizationJSONCombinedORM/std": 0.06323760747909546, "step": 5027, "train_speed(iter/s)": 0.092656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 298.875, "completions/min_length": 231.0, "epoch": 4.158808933002481, "grad_norm": 0.2007777988910675, "kl": 0.05072021484375, "learning_rate": 8.378455642315636e-07, "loss": 0.0005064792931079865, "memory(GiB)": 38.13, "reward": 0.41191691160202026, "reward_std": 0.03610706329345703, "rewards/VisualizationJSONCombinedORM/mean": 0.41191691160202026, "rewards/VisualizationJSONCombinedORM/std": 0.0365922637283802, "step": 5028, "train_speed(iter/s)": 0.092631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 303.375, "completions/min_length": 247.0, "epoch": 4.159636062861869, "grad_norm": 0.24497374892234802, "kl": 0.08642578125, "learning_rate": 8.362462149636757e-07, "loss": 0.000866335816681385, "memory(GiB)": 38.13, "reward": 0.41819947957992554, "reward_std": 0.037375904619693756, "rewards/VisualizationJSONCombinedORM/mean": 0.41819947957992554, "rewards/VisualizationJSONCombinedORM/std": 0.22968046367168427, "step": 5029, "train_speed(iter/s)": 0.092613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 338.875, "completions/min_length": 267.0, "epoch": 4.160463192721258, "grad_norm": 0.18985828757286072, "kl": 0.049102783203125, "learning_rate": 8.346482543271656e-07, "loss": 0.0004903264343738556, "memory(GiB)": 38.13, "reward": 0.4769126772880554, "reward_std": 0.07557006925344467, "rewards/VisualizationJSONCombinedORM/mean": 0.4769126772880554, "rewards/VisualizationJSONCombinedORM/std": 0.18170759081840515, "step": 5030, "train_speed(iter/s)": 0.092584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 310.75, "completions/min_length": 246.0, "epoch": 4.161290322580645, "grad_norm": 0.20405946671962738, "kl": 0.05804443359375, "learning_rate": 8.330516828549618e-07, "loss": 0.0005801096558570862, "memory(GiB)": 38.13, "reward": 0.6351894736289978, "reward_std": 0.047741517424583435, "rewards/VisualizationJSONCombinedORM/mean": 0.6351894736289978, "rewards/VisualizationJSONCombinedORM/std": 0.14475904405117035, "step": 5031, "train_speed(iter/s)": 0.092561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 319.625, "completions/min_length": 249.0, "epoch": 4.162117452440033, "grad_norm": 0.23718374967575073, "kl": 0.0545654296875, "learning_rate": 8.314565010795245e-07, "loss": 0.000545889139175415, "memory(GiB)": 38.13, "reward": 0.7863274216651917, "reward_std": 0.10062570869922638, "rewards/VisualizationJSONCombinedORM/mean": 0.7863274216651917, "rewards/VisualizationJSONCombinedORM/std": 0.10335847735404968, "step": 5032, "train_speed(iter/s)": 0.092537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 319.0, "completions/min_length": 246.0, "epoch": 4.162944582299421, "grad_norm": 0.1690436750650406, "kl": 0.13916015625, "learning_rate": 8.298627095328593e-07, "loss": 0.0013894550502300262, "memory(GiB)": 38.13, "reward": 0.4564076066017151, "reward_std": 0.0799206867814064, "rewards/VisualizationJSONCombinedORM/mean": 0.4564076066017151, "rewards/VisualizationJSONCombinedORM/std": 0.14060452580451965, "step": 5033, "train_speed(iter/s)": 0.092508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 307.6875, "completions/min_length": 234.0, "epoch": 4.163771712158809, "grad_norm": 0.17622633278369904, "kl": 0.0640869140625, "learning_rate": 8.282703087464999e-07, "loss": 0.0006408188492059708, "memory(GiB)": 38.13, "reward": 0.6764479875564575, "reward_std": 0.12960372865200043, "rewards/VisualizationJSONCombinedORM/mean": 0.6764479875564575, "rewards/VisualizationJSONCombinedORM/std": 0.1772848218679428, "step": 5034, "train_speed(iter/s)": 0.092485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 328.75, "completions/min_length": 256.0, "epoch": 4.164598842018197, "grad_norm": 0.19968245923519135, "kl": 0.0789794921875, "learning_rate": 8.266792992515199e-07, "loss": 0.0007894374430179596, "memory(GiB)": 38.13, "reward": 0.35527661442756653, "reward_std": 0.04310593754053116, "rewards/VisualizationJSONCombinedORM/mean": 0.35527661442756653, "rewards/VisualizationJSONCombinedORM/std": 0.057145487517118454, "step": 5035, "train_speed(iter/s)": 0.092453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 291.0, "completions/min_length": 222.0, "epoch": 4.165425971877585, "grad_norm": 0.1773228943347931, "kl": 0.05084228515625, "learning_rate": 8.250896815785292e-07, "loss": 0.0005038231611251831, "memory(GiB)": 38.13, "reward": 0.5028419494628906, "reward_std": 0.02801639400422573, "rewards/VisualizationJSONCombinedORM/mean": 0.5028419494628906, "rewards/VisualizationJSONCombinedORM/std": 0.1294410079717636, "step": 5036, "train_speed(iter/s)": 0.092427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 315.3125, "completions/min_length": 232.0, "epoch": 4.166253101736973, "grad_norm": 0.1888968050479889, "kl": 0.07305908203125, "learning_rate": 8.235014562576732e-07, "loss": 0.0007319003343582153, "memory(GiB)": 38.13, "reward": 0.6244901418685913, "reward_std": 0.05951698124408722, "rewards/VisualizationJSONCombinedORM/mean": 0.6244901418685913, "rewards/VisualizationJSONCombinedORM/std": 0.18441353738307953, "step": 5037, "train_speed(iter/s)": 0.092402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 323.6875, "completions/min_length": 246.0, "epoch": 4.16708023159636, "grad_norm": 0.19315733015537262, "kl": 0.1004638671875, "learning_rate": 8.219146238186304e-07, "loss": 0.001004357822239399, "memory(GiB)": 38.13, "reward": 0.5203932523727417, "reward_std": 0.07398159056901932, "rewards/VisualizationJSONCombinedORM/mean": 0.5203932523727417, "rewards/VisualizationJSONCombinedORM/std": 0.11533473432064056, "step": 5038, "train_speed(iter/s)": 0.092367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 289.375, "completions/min_length": 236.0, "epoch": 4.1679073614557485, "grad_norm": 0.2283913940191269, "kl": 0.1322021484375, "learning_rate": 8.203291847906181e-07, "loss": 0.0013197138905525208, "memory(GiB)": 38.13, "reward": 0.5098912119865417, "reward_std": 0.07145769149065018, "rewards/VisualizationJSONCombinedORM/mean": 0.5098912119865417, "rewards/VisualizationJSONCombinedORM/std": 0.10781068354845047, "step": 5039, "train_speed(iter/s)": 0.092343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 310.875, "completions/min_length": 223.0, "epoch": 4.168734491315137, "grad_norm": 0.19508695602416992, "kl": 0.0621337890625, "learning_rate": 8.187451397023877e-07, "loss": 0.0006221942603588104, "memory(GiB)": 38.13, "reward": 0.4310886263847351, "reward_std": 0.03969535231590271, "rewards/VisualizationJSONCombinedORM/mean": 0.4310886263847351, "rewards/VisualizationJSONCombinedORM/std": 0.05968797206878662, "step": 5040, "train_speed(iter/s)": 0.09232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 289.375, "completions/min_length": 241.0, "epoch": 4.169561621174524, "grad_norm": 0.2010360211133957, "kl": 0.0540771484375, "learning_rate": 8.171624890822266e-07, "loss": 0.000539558008313179, "memory(GiB)": 38.13, "reward": 0.6665188670158386, "reward_std": 0.05912307649850845, "rewards/VisualizationJSONCombinedORM/mean": 0.6665188670158386, "rewards/VisualizationJSONCombinedORM/std": 0.14076659083366394, "step": 5041, "train_speed(iter/s)": 0.092297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 309.0625, "completions/min_length": 213.0, "epoch": 4.170388751033912, "grad_norm": 0.1908196657896042, "kl": 0.085693359375, "learning_rate": 8.155812334579532e-07, "loss": 0.0008553564548492432, "memory(GiB)": 38.13, "reward": 0.6254433393478394, "reward_std": 0.04764217138290405, "rewards/VisualizationJSONCombinedORM/mean": 0.6254433393478394, "rewards/VisualizationJSONCombinedORM/std": 0.1341215968132019, "step": 5042, "train_speed(iter/s)": 0.092265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 321.875, "completions/min_length": 261.0, "epoch": 4.171215880893301, "grad_norm": 0.17606744170188904, "kl": 0.0406494140625, "learning_rate": 8.140013733569274e-07, "loss": 0.00040519237518310547, "memory(GiB)": 38.13, "reward": 0.7017409205436707, "reward_std": 0.046663593500852585, "rewards/VisualizationJSONCombinedORM/mean": 0.7017409205436707, "rewards/VisualizationJSONCombinedORM/std": 0.0662882998585701, "step": 5043, "train_speed(iter/s)": 0.092238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 301.0, "completions/min_length": 237.0, "epoch": 4.172043010752688, "grad_norm": 0.15748289227485657, "kl": 0.07550048828125, "learning_rate": 8.124229093060371e-07, "loss": 0.0007553212344646454, "memory(GiB)": 38.13, "reward": 0.6220755577087402, "reward_std": 0.016558345407247543, "rewards/VisualizationJSONCombinedORM/mean": 0.6220755577087402, "rewards/VisualizationJSONCombinedORM/std": 0.08315285295248032, "step": 5044, "train_speed(iter/s)": 0.092213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 319.0625, "completions/min_length": 268.0, "epoch": 4.172870140612076, "grad_norm": 0.1947827935218811, "kl": 0.0428466796875, "learning_rate": 8.108458418317089e-07, "loss": 0.0004276186227798462, "memory(GiB)": 38.13, "reward": 0.6454691886901855, "reward_std": 0.0609213262796402, "rewards/VisualizationJSONCombinedORM/mean": 0.6454691886901855, "rewards/VisualizationJSONCombinedORM/std": 0.16944681107997894, "step": 5045, "train_speed(iter/s)": 0.092184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 276.25, "completions/min_length": 228.0, "epoch": 4.173697270471464, "grad_norm": 0.1984272599220276, "kl": 0.10302734375, "learning_rate": 8.09270171459901e-07, "loss": 0.0010299161076545715, "memory(GiB)": 38.13, "reward": 0.3748806118965149, "reward_std": 0.036953460425138474, "rewards/VisualizationJSONCombinedORM/mean": 0.3748806118965149, "rewards/VisualizationJSONCombinedORM/std": 0.04188511520624161, "step": 5046, "train_speed(iter/s)": 0.092156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 306.5, "completions/min_length": 250.0, "epoch": 4.174524400330852, "grad_norm": 0.17828556895256042, "kl": 0.0909423828125, "learning_rate": 8.076958987161093e-07, "loss": 0.0009081996977329254, "memory(GiB)": 38.13, "reward": 0.4701228141784668, "reward_std": 0.030848830938339233, "rewards/VisualizationJSONCombinedORM/mean": 0.4701228141784668, "rewards/VisualizationJSONCombinedORM/std": 0.05712313577532768, "step": 5047, "train_speed(iter/s)": 0.092123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 261.125, "completions/min_length": 207.0, "epoch": 4.17535153019024, "grad_norm": 0.2821100652217865, "kl": 0.0623779296875, "learning_rate": 8.06123024125357e-07, "loss": 0.0006226301193237305, "memory(GiB)": 38.13, "reward": 0.4812869429588318, "reward_std": 0.07879342138767242, "rewards/VisualizationJSONCombinedORM/mean": 0.4812869429588318, "rewards/VisualizationJSONCombinedORM/std": 0.1460103541612625, "step": 5048, "train_speed(iter/s)": 0.092105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 317.5, "completions/min_length": 239.0, "epoch": 4.176178660049628, "grad_norm": 0.1857023537158966, "kl": 0.130859375, "learning_rate": 8.045515482122096e-07, "loss": 0.0013067275285720825, "memory(GiB)": 38.13, "reward": 0.6638223528862, "reward_std": 0.08304573595523834, "rewards/VisualizationJSONCombinedORM/mean": 0.6638223528862, "rewards/VisualizationJSONCombinedORM/std": 0.08892600238323212, "step": 5049, "train_speed(iter/s)": 0.092074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 299.125, "completions/min_length": 231.0, "epoch": 4.177005789909016, "grad_norm": 0.18347105383872986, "kl": 0.08367919921875, "learning_rate": 8.029814715007589e-07, "loss": 0.00083884596824646, "memory(GiB)": 38.13, "reward": 0.5641165971755981, "reward_std": 0.04883520305156708, "rewards/VisualizationJSONCombinedORM/mean": 0.5641165971755981, "rewards/VisualizationJSONCombinedORM/std": 0.17297013103961945, "step": 5050, "train_speed(iter/s)": 0.092044 }, { "epoch": 4.177005789909016, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 376.7916666666667, "eval_completions/mean_length": 311.359375, "eval_completions/min_length": 256.3333333333333, "eval_kl": 0.08745320638020833, "eval_loss": 0.0009003554587252438, "eval_reward": 0.4333483719577392, "eval_reward_std": 0.048705127206631005, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4333483719577392, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04870512840958933, "eval_runtime": 318.796, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 287.25, "completions/min_length": 211.0, "epoch": 4.177832919768404, "grad_norm": 0.1735859513282776, "kl": 0.07049560546875, "learning_rate": 8.014127945146333e-07, "loss": 0.0007061189971864223, "memory(GiB)": 38.13, "reward": 0.6892027854919434, "reward_std": 0.06755027920007706, "rewards/VisualizationJSONCombinedORM/mean": 0.6892027854919434, "rewards/VisualizationJSONCombinedORM/std": 0.08933515101671219, "step": 5051, "train_speed(iter/s)": 0.091493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 311.875, "completions/min_length": 221.0, "epoch": 4.1786600496277915, "grad_norm": 0.2740870416164398, "kl": 0.09185791015625, "learning_rate": 7.998455177769942e-07, "loss": 0.0009194239974021912, "memory(GiB)": 38.13, "reward": 0.3995969295501709, "reward_std": 0.0639071837067604, "rewards/VisualizationJSONCombinedORM/mean": 0.3995969295501709, "rewards/VisualizationJSONCombinedORM/std": 0.21404114365577698, "step": 5052, "train_speed(iter/s)": 0.091462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 328.875, "completions/min_length": 260.0, "epoch": 4.17948717948718, "grad_norm": 0.19394910335540771, "kl": 0.064453125, "learning_rate": 7.98279641810537e-07, "loss": 0.0006428360939025879, "memory(GiB)": 38.13, "reward": 0.46752068400382996, "reward_std": 0.07377844303846359, "rewards/VisualizationJSONCombinedORM/mean": 0.46752068400382996, "rewards/VisualizationJSONCombinedORM/std": 0.16891123354434967, "step": 5053, "train_speed(iter/s)": 0.091435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 312.375, "completions/min_length": 240.0, "epoch": 4.180314309346567, "grad_norm": 0.23117588460445404, "kl": 0.0989990234375, "learning_rate": 7.967151671374863e-07, "loss": 0.0009885765612125397, "memory(GiB)": 38.13, "reward": 0.44870662689208984, "reward_std": 0.0900474414229393, "rewards/VisualizationJSONCombinedORM/mean": 0.44870662689208984, "rewards/VisualizationJSONCombinedORM/std": 0.08716810494661331, "step": 5054, "train_speed(iter/s)": 0.091415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 275.125, "completions/min_length": 238.0, "epoch": 4.181141439205955, "grad_norm": 0.12881746888160706, "kl": 0.04071044921875, "learning_rate": 7.951520942796026e-07, "loss": 0.00040727690793573856, "memory(GiB)": 38.13, "reward": 0.6143280863761902, "reward_std": 0.016806377097964287, "rewards/VisualizationJSONCombinedORM/mean": 0.6143280863761902, "rewards/VisualizationJSONCombinedORM/std": 0.2169681340456009, "step": 5055, "train_speed(iter/s)": 0.091394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 297.4375, "completions/min_length": 223.0, "epoch": 4.181968569065344, "grad_norm": 0.16603101789951324, "kl": 0.07110595703125, "learning_rate": 7.935904237581788e-07, "loss": 0.0007101744413375854, "memory(GiB)": 38.13, "reward": 0.4604361951351166, "reward_std": 0.0325201041996479, "rewards/VisualizationJSONCombinedORM/mean": 0.4604361951351166, "rewards/VisualizationJSONCombinedORM/std": 0.15535815060138702, "step": 5056, "train_speed(iter/s)": 0.091365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 291.5, "completions/min_length": 237.0, "epoch": 4.182795698924731, "grad_norm": 0.3118191063404083, "kl": 0.25341796875, "learning_rate": 7.920301560940391e-07, "loss": 0.00252663716673851, "memory(GiB)": 38.13, "reward": 0.6439746618270874, "reward_std": 0.06313247233629227, "rewards/VisualizationJSONCombinedORM/mean": 0.6439746618270874, "rewards/VisualizationJSONCombinedORM/std": 0.0800199881196022, "step": 5057, "train_speed(iter/s)": 0.091347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 309.6875, "completions/min_length": 226.0, "epoch": 4.183622828784119, "grad_norm": 0.23003900051116943, "kl": 0.1590576171875, "learning_rate": 7.904712918075381e-07, "loss": 0.0015892833471298218, "memory(GiB)": 38.13, "reward": 0.3521348536014557, "reward_std": 0.035852447152137756, "rewards/VisualizationJSONCombinedORM/mean": 0.3521348536014557, "rewards/VisualizationJSONCombinedORM/std": 0.07138539105653763, "step": 5058, "train_speed(iter/s)": 0.091325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 307.0, "completions/min_length": 248.0, "epoch": 4.184449958643507, "grad_norm": 0.19785495102405548, "kl": 0.1131591796875, "learning_rate": 7.88913831418568e-07, "loss": 0.0011316202580928802, "memory(GiB)": 38.13, "reward": 0.6534603834152222, "reward_std": 0.03882983326911926, "rewards/VisualizationJSONCombinedORM/mean": 0.6534603834152222, "rewards/VisualizationJSONCombinedORM/std": 0.12604866921901703, "step": 5059, "train_speed(iter/s)": 0.091301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 317.0, "completions/min_length": 272.0, "epoch": 4.185277088502895, "grad_norm": 0.20118477940559387, "kl": 0.1326904296875, "learning_rate": 7.873577754465456e-07, "loss": 0.0013262033462524414, "memory(GiB)": 38.13, "reward": 0.5796487331390381, "reward_std": 0.06933539360761642, "rewards/VisualizationJSONCombinedORM/mean": 0.5796487331390381, "rewards/VisualizationJSONCombinedORM/std": 0.07575967162847519, "step": 5060, "train_speed(iter/s)": 0.09128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 317.5, "completions/min_length": 242.0, "epoch": 4.186104218362283, "grad_norm": 0.21107368171215057, "kl": 0.0908203125, "learning_rate": 7.858031244104247e-07, "loss": 0.000909171998500824, "memory(GiB)": 38.13, "reward": 0.5867280960083008, "reward_std": 0.04459042847156525, "rewards/VisualizationJSONCombinedORM/mean": 0.5867280960083008, "rewards/VisualizationJSONCombinedORM/std": 0.0882357805967331, "step": 5061, "train_speed(iter/s)": 0.091253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 318.625, "completions/min_length": 263.0, "epoch": 4.186931348221671, "grad_norm": 0.17609688639640808, "kl": 0.07891845703125, "learning_rate": 7.842498788286884e-07, "loss": 0.000790674239397049, "memory(GiB)": 38.13, "reward": 0.5946343541145325, "reward_std": 0.06730682402849197, "rewards/VisualizationJSONCombinedORM/mean": 0.5946343541145325, "rewards/VisualizationJSONCombinedORM/std": 0.06816650182008743, "step": 5062, "train_speed(iter/s)": 0.091218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 310.0, "completions/min_length": 240.0, "epoch": 4.187758478081059, "grad_norm": 0.17907144129276276, "kl": 0.098388671875, "learning_rate": 7.826980392193523e-07, "loss": 0.0009869150817394257, "memory(GiB)": 38.13, "reward": 0.38593605160713196, "reward_std": 0.05695237219333649, "rewards/VisualizationJSONCombinedORM/mean": 0.38593605160713196, "rewards/VisualizationJSONCombinedORM/std": 0.11624252051115036, "step": 5063, "train_speed(iter/s)": 0.091193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 319.0, "completions/min_length": 261.0, "epoch": 4.188585607940446, "grad_norm": 0.16717681288719177, "kl": 0.0467529296875, "learning_rate": 7.81147606099959e-07, "loss": 0.000467836856842041, "memory(GiB)": 38.13, "reward": 0.6525522470474243, "reward_std": 0.028063111007213593, "rewards/VisualizationJSONCombinedORM/mean": 0.6525522470474243, "rewards/VisualizationJSONCombinedORM/std": 0.13489466905593872, "step": 5064, "train_speed(iter/s)": 0.091158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 302.125, "completions/min_length": 246.0, "epoch": 4.1894127377998345, "grad_norm": 0.19437013566493988, "kl": 0.0350341796875, "learning_rate": 7.7959857998759e-07, "loss": 0.0003504455089569092, "memory(GiB)": 38.13, "reward": 0.7526719570159912, "reward_std": 0.052413344383239746, "rewards/VisualizationJSONCombinedORM/mean": 0.7526719570159912, "rewards/VisualizationJSONCombinedORM/std": 0.12776897847652435, "step": 5065, "train_speed(iter/s)": 0.091127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/mean_length": 320.8125, "completions/min_length": 235.0, "epoch": 4.190239867659223, "grad_norm": 0.20803801715373993, "kl": 0.04754638671875, "learning_rate": 7.780509613988485e-07, "loss": 0.00047578662633895874, "memory(GiB)": 38.13, "reward": 0.6046216487884521, "reward_std": 0.03654889017343521, "rewards/VisualizationJSONCombinedORM/mean": 0.6046216487884521, "rewards/VisualizationJSONCombinedORM/std": 0.23997530341148376, "step": 5066, "train_speed(iter/s)": 0.091091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 324.5625, "completions/min_length": 241.0, "epoch": 4.19106699751861, "grad_norm": 0.19101615250110626, "kl": 0.05084228515625, "learning_rate": 7.765047508498741e-07, "loss": 0.0005085915327072144, "memory(GiB)": 38.13, "reward": 0.5591806769371033, "reward_std": 0.052026450634002686, "rewards/VisualizationJSONCombinedORM/mean": 0.5591806769371033, "rewards/VisualizationJSONCombinedORM/std": 0.19976316392421722, "step": 5067, "train_speed(iter/s)": 0.091066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 310.375, "completions/min_length": 241.0, "epoch": 4.191894127377998, "grad_norm": 0.19278442859649658, "kl": 0.06927490234375, "learning_rate": 7.749599488563359e-07, "loss": 0.0006931722164154053, "memory(GiB)": 38.13, "reward": 0.4033365249633789, "reward_std": 0.037133269011974335, "rewards/VisualizationJSONCombinedORM/mean": 0.4033365249633789, "rewards/VisualizationJSONCombinedORM/std": 0.15709678828716278, "step": 5068, "train_speed(iter/s)": 0.091043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 283.75, "completions/min_length": 225.0, "epoch": 4.192721257237387, "grad_norm": 0.19438791275024414, "kl": 0.0775146484375, "learning_rate": 7.734165559334327e-07, "loss": 0.0007759258151054382, "memory(GiB)": 38.13, "reward": 0.48965945839881897, "reward_std": 0.06040593981742859, "rewards/VisualizationJSONCombinedORM/mean": 0.48965945839881897, "rewards/VisualizationJSONCombinedORM/std": 0.2824670076370239, "step": 5069, "train_speed(iter/s)": 0.09102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 320.5625, "completions/min_length": 235.0, "epoch": 4.193548387096774, "grad_norm": 0.17196889221668243, "kl": 0.1124267578125, "learning_rate": 7.718745725958914e-07, "loss": 0.0011244751513004303, "memory(GiB)": 38.13, "reward": 0.5996297597885132, "reward_std": 0.055935487151145935, "rewards/VisualizationJSONCombinedORM/mean": 0.5996297597885132, "rewards/VisualizationJSONCombinedORM/std": 0.1585472673177719, "step": 5070, "train_speed(iter/s)": 0.09099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 307.0625, "completions/min_length": 251.0, "epoch": 4.194375516956162, "grad_norm": 0.20607814192771912, "kl": 0.074462890625, "learning_rate": 7.703339993579723e-07, "loss": 0.0007437616586685181, "memory(GiB)": 38.13, "reward": 0.47724777460098267, "reward_std": 0.05267217010259628, "rewards/VisualizationJSONCombinedORM/mean": 0.47724777460098267, "rewards/VisualizationJSONCombinedORM/std": 0.2776682674884796, "step": 5071, "train_speed(iter/s)": 0.090969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 339.5625, "completions/min_length": 280.0, "epoch": 4.19520264681555, "grad_norm": 0.1911536455154419, "kl": 0.201904296875, "learning_rate": 7.687948367334636e-07, "loss": 0.002016723155975342, "memory(GiB)": 38.13, "reward": 0.6374456882476807, "reward_std": 0.04289791360497475, "rewards/VisualizationJSONCombinedORM/mean": 0.6374456882476807, "rewards/VisualizationJSONCombinedORM/std": 0.0923440009355545, "step": 5072, "train_speed(iter/s)": 0.090943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 330.625, "completions/min_length": 264.0, "epoch": 4.196029776674938, "grad_norm": 0.24557079374790192, "kl": 0.06964111328125, "learning_rate": 7.672570852356837e-07, "loss": 0.0006960928440093994, "memory(GiB)": 38.13, "reward": 0.809050440788269, "reward_std": 0.04032401368021965, "rewards/VisualizationJSONCombinedORM/mean": 0.809050440788269, "rewards/VisualizationJSONCombinedORM/std": 0.05318340286612511, "step": 5073, "train_speed(iter/s)": 0.090916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 322.3125, "completions/min_length": 256.0, "epoch": 4.196856906534326, "grad_norm": 0.1827652007341385, "kl": 0.133544921875, "learning_rate": 7.657207453774767e-07, "loss": 0.0013317354023456573, "memory(GiB)": 38.13, "reward": 0.5727169513702393, "reward_std": 0.03970722109079361, "rewards/VisualizationJSONCombinedORM/mean": 0.5727169513702393, "rewards/VisualizationJSONCombinedORM/std": 0.30134570598602295, "step": 5074, "train_speed(iter/s)": 0.090888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 302.5, "completions/min_length": 256.0, "epoch": 4.197684036393714, "grad_norm": 0.19374418258666992, "kl": 0.072998046875, "learning_rate": 7.641858176712241e-07, "loss": 0.0007289871573448181, "memory(GiB)": 38.13, "reward": 0.5964989066123962, "reward_std": 0.03232577443122864, "rewards/VisualizationJSONCombinedORM/mean": 0.5964989066123962, "rewards/VisualizationJSONCombinedORM/std": 0.13479486107826233, "step": 5075, "train_speed(iter/s)": 0.090865 }, { "epoch": 4.197684036393714, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 367.5833333333333, "eval_completions/mean_length": 305.5625, "eval_completions/min_length": 255.29166666666666, "eval_kl": 0.08289591471354167, "eval_loss": 0.0008271988481283188, "eval_reward": 0.4419046528637409, "eval_reward_std": 0.05752596140761549, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4419046528637409, "eval_rewards/VisualizationJSONCombinedORM/std": 0.057525960980759315, "eval_runtime": 313.0698, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 5075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 326.125, "completions/min_length": 254.0, "epoch": 4.198511166253102, "grad_norm": 0.25550711154937744, "kl": 0.08172607421875, "learning_rate": 7.626523026288279e-07, "loss": 0.0008179843425750732, "memory(GiB)": 38.13, "reward": 0.6639910936355591, "reward_std": 0.07906349003314972, "rewards/VisualizationJSONCombinedORM/mean": 0.6639910936355591, "rewards/VisualizationJSONCombinedORM/std": 0.17874445021152496, "step": 5076, "train_speed(iter/s)": 0.09033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 313.75, "completions/min_length": 237.0, "epoch": 4.19933829611249, "grad_norm": 0.18927128612995148, "kl": 0.0604248046875, "learning_rate": 7.611202007617241e-07, "loss": 0.0006042085587978363, "memory(GiB)": 38.13, "reward": 0.44334521889686584, "reward_std": 0.0275276992470026, "rewards/VisualizationJSONCombinedORM/mean": 0.44334521889686584, "rewards/VisualizationJSONCombinedORM/std": 0.3010423183441162, "step": 5077, "train_speed(iter/s)": 0.090303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 285.3125, "completions/min_length": 219.0, "epoch": 4.2001654259718775, "grad_norm": 0.1994229555130005, "kl": 0.0574951171875, "learning_rate": 7.595895125808749e-07, "loss": 0.0005766786634922028, "memory(GiB)": 38.13, "reward": 0.6881206631660461, "reward_std": 0.0748467743396759, "rewards/VisualizationJSONCombinedORM/mean": 0.6881206631660461, "rewards/VisualizationJSONCombinedORM/std": 0.14462922513484955, "step": 5078, "train_speed(iter/s)": 0.09027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 310.25, "completions/min_length": 251.0, "epoch": 4.200992555831266, "grad_norm": 0.1917998194694519, "kl": 0.13189697265625, "learning_rate": 7.58060238596774e-07, "loss": 0.0013167932629585266, "memory(GiB)": 38.13, "reward": 0.39343032240867615, "reward_std": 0.04770590364933014, "rewards/VisualizationJSONCombinedORM/mean": 0.39343032240867615, "rewards/VisualizationJSONCombinedORM/std": 0.18311959505081177, "step": 5079, "train_speed(iter/s)": 0.090247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 314.25, "completions/min_length": 222.0, "epoch": 4.201819685690653, "grad_norm": 0.2514728307723999, "kl": 0.07110595703125, "learning_rate": 7.565323793194373e-07, "loss": 0.00070982426404953, "memory(GiB)": 38.13, "reward": 0.6528012752532959, "reward_std": 0.06408152729272842, "rewards/VisualizationJSONCombinedORM/mean": 0.6528012752532959, "rewards/VisualizationJSONCombinedORM/std": 0.14247387647628784, "step": 5080, "train_speed(iter/s)": 0.090221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 303.25, "completions/min_length": 252.0, "epoch": 4.202646815550041, "grad_norm": 0.21493574976921082, "kl": 0.04974365234375, "learning_rate": 7.550059352584182e-07, "loss": 0.0004962831735610962, "memory(GiB)": 38.13, "reward": 0.50932776927948, "reward_std": 0.06296303868293762, "rewards/VisualizationJSONCombinedORM/mean": 0.50932776927948, "rewards/VisualizationJSONCombinedORM/std": 0.07274266332387924, "step": 5081, "train_speed(iter/s)": 0.090193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 293.0, "completions/min_length": 233.0, "epoch": 4.20347394540943, "grad_norm": 0.2587360441684723, "kl": 0.07220458984375, "learning_rate": 7.534809069227894e-07, "loss": 0.0007224157452583313, "memory(GiB)": 38.13, "reward": 0.6376641988754272, "reward_std": 0.07521598041057587, "rewards/VisualizationJSONCombinedORM/mean": 0.6376641988754272, "rewards/VisualizationJSONCombinedORM/std": 0.11423281580209732, "step": 5082, "train_speed(iter/s)": 0.090177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 313.875, "completions/min_length": 233.0, "epoch": 4.204301075268817, "grad_norm": 0.20091184973716736, "kl": 0.0738525390625, "learning_rate": 7.519572948211556e-07, "loss": 0.0007403120398521423, "memory(GiB)": 38.13, "reward": 0.5607317686080933, "reward_std": 0.05446441471576691, "rewards/VisualizationJSONCombinedORM/mean": 0.5607317686080933, "rewards/VisualizationJSONCombinedORM/std": 0.06690215319395065, "step": 5083, "train_speed(iter/s)": 0.090149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 313.0625, "completions/min_length": 251.0, "epoch": 4.205128205128205, "grad_norm": 0.1903037279844284, "kl": 0.0733642578125, "learning_rate": 7.504350994616488e-07, "loss": 0.0007324293255805969, "memory(GiB)": 38.13, "reward": 0.48300325870513916, "reward_std": 0.04105617478489876, "rewards/VisualizationJSONCombinedORM/mean": 0.48300325870513916, "rewards/VisualizationJSONCombinedORM/std": 0.04166365787386894, "step": 5084, "train_speed(iter/s)": 0.090116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 333.25, "completions/min_length": 258.0, "epoch": 4.205955334987593, "grad_norm": 0.226377934217453, "kl": 0.06005859375, "learning_rate": 7.489143213519301e-07, "loss": 0.0006008706986904144, "memory(GiB)": 38.13, "reward": 0.39093315601348877, "reward_std": 0.04920639842748642, "rewards/VisualizationJSONCombinedORM/mean": 0.39093315601348877, "rewards/VisualizationJSONCombinedORM/std": 0.07815814018249512, "step": 5085, "train_speed(iter/s)": 0.090092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 285.75, "completions/min_length": 236.0, "epoch": 4.206782464846981, "grad_norm": 0.1679670214653015, "kl": 0.0355224609375, "learning_rate": 7.473949609991832e-07, "loss": 0.0003546029329299927, "memory(GiB)": 38.13, "reward": 0.7719916701316833, "reward_std": 0.04497050493955612, "rewards/VisualizationJSONCombinedORM/mean": 0.7719916701316833, "rewards/VisualizationJSONCombinedORM/std": 0.061720553785562515, "step": 5086, "train_speed(iter/s)": 0.09007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 289.4375, "completions/min_length": 236.0, "epoch": 4.207609594706369, "grad_norm": 0.18477049469947815, "kl": 0.213134765625, "learning_rate": 7.458770189101228e-07, "loss": 0.0021325983107089996, "memory(GiB)": 38.13, "reward": 0.3909187912940979, "reward_std": 0.02188846468925476, "rewards/VisualizationJSONCombinedORM/mean": 0.3909187912940979, "rewards/VisualizationJSONCombinedORM/std": 0.19880065321922302, "step": 5087, "train_speed(iter/s)": 0.090043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 332.25, "completions/min_length": 251.0, "epoch": 4.208436724565757, "grad_norm": 0.20116059482097626, "kl": 0.09710693359375, "learning_rate": 7.4436049559099e-07, "loss": 0.0009709224104881287, "memory(GiB)": 38.13, "reward": 0.6315712928771973, "reward_std": 0.10215108096599579, "rewards/VisualizationJSONCombinedORM/mean": 0.6315712928771973, "rewards/VisualizationJSONCombinedORM/std": 0.1304970532655716, "step": 5088, "train_speed(iter/s)": 0.090018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 294.5625, "completions/min_length": 224.0, "epoch": 4.209263854425145, "grad_norm": 0.181348517537117, "kl": 0.04986572265625, "learning_rate": 7.428453915475542e-07, "loss": 0.0004989206790924072, "memory(GiB)": 38.13, "reward": 0.48933982849121094, "reward_std": 0.018595661967992783, "rewards/VisualizationJSONCombinedORM/mean": 0.48933982849121094, "rewards/VisualizationJSONCombinedORM/std": 0.296176940202713, "step": 5089, "train_speed(iter/s)": 0.089993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 315.5, "completions/min_length": 225.0, "epoch": 4.210090984284533, "grad_norm": 0.32327625155448914, "kl": 0.047119140625, "learning_rate": 7.413317072851051e-07, "loss": 0.00047091394662857056, "memory(GiB)": 38.13, "reward": 0.5430662631988525, "reward_std": 0.10127997398376465, "rewards/VisualizationJSONCombinedORM/mean": 0.5430662631988525, "rewards/VisualizationJSONCombinedORM/std": 0.241085022687912, "step": 5090, "train_speed(iter/s)": 0.089963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 318.25, "completions/min_length": 243.0, "epoch": 4.2109181141439205, "grad_norm": 0.20847971737384796, "kl": 0.1220703125, "learning_rate": 7.398194433084688e-07, "loss": 0.0012240875512361526, "memory(GiB)": 38.13, "reward": 0.3505147695541382, "reward_std": 0.04359498992562294, "rewards/VisualizationJSONCombinedORM/mean": 0.3505147695541382, "rewards/VisualizationJSONCombinedORM/std": 0.1117783859372139, "step": 5091, "train_speed(iter/s)": 0.089942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 304.75, "completions/min_length": 259.0, "epoch": 4.211745244003309, "grad_norm": 0.17259149253368378, "kl": 0.0404052734375, "learning_rate": 7.383086001219886e-07, "loss": 0.00040410831570625305, "memory(GiB)": 38.13, "reward": 0.3680747151374817, "reward_std": 0.03306388854980469, "rewards/VisualizationJSONCombinedORM/mean": 0.3680747151374817, "rewards/VisualizationJSONCombinedORM/std": 0.07733297348022461, "step": 5092, "train_speed(iter/s)": 0.089914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 277.8125, "completions/min_length": 230.0, "epoch": 4.212572373862696, "grad_norm": 0.15710777044296265, "kl": 0.05267333984375, "learning_rate": 7.367991782295392e-07, "loss": 0.0005264654755592346, "memory(GiB)": 38.13, "reward": 0.6458770036697388, "reward_std": 0.06728850305080414, "rewards/VisualizationJSONCombinedORM/mean": 0.6458770036697388, "rewards/VisualizationJSONCombinedORM/std": 0.15417152643203735, "step": 5093, "train_speed(iter/s)": 0.089892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 308.0625, "completions/min_length": 251.0, "epoch": 4.213399503722084, "grad_norm": 0.2376195788383484, "kl": 0.1331787109375, "learning_rate": 7.352911781345201e-07, "loss": 0.0013308078050613403, "memory(GiB)": 38.13, "reward": 0.6926762461662292, "reward_std": 0.08217597007751465, "rewards/VisualizationJSONCombinedORM/mean": 0.6926762461662292, "rewards/VisualizationJSONCombinedORM/std": 0.08173912018537521, "step": 5094, "train_speed(iter/s)": 0.089866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 300.25, "completions/min_length": 253.0, "epoch": 4.214226633581473, "grad_norm": 0.16673386096954346, "kl": 0.112060546875, "learning_rate": 7.337846003398568e-07, "loss": 0.0011209528893232346, "memory(GiB)": 38.13, "reward": 0.3205469250679016, "reward_std": 0.02115633152425289, "rewards/VisualizationJSONCombinedORM/mean": 0.3205469250679016, "rewards/VisualizationJSONCombinedORM/std": 0.06734258681535721, "step": 5095, "train_speed(iter/s)": 0.089844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 318.6875, "completions/min_length": 249.0, "epoch": 4.21505376344086, "grad_norm": 0.17890888452529907, "kl": 0.07366943359375, "learning_rate": 7.32279445347997e-07, "loss": 0.0007353685796260834, "memory(GiB)": 38.13, "reward": 0.37767767906188965, "reward_std": 0.024432823061943054, "rewards/VisualizationJSONCombinedORM/mean": 0.37767767906188965, "rewards/VisualizationJSONCombinedORM/std": 0.17345905303955078, "step": 5096, "train_speed(iter/s)": 0.089815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 334.25, "completions/min_length": 235.0, "epoch": 4.215880893300248, "grad_norm": 0.18085229396820068, "kl": 0.06292724609375, "learning_rate": 7.307757136609218e-07, "loss": 0.0006291568279266357, "memory(GiB)": 38.13, "reward": 0.4237247705459595, "reward_std": 0.03922726958990097, "rewards/VisualizationJSONCombinedORM/mean": 0.4237247705459595, "rewards/VisualizationJSONCombinedORM/std": 0.15823699533939362, "step": 5097, "train_speed(iter/s)": 0.089787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 312.0625, "completions/min_length": 248.0, "epoch": 4.216708023159636, "grad_norm": 0.18475963175296783, "kl": 0.0921630859375, "learning_rate": 7.292734057801287e-07, "loss": 0.000922027975320816, "memory(GiB)": 38.13, "reward": 0.6082786321640015, "reward_std": 0.05201420560479164, "rewards/VisualizationJSONCombinedORM/mean": 0.6082786321640015, "rewards/VisualizationJSONCombinedORM/std": 0.13856111466884613, "step": 5098, "train_speed(iter/s)": 0.089764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 314.3125, "completions/min_length": 257.0, "epoch": 4.217535153019024, "grad_norm": 0.23058965802192688, "kl": 0.21728515625, "learning_rate": 7.277725222066467e-07, "loss": 0.002167981117963791, "memory(GiB)": 38.13, "reward": 0.44518235325813293, "reward_std": 0.04570865258574486, "rewards/VisualizationJSONCombinedORM/mean": 0.44518235325813293, "rewards/VisualizationJSONCombinedORM/std": 0.14665117859840393, "step": 5099, "train_speed(iter/s)": 0.089742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 318.125, "completions/min_length": 258.0, "epoch": 4.218362282878412, "grad_norm": 0.14085233211517334, "kl": 0.031463623046875, "learning_rate": 7.262730634410259e-07, "loss": 0.000315413111820817, "memory(GiB)": 38.13, "reward": 0.5755539536476135, "reward_std": 0.03549312800168991, "rewards/VisualizationJSONCombinedORM/mean": 0.5755539536476135, "rewards/VisualizationJSONCombinedORM/std": 0.08473464846611023, "step": 5100, "train_speed(iter/s)": 0.089715 }, { "epoch": 4.218362282878412, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 378.0416666666667, "eval_completions/mean_length": 311.9479166666667, "eval_completions/min_length": 260.7083333333333, "eval_kl": 0.07658894856770833, "eval_loss": 0.0007697765831835568, "eval_reward": 0.44188931460181874, "eval_reward_std": 0.05077755282400176, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44188931460181874, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05077755334787071, "eval_runtime": 319.9871, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 320.625, "completions/min_length": 243.0, "epoch": 4.2191894127378, "grad_norm": 0.23152117431163788, "kl": 0.05841064453125, "learning_rate": 7.24775029983345e-07, "loss": 0.0005836039781570435, "memory(GiB)": 38.13, "reward": 0.5089685916900635, "reward_std": 0.05039111524820328, "rewards/VisualizationJSONCombinedORM/mean": 0.5089685916900635, "rewards/VisualizationJSONCombinedORM/std": 0.055756039917469025, "step": 5101, "train_speed(iter/s)": 0.089178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 278.75, "completions/min_length": 247.0, "epoch": 4.220016542597188, "grad_norm": 0.28908541798591614, "kl": 0.0777587890625, "learning_rate": 7.232784223332029e-07, "loss": 0.0007764026522636414, "memory(GiB)": 38.13, "reward": 0.6791344881057739, "reward_std": 0.07074996083974838, "rewards/VisualizationJSONCombinedORM/mean": 0.6791344881057739, "rewards/VisualizationJSONCombinedORM/std": 0.0699339434504509, "step": 5102, "train_speed(iter/s)": 0.089149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 286.875, "completions/min_length": 235.0, "epoch": 4.220843672456576, "grad_norm": 0.17058345675468445, "kl": 0.0364990234375, "learning_rate": 7.217832409897263e-07, "loss": 0.00036606565117836, "memory(GiB)": 38.13, "reward": 0.7358807325363159, "reward_std": 0.04754828289151192, "rewards/VisualizationJSONCombinedORM/mean": 0.7358807325363159, "rewards/VisualizationJSONCombinedORM/std": 0.05922775715589523, "step": 5103, "train_speed(iter/s)": 0.089126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 313.125, "completions/min_length": 232.0, "epoch": 4.2216708023159635, "grad_norm": 0.18975286185741425, "kl": 0.1038818359375, "learning_rate": 7.202894864515647e-07, "loss": 0.0010406747460365295, "memory(GiB)": 38.13, "reward": 0.5841716527938843, "reward_std": 0.03779412433505058, "rewards/VisualizationJSONCombinedORM/mean": 0.5841716527938843, "rewards/VisualizationJSONCombinedORM/std": 0.19622546434402466, "step": 5104, "train_speed(iter/s)": 0.089102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 301.0625, "completions/min_length": 215.0, "epoch": 4.222497932175352, "grad_norm": 0.19883352518081665, "kl": 0.10443115234375, "learning_rate": 7.187971592168936e-07, "loss": 0.0010440442711114883, "memory(GiB)": 38.13, "reward": 0.6093020439147949, "reward_std": 0.08539294451475143, "rewards/VisualizationJSONCombinedORM/mean": 0.6093020439147949, "rewards/VisualizationJSONCombinedORM/std": 0.14639998972415924, "step": 5105, "train_speed(iter/s)": 0.089088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 322.6875, "completions/min_length": 251.0, "epoch": 4.223325062034739, "grad_norm": 0.17763568460941315, "kl": 0.05792236328125, "learning_rate": 7.17306259783408e-07, "loss": 0.0005798730999231339, "memory(GiB)": 38.13, "reward": 0.6713132858276367, "reward_std": 0.061700910329818726, "rewards/VisualizationJSONCombinedORM/mean": 0.6713132858276367, "rewards/VisualizationJSONCombinedORM/std": 0.08352421969175339, "step": 5106, "train_speed(iter/s)": 0.089065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 335.8125, "completions/min_length": 270.0, "epoch": 4.224152191894127, "grad_norm": 0.1614745706319809, "kl": 0.0567626953125, "learning_rate": 7.158167886483342e-07, "loss": 0.0005666837096214294, "memory(GiB)": 38.13, "reward": 0.42098888754844666, "reward_std": 0.024680688977241516, "rewards/VisualizationJSONCombinedORM/mean": 0.42098888754844666, "rewards/VisualizationJSONCombinedORM/std": 0.12708023190498352, "step": 5107, "train_speed(iter/s)": 0.08904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 307.75, "completions/min_length": 253.0, "epoch": 4.224979321753516, "grad_norm": 0.24592554569244385, "kl": 0.071044921875, "learning_rate": 7.143287463084142e-07, "loss": 0.0007090289145708084, "memory(GiB)": 38.13, "reward": 0.359308123588562, "reward_std": 0.06706994771957397, "rewards/VisualizationJSONCombinedORM/mean": 0.359308123588562, "rewards/VisualizationJSONCombinedORM/std": 0.10159774124622345, "step": 5108, "train_speed(iter/s)": 0.089021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 306.375, "completions/min_length": 245.0, "epoch": 4.225806451612903, "grad_norm": 0.2037731558084488, "kl": 0.06549072265625, "learning_rate": 7.128421332599189e-07, "loss": 0.0006552711129188538, "memory(GiB)": 38.13, "reward": 0.453105092048645, "reward_std": 0.03901955857872963, "rewards/VisualizationJSONCombinedORM/mean": 0.453105092048645, "rewards/VisualizationJSONCombinedORM/std": 0.07901951670646667, "step": 5109, "train_speed(iter/s)": 0.089001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 296.0625, "completions/min_length": 242.0, "epoch": 4.226633581472291, "grad_norm": 0.19597335159778595, "kl": 0.05682373046875, "learning_rate": 7.113569499986401e-07, "loss": 0.0005691088736057281, "memory(GiB)": 38.13, "reward": 0.6159120798110962, "reward_std": 0.08573172986507416, "rewards/VisualizationJSONCombinedORM/mean": 0.6159120798110962, "rewards/VisualizationJSONCombinedORM/std": 0.09397336840629578, "step": 5110, "train_speed(iter/s)": 0.088981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 339.5, "completions/min_length": 235.0, "epoch": 4.227460711331679, "grad_norm": 0.17710795998573303, "kl": 0.0845947265625, "learning_rate": 7.09873197019893e-07, "loss": 0.0008451789617538452, "memory(GiB)": 38.13, "reward": 0.4933691620826721, "reward_std": 0.04371694475412369, "rewards/VisualizationJSONCombinedORM/mean": 0.4933691620826721, "rewards/VisualizationJSONCombinedORM/std": 0.15766826272010803, "step": 5111, "train_speed(iter/s)": 0.088956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 326.375, "completions/min_length": 260.0, "epoch": 4.228287841191067, "grad_norm": 0.18658505380153656, "kl": 0.0513916015625, "learning_rate": 7.083908748185181e-07, "loss": 0.0005133487284183502, "memory(GiB)": 38.13, "reward": 0.663856029510498, "reward_std": 0.05764896422624588, "rewards/VisualizationJSONCombinedORM/mean": 0.663856029510498, "rewards/VisualizationJSONCombinedORM/std": 0.0817941352725029, "step": 5112, "train_speed(iter/s)": 0.08893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 294.4375, "completions/min_length": 235.0, "epoch": 4.229114971050455, "grad_norm": 0.2137780785560608, "kl": 0.0718994140625, "learning_rate": 7.069099838888733e-07, "loss": 0.0007202625274658203, "memory(GiB)": 38.13, "reward": 0.6691913604736328, "reward_std": 0.09206748008728027, "rewards/VisualizationJSONCombinedORM/mean": 0.6691913604736328, "rewards/VisualizationJSONCombinedORM/std": 0.10477730631828308, "step": 5113, "train_speed(iter/s)": 0.088908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 357.9375, "completions/min_length": 276.0, "epoch": 4.229942100909843, "grad_norm": 0.1903676837682724, "kl": 0.105712890625, "learning_rate": 7.054305247248467e-07, "loss": 0.0010564476251602173, "memory(GiB)": 38.13, "reward": 0.755408525466919, "reward_std": 0.09031562507152557, "rewards/VisualizationJSONCombinedORM/mean": 0.755408525466919, "rewards/VisualizationJSONCombinedORM/std": 0.09077788144350052, "step": 5114, "train_speed(iter/s)": 0.088886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 316.125, "completions/min_length": 264.0, "epoch": 4.230769230769231, "grad_norm": 0.2738427221775055, "kl": 0.064697265625, "learning_rate": 7.039524978198414e-07, "loss": 0.000647280365228653, "memory(GiB)": 38.13, "reward": 0.6577579975128174, "reward_std": 0.06921225786209106, "rewards/VisualizationJSONCombinedORM/mean": 0.6577579975128174, "rewards/VisualizationJSONCombinedORM/std": 0.1056300550699234, "step": 5115, "train_speed(iter/s)": 0.088868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 318.4375, "completions/min_length": 207.0, "epoch": 4.231596360628619, "grad_norm": 0.2048509269952774, "kl": 0.05718994140625, "learning_rate": 7.024759036667883e-07, "loss": 0.000573374330997467, "memory(GiB)": 38.13, "reward": 0.7074952721595764, "reward_std": 0.08633783459663391, "rewards/VisualizationJSONCombinedORM/mean": 0.7074952721595764, "rewards/VisualizationJSONCombinedORM/std": 0.0960192158818245, "step": 5116, "train_speed(iter/s)": 0.088846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 294.625, "completions/min_length": 259.0, "epoch": 4.2324234904880065, "grad_norm": 0.23264725506305695, "kl": 0.06964111328125, "learning_rate": 7.010007427581378e-07, "loss": 0.0006975755095481873, "memory(GiB)": 38.13, "reward": 0.57635498046875, "reward_std": 0.07219266891479492, "rewards/VisualizationJSONCombinedORM/mean": 0.57635498046875, "rewards/VisualizationJSONCombinedORM/std": 0.12926067411899567, "step": 5117, "train_speed(iter/s)": 0.088822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 305.75, "completions/min_length": 256.0, "epoch": 4.233250620347395, "grad_norm": 0.1739165484905243, "kl": 0.0509033203125, "learning_rate": 6.995270155858641e-07, "loss": 0.0005090795457363129, "memory(GiB)": 38.13, "reward": 0.22354775667190552, "reward_std": 0.01961909607052803, "rewards/VisualizationJSONCombinedORM/mean": 0.22354775667190552, "rewards/VisualizationJSONCombinedORM/std": 0.027232449501752853, "step": 5118, "train_speed(iter/s)": 0.0888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 288.375, "completions/min_length": 212.0, "epoch": 4.234077750206782, "grad_norm": 0.16767491400241852, "kl": 0.028656005859375, "learning_rate": 6.980547226414591e-07, "loss": 0.0002872981131076813, "memory(GiB)": 38.13, "reward": 0.686032772064209, "reward_std": 0.04938954859972, "rewards/VisualizationJSONCombinedORM/mean": 0.686032772064209, "rewards/VisualizationJSONCombinedORM/std": 0.07157480716705322, "step": 5119, "train_speed(iter/s)": 0.088777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 309.125, "completions/min_length": 267.0, "epoch": 4.23490488006617, "grad_norm": 0.15930995345115662, "kl": 0.05804443359375, "learning_rate": 6.965838644159434e-07, "loss": 0.0005805734544992447, "memory(GiB)": 38.13, "reward": 0.7105575203895569, "reward_std": 0.04593651741743088, "rewards/VisualizationJSONCombinedORM/mean": 0.7105575203895569, "rewards/VisualizationJSONCombinedORM/std": 0.138085275888443, "step": 5120, "train_speed(iter/s)": 0.088756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 323.3125, "completions/min_length": 278.0, "epoch": 4.235732009925559, "grad_norm": 0.16154655814170837, "kl": 0.088623046875, "learning_rate": 6.951144413998517e-07, "loss": 0.000886090099811554, "memory(GiB)": 38.13, "reward": 0.3609936833381653, "reward_std": 0.01798015646636486, "rewards/VisualizationJSONCombinedORM/mean": 0.3609936833381653, "rewards/VisualizationJSONCombinedORM/std": 0.17051339149475098, "step": 5121, "train_speed(iter/s)": 0.088722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 311.0625, "completions/min_length": 205.0, "epoch": 4.236559139784946, "grad_norm": 0.20719195902347565, "kl": 0.067626953125, "learning_rate": 6.936464540832455e-07, "loss": 0.0006754621863365173, "memory(GiB)": 38.13, "reward": 0.6710191965103149, "reward_std": 0.03511664271354675, "rewards/VisualizationJSONCombinedORM/mean": 0.6710191965103149, "rewards/VisualizationJSONCombinedORM/std": 0.07228049635887146, "step": 5122, "train_speed(iter/s)": 0.0887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 312.8125, "completions/min_length": 249.0, "epoch": 4.237386269644334, "grad_norm": 0.16263189911842346, "kl": 0.062255859375, "learning_rate": 6.921799029557042e-07, "loss": 0.0006215572357177734, "memory(GiB)": 38.13, "reward": 0.6872192621231079, "reward_std": 0.06718548387289047, "rewards/VisualizationJSONCombinedORM/mean": 0.6872192621231079, "rewards/VisualizationJSONCombinedORM/std": 0.0806296244263649, "step": 5123, "train_speed(iter/s)": 0.088672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 293.6875, "completions/min_length": 236.0, "epoch": 4.238213399503722, "grad_norm": 0.23137573897838593, "kl": 0.1710205078125, "learning_rate": 6.907147885063315e-07, "loss": 0.0017090141773223877, "memory(GiB)": 38.13, "reward": 0.40332144498825073, "reward_std": 0.04071342945098877, "rewards/VisualizationJSONCombinedORM/mean": 0.40332144498825073, "rewards/VisualizationJSONCombinedORM/std": 0.17583830654621124, "step": 5124, "train_speed(iter/s)": 0.088658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 317.75, "completions/min_length": 250.0, "epoch": 4.23904052936311, "grad_norm": 0.1391185224056244, "kl": 0.0927734375, "learning_rate": 6.892511112237472e-07, "loss": 0.0009300642996095121, "memory(GiB)": 38.13, "reward": 0.47659170627593994, "reward_std": 0.012062118388712406, "rewards/VisualizationJSONCombinedORM/mean": 0.47659170627593994, "rewards/VisualizationJSONCombinedORM/std": 0.210198774933815, "step": 5125, "train_speed(iter/s)": 0.088632 }, { "epoch": 4.23904052936311, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 380.2916666666667, "eval_completions/mean_length": 311.8072916666667, "eval_completions/min_length": 259.5416666666667, "eval_kl": 0.08482869466145833, "eval_loss": 0.0008593077654950321, "eval_reward": 0.44507823263605434, "eval_reward_std": 0.04746424435870722, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44507823263605434, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04746424414527913, "eval_runtime": 321.0638, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 316.25, "completions/min_length": 234.0, "epoch": 4.239867659222498, "grad_norm": 0.1631670743227005, "kl": 0.08251953125, "learning_rate": 6.877888715960956e-07, "loss": 0.0008249655365943909, "memory(GiB)": 38.13, "reward": 0.4017106592655182, "reward_std": 0.02536148577928543, "rewards/VisualizationJSONCombinedORM/mean": 0.4017106592655182, "rewards/VisualizationJSONCombinedORM/std": 0.04622397571802139, "step": 5126, "train_speed(iter/s)": 0.088114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 285.25, "completions/min_length": 246.0, "epoch": 4.240694789081886, "grad_norm": 0.1545323133468628, "kl": 0.1292724609375, "learning_rate": 6.863280701110409e-07, "loss": 0.0012918561697006226, "memory(GiB)": 38.13, "reward": 0.43108701705932617, "reward_std": 0.039521872997283936, "rewards/VisualizationJSONCombinedORM/mean": 0.43108701705932617, "rewards/VisualizationJSONCombinedORM/std": 0.14588037133216858, "step": 5127, "train_speed(iter/s)": 0.088101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 276.625, "completions/min_length": 228.0, "epoch": 4.241521918941274, "grad_norm": 0.17881250381469727, "kl": 0.06060791015625, "learning_rate": 6.84868707255768e-07, "loss": 0.0006047859787940979, "memory(GiB)": 38.13, "reward": 0.6474788188934326, "reward_std": 0.043490491807460785, "rewards/VisualizationJSONCombinedORM/mean": 0.6474788188934326, "rewards/VisualizationJSONCombinedORM/std": 0.10994663089513779, "step": 5128, "train_speed(iter/s)": 0.08808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 340.625, "completions/min_length": 281.0, "epoch": 4.242349048800662, "grad_norm": 0.2657186985015869, "kl": 0.1038818359375, "learning_rate": 6.834107835169784e-07, "loss": 0.0010366179049015045, "memory(GiB)": 38.13, "reward": 0.49947991967201233, "reward_std": 0.03795451670885086, "rewards/VisualizationJSONCombinedORM/mean": 0.49947991967201233, "rewards/VisualizationJSONCombinedORM/std": 0.24128974974155426, "step": 5129, "train_speed(iter/s)": 0.088053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 317.0, "completions/min_length": 249.0, "epoch": 4.2431761786600495, "grad_norm": 0.19382384419441223, "kl": 0.08392333984375, "learning_rate": 6.819542993809003e-07, "loss": 0.0008399859070777893, "memory(GiB)": 38.13, "reward": 0.5486668944358826, "reward_std": 0.06674700230360031, "rewards/VisualizationJSONCombinedORM/mean": 0.5486668944358826, "rewards/VisualizationJSONCombinedORM/std": 0.14488889276981354, "step": 5130, "train_speed(iter/s)": 0.08803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 308.125, "completions/min_length": 241.0, "epoch": 4.244003308519438, "grad_norm": 0.17566944658756256, "kl": 0.179931640625, "learning_rate": 6.804992553332746e-07, "loss": 0.0018003135919570923, "memory(GiB)": 38.13, "reward": 0.2757861018180847, "reward_std": 0.026219157502055168, "rewards/VisualizationJSONCombinedORM/mean": 0.2757861018180847, "rewards/VisualizationJSONCombinedORM/std": 0.0255193579941988, "step": 5131, "train_speed(iter/s)": 0.088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 293.625, "completions/min_length": 232.0, "epoch": 4.244830438378825, "grad_norm": 0.1874542534351349, "kl": 0.110107421875, "learning_rate": 6.790456518593669e-07, "loss": 0.0011002607643604279, "memory(GiB)": 38.13, "reward": 0.4624956548213959, "reward_std": 0.04937776178121567, "rewards/VisualizationJSONCombinedORM/mean": 0.4624956548213959, "rewards/VisualizationJSONCombinedORM/std": 0.07429875433444977, "step": 5132, "train_speed(iter/s)": 0.087981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 331.9375, "completions/min_length": 242.0, "epoch": 4.245657568238213, "grad_norm": 0.19918474555015564, "kl": 0.039825439453125, "learning_rate": 6.775934894439606e-07, "loss": 0.000398389995098114, "memory(GiB)": 38.13, "reward": 0.6820200681686401, "reward_std": 0.08492957800626755, "rewards/VisualizationJSONCombinedORM/mean": 0.6820200681686401, "rewards/VisualizationJSONCombinedORM/std": 0.08297258615493774, "step": 5133, "train_speed(iter/s)": 0.087958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 308.1875, "completions/min_length": 225.0, "epoch": 4.246484698097602, "grad_norm": 0.39715713262557983, "kl": 0.0853271484375, "learning_rate": 6.7614276857136e-07, "loss": 0.0008547678589820862, "memory(GiB)": 38.13, "reward": 0.674832820892334, "reward_std": 0.04906567186117172, "rewards/VisualizationJSONCombinedORM/mean": 0.674832820892334, "rewards/VisualizationJSONCombinedORM/std": 0.07979829609394073, "step": 5134, "train_speed(iter/s)": 0.087933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 321.8125, "completions/min_length": 275.0, "epoch": 4.247311827956989, "grad_norm": 0.2028435468673706, "kl": 0.07452392578125, "learning_rate": 6.746934897253832e-07, "loss": 0.0007460042834281921, "memory(GiB)": 38.13, "reward": 0.6521037817001343, "reward_std": 0.062027327716350555, "rewards/VisualizationJSONCombinedORM/mean": 0.6521037817001343, "rewards/VisualizationJSONCombinedORM/std": 0.1539086103439331, "step": 5135, "train_speed(iter/s)": 0.087907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 307.125, "completions/min_length": 222.0, "epoch": 4.248138957816377, "grad_norm": 0.24630749225616455, "kl": 0.0469970703125, "learning_rate": 6.732456533893767e-07, "loss": 0.00046953000128269196, "memory(GiB)": 38.13, "reward": 0.7088414430618286, "reward_std": 0.0691700130701065, "rewards/VisualizationJSONCombinedORM/mean": 0.7088414430618286, "rewards/VisualizationJSONCombinedORM/std": 0.07983829826116562, "step": 5136, "train_speed(iter/s)": 0.087884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 307.875, "completions/min_length": 257.0, "epoch": 4.248966087675765, "grad_norm": 0.1905180960893631, "kl": 0.06292724609375, "learning_rate": 6.71799260046197e-07, "loss": 0.0006300956010818481, "memory(GiB)": 38.13, "reward": 0.39081498980522156, "reward_std": 0.03020627051591873, "rewards/VisualizationJSONCombinedORM/mean": 0.39081498980522156, "rewards/VisualizationJSONCombinedORM/std": 0.03943469002842903, "step": 5137, "train_speed(iter/s)": 0.087856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 319.25, "completions/min_length": 255.0, "epoch": 4.249793217535153, "grad_norm": 0.18512704968452454, "kl": 0.1544189453125, "learning_rate": 6.703543101782244e-07, "loss": 0.0015459144487977028, "memory(GiB)": 38.13, "reward": 0.23786643147468567, "reward_std": 0.020016096532344818, "rewards/VisualizationJSONCombinedORM/mean": 0.23786643147468567, "rewards/VisualizationJSONCombinedORM/std": 0.021183371543884277, "step": 5138, "train_speed(iter/s)": 0.087832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 344.1875, "completions/min_length": 258.0, "epoch": 4.250620347394541, "grad_norm": 0.16386638581752777, "kl": 0.04998779296875, "learning_rate": 6.689108042673564e-07, "loss": 0.0004998594522476196, "memory(GiB)": 38.13, "reward": 0.5478776693344116, "reward_std": 0.022598423063755035, "rewards/VisualizationJSONCombinedORM/mean": 0.5478776693344116, "rewards/VisualizationJSONCombinedORM/std": 0.1636863648891449, "step": 5139, "train_speed(iter/s)": 0.087809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 315.625, "completions/min_length": 242.0, "epoch": 4.251447477253929, "grad_norm": 0.22276006639003754, "kl": 0.23095703125, "learning_rate": 6.6746874279501e-07, "loss": 0.002312947064638138, "memory(GiB)": 38.13, "reward": 0.4822548031806946, "reward_std": 0.11247891187667847, "rewards/VisualizationJSONCombinedORM/mean": 0.4822548031806946, "rewards/VisualizationJSONCombinedORM/std": 0.11689293384552002, "step": 5140, "train_speed(iter/s)": 0.087778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 293.75, "completions/min_length": 231.0, "epoch": 4.252274607113317, "grad_norm": 0.23360364139080048, "kl": 0.09716796875, "learning_rate": 6.66028126242117e-07, "loss": 0.0009710956364870071, "memory(GiB)": 38.13, "reward": 0.43464672565460205, "reward_std": 0.06601858139038086, "rewards/VisualizationJSONCombinedORM/mean": 0.43464672565460205, "rewards/VisualizationJSONCombinedORM/std": 0.15337927639484406, "step": 5141, "train_speed(iter/s)": 0.087759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 305.3125, "completions/min_length": 241.0, "epoch": 4.253101736972705, "grad_norm": 0.2194070965051651, "kl": 0.034515380859375, "learning_rate": 6.64588955089131e-07, "loss": 0.0003452785313129425, "memory(GiB)": 38.13, "reward": 0.3361999988555908, "reward_std": 0.04022638872265816, "rewards/VisualizationJSONCombinedORM/mean": 0.3361999988555908, "rewards/VisualizationJSONCombinedORM/std": 0.16130201518535614, "step": 5142, "train_speed(iter/s)": 0.087736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 278.5, "completions/min_length": 236.0, "epoch": 4.2539288668320925, "grad_norm": 0.19893790781497955, "kl": 0.1724853515625, "learning_rate": 6.631512298160225e-07, "loss": 0.0017245709896087646, "memory(GiB)": 38.13, "reward": 0.5125130414962769, "reward_std": 0.07755796611309052, "rewards/VisualizationJSONCombinedORM/mean": 0.5125130414962769, "rewards/VisualizationJSONCombinedORM/std": 0.07531756162643433, "step": 5143, "train_speed(iter/s)": 0.087708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 317.25, "completions/min_length": 246.0, "epoch": 4.254755996691481, "grad_norm": 0.18583817780017853, "kl": 0.083251953125, "learning_rate": 6.617149509022807e-07, "loss": 0.0008329078555107117, "memory(GiB)": 38.13, "reward": 0.3244783878326416, "reward_std": 0.02673729509115219, "rewards/VisualizationJSONCombinedORM/mean": 0.3244783878326416, "rewards/VisualizationJSONCombinedORM/std": 0.028994476422667503, "step": 5144, "train_speed(iter/s)": 0.087692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 304.0625, "completions/min_length": 230.0, "epoch": 4.255583126550868, "grad_norm": 0.20714929699897766, "kl": 0.04913330078125, "learning_rate": 6.602801188269081e-07, "loss": 0.0004911571741104126, "memory(GiB)": 38.13, "reward": 0.5796345472335815, "reward_std": 0.0266568586230278, "rewards/VisualizationJSONCombinedORM/mean": 0.5796345472335815, "rewards/VisualizationJSONCombinedORM/std": 0.14652763307094574, "step": 5145, "train_speed(iter/s)": 0.087664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 316.4375, "completions/min_length": 245.0, "epoch": 4.256410256410256, "grad_norm": 0.21053661406040192, "kl": 0.038330078125, "learning_rate": 6.588467340684324e-07, "loss": 0.00038318149745464325, "memory(GiB)": 38.13, "reward": 0.4166443645954132, "reward_std": 0.03497745096683502, "rewards/VisualizationJSONCombinedORM/mean": 0.4166443645954132, "rewards/VisualizationJSONCombinedORM/std": 0.148084819316864, "step": 5146, "train_speed(iter/s)": 0.087649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 332.6875, "completions/min_length": 276.0, "epoch": 4.257237386269645, "grad_norm": 0.16462473571300507, "kl": 0.04852294921875, "learning_rate": 6.574147971048899e-07, "loss": 0.00048530474305152893, "memory(GiB)": 38.13, "reward": 0.4507179260253906, "reward_std": 0.03794044256210327, "rewards/VisualizationJSONCombinedORM/mean": 0.4507179260253906, "rewards/VisualizationJSONCombinedORM/std": 0.2570021152496338, "step": 5147, "train_speed(iter/s)": 0.087624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 332.75, "completions/min_length": 261.0, "epoch": 4.258064516129032, "grad_norm": 0.23115307092666626, "kl": 0.05841064453125, "learning_rate": 6.559843084138406e-07, "loss": 0.0005838721990585327, "memory(GiB)": 38.13, "reward": 0.6566911935806274, "reward_std": 0.04760785773396492, "rewards/VisualizationJSONCombinedORM/mean": 0.6566911935806274, "rewards/VisualizationJSONCombinedORM/std": 0.08443880826234818, "step": 5148, "train_speed(iter/s)": 0.087597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 319.9375, "completions/min_length": 262.0, "epoch": 4.25889164598842, "grad_norm": 0.18721385300159454, "kl": 0.08172607421875, "learning_rate": 6.545552684723583e-07, "loss": 0.0008190497756004333, "memory(GiB)": 38.13, "reward": 0.5670344829559326, "reward_std": 0.05259787663817406, "rewards/VisualizationJSONCombinedORM/mean": 0.5670344829559326, "rewards/VisualizationJSONCombinedORM/std": 0.13378764688968658, "step": 5149, "train_speed(iter/s)": 0.087577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 300.4375, "completions/min_length": 236.0, "epoch": 4.259718775847809, "grad_norm": 0.20189695060253143, "kl": 0.1439208984375, "learning_rate": 6.531276777570361e-07, "loss": 0.001437261700630188, "memory(GiB)": 38.13, "reward": 0.5236215591430664, "reward_std": 0.10484066605567932, "rewards/VisualizationJSONCombinedORM/mean": 0.5236215591430664, "rewards/VisualizationJSONCombinedORM/std": 0.114084392786026, "step": 5150, "train_speed(iter/s)": 0.08756 }, { "epoch": 4.259718775847809, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 386.25, "eval_completions/mean_length": 310.296875, "eval_completions/min_length": 257.7083333333333, "eval_kl": 0.08583577473958333, "eval_loss": 0.0008509333129040897, "eval_reward": 0.4436113741248846, "eval_reward_std": 0.05798076636468371, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4436113741248846, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05798076648109903, "eval_runtime": 324.8352, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.009, "step": 5150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 300.375, "completions/min_length": 228.0, "epoch": 4.260545905707196, "grad_norm": 0.1612955629825592, "kl": 0.0369873046875, "learning_rate": 6.517015367439788e-07, "loss": 0.00037093088030815125, "memory(GiB)": 38.13, "reward": 0.4891760051250458, "reward_std": 0.024615267291665077, "rewards/VisualizationJSONCombinedORM/mean": 0.4891760051250458, "rewards/VisualizationJSONCombinedORM/std": 0.0782124325633049, "step": 5151, "train_speed(iter/s)": 0.087056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 342.6875, "completions/min_length": 278.0, "epoch": 4.261373035566584, "grad_norm": 0.23689861595630646, "kl": 0.1231689453125, "learning_rate": 6.502768459088154e-07, "loss": 0.0012330692261457443, "memory(GiB)": 38.13, "reward": 0.6449617147445679, "reward_std": 0.06701777875423431, "rewards/VisualizationJSONCombinedORM/mean": 0.6449617147445679, "rewards/VisualizationJSONCombinedORM/std": 0.07017587870359421, "step": 5152, "train_speed(iter/s)": 0.087033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 318.5625, "completions/min_length": 236.0, "epoch": 4.262200165425972, "grad_norm": 0.19235454499721527, "kl": 0.037445068359375, "learning_rate": 6.488536057266842e-07, "loss": 0.00037407130002975464, "memory(GiB)": 38.13, "reward": 0.4563795328140259, "reward_std": 0.027587680146098137, "rewards/VisualizationJSONCombinedORM/mean": 0.4563795328140259, "rewards/VisualizationJSONCombinedORM/std": 0.09219025820493698, "step": 5153, "train_speed(iter/s)": 0.087007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 291.9375, "completions/min_length": 228.0, "epoch": 4.26302729528536, "grad_norm": 0.16760969161987305, "kl": 0.04876708984375, "learning_rate": 6.474318166722427e-07, "loss": 0.0004875361919403076, "memory(GiB)": 38.13, "reward": 0.6237318515777588, "reward_std": 0.06507384777069092, "rewards/VisualizationJSONCombinedORM/mean": 0.6237318515777588, "rewards/VisualizationJSONCombinedORM/std": 0.11302933096885681, "step": 5154, "train_speed(iter/s)": 0.086982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 314.5, "completions/min_length": 255.0, "epoch": 4.263854425144748, "grad_norm": 0.18909017741680145, "kl": 0.0765380859375, "learning_rate": 6.460114792196642e-07, "loss": 0.0007664412260055542, "memory(GiB)": 38.13, "reward": 0.4903860092163086, "reward_std": 0.04615286737680435, "rewards/VisualizationJSONCombinedORM/mean": 0.4903860092163086, "rewards/VisualizationJSONCombinedORM/std": 0.05715867877006531, "step": 5155, "train_speed(iter/s)": 0.086952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 264.75, "completions/min_length": 213.0, "epoch": 4.2646815550041355, "grad_norm": 0.18844833970069885, "kl": 0.0733642578125, "learning_rate": 6.445925938426401e-07, "loss": 0.0007331520318984985, "memory(GiB)": 38.13, "reward": 0.5296232104301453, "reward_std": 0.05065010488033295, "rewards/VisualizationJSONCombinedORM/mean": 0.5296232104301453, "rewards/VisualizationJSONCombinedORM/std": 0.23812855780124664, "step": 5156, "train_speed(iter/s)": 0.086926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 302.5625, "completions/min_length": 213.0, "epoch": 4.265508684863524, "grad_norm": 0.17788703739643097, "kl": 0.05230712890625, "learning_rate": 6.431751610143716e-07, "loss": 0.0005236640572547913, "memory(GiB)": 38.13, "reward": 0.5553996562957764, "reward_std": 0.06319096684455872, "rewards/VisualizationJSONCombinedORM/mean": 0.5553996562957764, "rewards/VisualizationJSONCombinedORM/std": 0.062447838485240936, "step": 5157, "train_speed(iter/s)": 0.086902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 321.5625, "completions/min_length": 243.0, "epoch": 4.266335814722911, "grad_norm": 0.22828775644302368, "kl": 0.201171875, "learning_rate": 6.417591812075818e-07, "loss": 0.0020143981091678143, "memory(GiB)": 38.13, "reward": 0.5856452584266663, "reward_std": 0.04081696271896362, "rewards/VisualizationJSONCombinedORM/mean": 0.5856452584266663, "rewards/VisualizationJSONCombinedORM/std": 0.1534985452890396, "step": 5158, "train_speed(iter/s)": 0.086878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 285.5, "completions/min_length": 225.0, "epoch": 4.267162944582299, "grad_norm": 0.3043261468410492, "kl": 0.08197021484375, "learning_rate": 6.403446548945052e-07, "loss": 0.0008198842406272888, "memory(GiB)": 38.13, "reward": 0.4436057507991791, "reward_std": 0.08723051846027374, "rewards/VisualizationJSONCombinedORM/mean": 0.4436057507991791, "rewards/VisualizationJSONCombinedORM/std": 0.16059303283691406, "step": 5159, "train_speed(iter/s)": 0.086859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 307.3125, "completions/min_length": 231.0, "epoch": 4.267990074441688, "grad_norm": 0.18833744525909424, "kl": 0.063720703125, "learning_rate": 6.38931582546895e-07, "loss": 0.000636618584394455, "memory(GiB)": 38.13, "reward": 0.7033271789550781, "reward_std": 0.07952328026294708, "rewards/VisualizationJSONCombinedORM/mean": 0.7033271789550781, "rewards/VisualizationJSONCombinedORM/std": 0.11588913947343826, "step": 5160, "train_speed(iter/s)": 0.086834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 302.375, "completions/min_length": 223.0, "epoch": 4.268817204301075, "grad_norm": 0.17546558380126953, "kl": 0.10089111328125, "learning_rate": 6.375199646360142e-07, "loss": 0.0010100416839122772, "memory(GiB)": 38.13, "reward": 0.5676885843276978, "reward_std": 0.06435421109199524, "rewards/VisualizationJSONCombinedORM/mean": 0.5676885843276978, "rewards/VisualizationJSONCombinedORM/std": 0.209966242313385, "step": 5161, "train_speed(iter/s)": 0.086811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 325.125, "completions/min_length": 265.0, "epoch": 4.269644334160463, "grad_norm": 0.16569465398788452, "kl": 0.04986572265625, "learning_rate": 6.361098016326478e-07, "loss": 0.0004985183477401733, "memory(GiB)": 38.13, "reward": 0.46300655603408813, "reward_std": 0.018884770572185516, "rewards/VisualizationJSONCombinedORM/mean": 0.46300655603408813, "rewards/VisualizationJSONCombinedORM/std": 0.17851240932941437, "step": 5162, "train_speed(iter/s)": 0.086792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 318.125, "completions/min_length": 248.0, "epoch": 4.270471464019851, "grad_norm": 0.19847097992897034, "kl": 0.07421875, "learning_rate": 6.347010940070886e-07, "loss": 0.0007427707314491272, "memory(GiB)": 38.13, "reward": 0.6570196151733398, "reward_std": 0.0904320776462555, "rewards/VisualizationJSONCombinedORM/mean": 0.6570196151733398, "rewards/VisualizationJSONCombinedORM/std": 0.08852168172597885, "step": 5163, "train_speed(iter/s)": 0.086774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 295.0625, "completions/min_length": 234.0, "epoch": 4.271298593879239, "grad_norm": 0.22091390192508698, "kl": 0.07342529296875, "learning_rate": 6.332938422291485e-07, "loss": 0.0007358528673648834, "memory(GiB)": 38.13, "reward": 0.5511068105697632, "reward_std": 0.08854783326387405, "rewards/VisualizationJSONCombinedORM/mean": 0.5511068105697632, "rewards/VisualizationJSONCombinedORM/std": 0.1861412078142166, "step": 5164, "train_speed(iter/s)": 0.086749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 351.3125, "completions/min_length": 263.0, "epoch": 4.272125723738627, "grad_norm": 0.18858623504638672, "kl": 0.0572509765625, "learning_rate": 6.318880467681527e-07, "loss": 0.0005728229880332947, "memory(GiB)": 38.13, "reward": 0.581259548664093, "reward_std": 0.17337045073509216, "rewards/VisualizationJSONCombinedORM/mean": 0.581259548664093, "rewards/VisualizationJSONCombinedORM/std": 0.18493054807186127, "step": 5165, "train_speed(iter/s)": 0.086723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 312.9375, "completions/min_length": 236.0, "epoch": 4.272952853598015, "grad_norm": 0.17956112325191498, "kl": 0.05438232421875, "learning_rate": 6.30483708092941e-07, "loss": 0.0005425512790679932, "memory(GiB)": 38.13, "reward": 0.6306285262107849, "reward_std": 0.045649245381355286, "rewards/VisualizationJSONCombinedORM/mean": 0.6306285262107849, "rewards/VisualizationJSONCombinedORM/std": 0.13665547966957092, "step": 5166, "train_speed(iter/s)": 0.086704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 316.8125, "completions/min_length": 241.0, "epoch": 4.273779983457403, "grad_norm": 0.22591237723827362, "kl": 0.06463623046875, "learning_rate": 6.290808266718639e-07, "loss": 0.0006448337808251381, "memory(GiB)": 38.13, "reward": 0.550432026386261, "reward_std": 0.09503746777772903, "rewards/VisualizationJSONCombinedORM/mean": 0.550432026386261, "rewards/VisualizationJSONCombinedORM/std": 0.17092221975326538, "step": 5167, "train_speed(iter/s)": 0.086682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 287.5625, "completions/min_length": 250.0, "epoch": 4.274607113316791, "grad_norm": 0.1843091994524002, "kl": 0.06488037109375, "learning_rate": 6.276794029727939e-07, "loss": 0.0006481632590293884, "memory(GiB)": 38.13, "reward": 0.5294099450111389, "reward_std": 0.04682830348610878, "rewards/VisualizationJSONCombinedORM/mean": 0.5294099450111389, "rewards/VisualizationJSONCombinedORM/std": 0.13098672032356262, "step": 5168, "train_speed(iter/s)": 0.086662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 326.5, "completions/min_length": 252.0, "epoch": 4.2754342431761785, "grad_norm": 0.15999411046504974, "kl": 0.041748046875, "learning_rate": 6.262794374631082e-07, "loss": 0.00041688233613967896, "memory(GiB)": 38.13, "reward": 0.5886598229408264, "reward_std": 0.04906078428030014, "rewards/VisualizationJSONCombinedORM/mean": 0.5886598229408264, "rewards/VisualizationJSONCombinedORM/std": 0.12379282712936401, "step": 5169, "train_speed(iter/s)": 0.086637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 321.625, "completions/min_length": 269.0, "epoch": 4.276261373035567, "grad_norm": 0.16726958751678467, "kl": 0.04705810546875, "learning_rate": 6.248809306097036e-07, "loss": 0.0004710718058049679, "memory(GiB)": 38.13, "reward": 0.6692502498626709, "reward_std": 0.04931158199906349, "rewards/VisualizationJSONCombinedORM/mean": 0.6692502498626709, "rewards/VisualizationJSONCombinedORM/std": 0.160991370677948, "step": 5170, "train_speed(iter/s)": 0.086614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 330.375, "completions/min_length": 230.0, "epoch": 4.277088502894954, "grad_norm": 0.1719483584165573, "kl": 0.2059326171875, "learning_rate": 6.234838828789886e-07, "loss": 0.0020578093826770782, "memory(GiB)": 38.13, "reward": 0.44363051652908325, "reward_std": 0.055135633796453476, "rewards/VisualizationJSONCombinedORM/mean": 0.44363051652908325, "rewards/VisualizationJSONCombinedORM/std": 0.18832197785377502, "step": 5171, "train_speed(iter/s)": 0.086595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 307.0, "completions/min_length": 254.0, "epoch": 4.277915632754342, "grad_norm": 0.18725018203258514, "kl": 0.03570556640625, "learning_rate": 6.220882947368856e-07, "loss": 0.00035616010427474976, "memory(GiB)": 38.13, "reward": 0.695340633392334, "reward_std": 0.04970473423600197, "rewards/VisualizationJSONCombinedORM/mean": 0.695340633392334, "rewards/VisualizationJSONCombinedORM/std": 0.07684478908777237, "step": 5172, "train_speed(iter/s)": 0.086575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 321.0625, "completions/min_length": 256.0, "epoch": 4.278742762613731, "grad_norm": 0.1696311980485916, "kl": 0.04827880859375, "learning_rate": 6.206941666488287e-07, "loss": 0.0004822760820388794, "memory(GiB)": 38.13, "reward": 0.6901786923408508, "reward_std": 0.03819561377167702, "rewards/VisualizationJSONCombinedORM/mean": 0.6901786923408508, "rewards/VisualizationJSONCombinedORM/std": 0.11016692221164703, "step": 5173, "train_speed(iter/s)": 0.086555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 328.9375, "completions/min_length": 281.0, "epoch": 4.279569892473118, "grad_norm": 0.22206120193004608, "kl": 0.224853515625, "learning_rate": 6.193014990797663e-07, "loss": 0.0022474415600299835, "memory(GiB)": 38.13, "reward": 0.47910988330841064, "reward_std": 0.060179874300956726, "rewards/VisualizationJSONCombinedORM/mean": 0.47910988330841064, "rewards/VisualizationJSONCombinedORM/std": 0.07240699976682663, "step": 5174, "train_speed(iter/s)": 0.086534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 329.1875, "completions/min_length": 265.0, "epoch": 4.280397022332506, "grad_norm": 0.16260835528373718, "kl": 0.037353515625, "learning_rate": 6.179102924941599e-07, "loss": 0.00037381798028945923, "memory(GiB)": 38.13, "reward": 0.5458859205245972, "reward_std": 0.017641402781009674, "rewards/VisualizationJSONCombinedORM/mean": 0.5458859205245972, "rewards/VisualizationJSONCombinedORM/std": 0.32454031705856323, "step": 5175, "train_speed(iter/s)": 0.086509 }, { "epoch": 4.280397022332506, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 384.4166666666667, "eval_completions/mean_length": 311.9583333333333, "eval_completions/min_length": 261.5833333333333, "eval_kl": 0.084320068359375, "eval_loss": 0.0008463244885206223, "eval_reward": 0.4455694618324439, "eval_reward_std": 0.052991659729741514, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4455694618324439, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0529916575178504, "eval_runtime": 323.9917, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.009, "step": 5175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 316.375, "completions/min_length": 248.0, "epoch": 4.281224152191895, "grad_norm": 0.15476039052009583, "kl": 0.0784912109375, "learning_rate": 6.165205473559843e-07, "loss": 0.0007841140031814575, "memory(GiB)": 38.13, "reward": 0.5458524227142334, "reward_std": 0.050112299621105194, "rewards/VisualizationJSONCombinedORM/mean": 0.5458524227142334, "rewards/VisualizationJSONCombinedORM/std": 0.0620913989841938, "step": 5176, "train_speed(iter/s)": 0.086021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 298.8125, "completions/min_length": 233.0, "epoch": 4.282051282051282, "grad_norm": 0.18361149728298187, "kl": 0.09375, "learning_rate": 6.151322641287233e-07, "loss": 0.0009359996765851974, "memory(GiB)": 38.13, "reward": 0.5304723978042603, "reward_std": 0.03484737128019333, "rewards/VisualizationJSONCombinedORM/mean": 0.5304723978042603, "rewards/VisualizationJSONCombinedORM/std": 0.05994626507163048, "step": 5177, "train_speed(iter/s)": 0.085995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 343.875, "completions/min_length": 224.0, "epoch": 4.28287841191067, "grad_norm": 0.19118866324424744, "kl": 0.134521484375, "learning_rate": 6.137454432753798e-07, "loss": 0.00134345144033432, "memory(GiB)": 38.13, "reward": 0.3963879942893982, "reward_std": 0.04255978763103485, "rewards/VisualizationJSONCombinedORM/mean": 0.3963879942893982, "rewards/VisualizationJSONCombinedORM/std": 0.10794920474290848, "step": 5178, "train_speed(iter/s)": 0.085967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 307.375, "completions/min_length": 248.0, "epoch": 4.283705541770058, "grad_norm": 0.23023058474063873, "kl": 0.109619140625, "learning_rate": 6.123600852584616e-07, "loss": 0.001096881926059723, "memory(GiB)": 38.13, "reward": 0.7179772257804871, "reward_std": 0.07139256596565247, "rewards/VisualizationJSONCombinedORM/mean": 0.7179772257804871, "rewards/VisualizationJSONCombinedORM/std": 0.08610143512487411, "step": 5179, "train_speed(iter/s)": 0.085948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 327.4375, "completions/min_length": 265.0, "epoch": 4.284532671629446, "grad_norm": 0.20789015293121338, "kl": 0.0908203125, "learning_rate": 6.10976190539993e-07, "loss": 0.0009085759520530701, "memory(GiB)": 38.13, "reward": 0.7776771187782288, "reward_std": 0.04140862077474594, "rewards/VisualizationJSONCombinedORM/mean": 0.7776771187782288, "rewards/VisualizationJSONCombinedORM/std": 0.04615640640258789, "step": 5180, "train_speed(iter/s)": 0.085924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 319.0, "completions/min_length": 236.0, "epoch": 4.285359801488834, "grad_norm": 0.1892477422952652, "kl": 0.104736328125, "learning_rate": 6.095937595815104e-07, "loss": 0.0010462775826454163, "memory(GiB)": 38.13, "reward": 0.30714473128318787, "reward_std": 0.05230540782213211, "rewards/VisualizationJSONCombinedORM/mean": 0.30714473128318787, "rewards/VisualizationJSONCombinedORM/std": 0.060166679322719574, "step": 5181, "train_speed(iter/s)": 0.085894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 317.8125, "completions/min_length": 247.0, "epoch": 4.2861869313482215, "grad_norm": 0.20851482450962067, "kl": 0.04559326171875, "learning_rate": 6.082127928440612e-07, "loss": 0.0004558190703392029, "memory(GiB)": 38.13, "reward": 0.5190631747245789, "reward_std": 0.04867297038435936, "rewards/VisualizationJSONCombinedORM/mean": 0.5190631747245789, "rewards/VisualizationJSONCombinedORM/std": 0.10010810941457748, "step": 5182, "train_speed(iter/s)": 0.085874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 287.75, "completions/min_length": 235.0, "epoch": 4.28701406120761, "grad_norm": 0.1751013994216919, "kl": 0.103240966796875, "learning_rate": 6.068332907882013e-07, "loss": 0.0010320506989955902, "memory(GiB)": 38.13, "reward": 0.20787423849105835, "reward_std": 0.014659562148153782, "rewards/VisualizationJSONCombinedORM/mean": 0.20787423849105835, "rewards/VisualizationJSONCombinedORM/std": 0.02143462933599949, "step": 5183, "train_speed(iter/s)": 0.085854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 255.8125, "completions/min_length": 220.0, "epoch": 4.287841191066997, "grad_norm": 0.1720844805240631, "kl": 0.150146484375, "learning_rate": 6.054552538740055e-07, "loss": 0.0015009045600891113, "memory(GiB)": 38.13, "reward": 0.539466381072998, "reward_std": 0.09963200986385345, "rewards/VisualizationJSONCombinedORM/mean": 0.539466381072998, "rewards/VisualizationJSONCombinedORM/std": 0.12185434252023697, "step": 5184, "train_speed(iter/s)": 0.085844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 308.4375, "completions/min_length": 271.0, "epoch": 4.288668320926385, "grad_norm": 0.2128508985042572, "kl": 0.1158447265625, "learning_rate": 6.040786825610518e-07, "loss": 0.0011578965932130814, "memory(GiB)": 38.13, "reward": 0.6401667594909668, "reward_std": 0.04577498510479927, "rewards/VisualizationJSONCombinedORM/mean": 0.6401667594909668, "rewards/VisualizationJSONCombinedORM/std": 0.1509205847978592, "step": 5185, "train_speed(iter/s)": 0.085823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 292.1875, "completions/min_length": 235.0, "epoch": 4.289495450785774, "grad_norm": 0.20492109656333923, "kl": 0.0745849609375, "learning_rate": 6.02703577308435e-07, "loss": 0.0007474906742572784, "memory(GiB)": 38.13, "reward": 0.66722172498703, "reward_std": 0.05284088850021362, "rewards/VisualizationJSONCombinedORM/mean": 0.66722172498703, "rewards/VisualizationJSONCombinedORM/std": 0.1394922435283661, "step": 5186, "train_speed(iter/s)": 0.085798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 336.75, "completions/min_length": 241.0, "epoch": 4.290322580645161, "grad_norm": 0.21820467710494995, "kl": 0.0482177734375, "learning_rate": 6.013299385747584e-07, "loss": 0.0004815738648176193, "memory(GiB)": 38.13, "reward": 0.4868524670600891, "reward_std": 0.06686747074127197, "rewards/VisualizationJSONCombinedORM/mean": 0.4868524670600891, "rewards/VisualizationJSONCombinedORM/std": 0.07489665597677231, "step": 5187, "train_speed(iter/s)": 0.085777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 324.875, "completions/min_length": 226.0, "epoch": 4.291149710504549, "grad_norm": 0.17781895399093628, "kl": 0.03668212890625, "learning_rate": 5.999577668181389e-07, "loss": 0.0003667287528514862, "memory(GiB)": 38.13, "reward": 0.49303990602493286, "reward_std": 0.044598691165447235, "rewards/VisualizationJSONCombinedORM/mean": 0.49303990602493286, "rewards/VisualizationJSONCombinedORM/std": 0.11786939948797226, "step": 5188, "train_speed(iter/s)": 0.085755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 297.125, "completions/min_length": 245.0, "epoch": 4.291976840363937, "grad_norm": 0.23767435550689697, "kl": 0.0572509765625, "learning_rate": 5.985870624961993e-07, "loss": 0.0005735233426094055, "memory(GiB)": 38.13, "reward": 0.6527536511421204, "reward_std": 0.10416547954082489, "rewards/VisualizationJSONCombinedORM/mean": 0.6527536511421204, "rewards/VisualizationJSONCombinedORM/std": 0.11526978015899658, "step": 5189, "train_speed(iter/s)": 0.085737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 296.1875, "completions/min_length": 224.0, "epoch": 4.292803970223325, "grad_norm": 0.20403625071048737, "kl": 0.06793212890625, "learning_rate": 5.972178260660771e-07, "loss": 0.0006777271628379822, "memory(GiB)": 38.13, "reward": 0.5269907712936401, "reward_std": 0.061836063861846924, "rewards/VisualizationJSONCombinedORM/mean": 0.5269907712936401, "rewards/VisualizationJSONCombinedORM/std": 0.1092325747013092, "step": 5190, "train_speed(iter/s)": 0.08572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 326.9375, "completions/min_length": 260.0, "epoch": 4.293631100082713, "grad_norm": 0.2566314935684204, "kl": 0.0640869140625, "learning_rate": 5.958500579844195e-07, "loss": 0.0006407052278518677, "memory(GiB)": 38.13, "reward": 0.5578234791755676, "reward_std": 0.05209391564130783, "rewards/VisualizationJSONCombinedORM/mean": 0.5578234791755676, "rewards/VisualizationJSONCombinedORM/std": 0.09250105917453766, "step": 5191, "train_speed(iter/s)": 0.085701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 332.1875, "completions/min_length": 244.0, "epoch": 4.294458229942101, "grad_norm": 0.1859266757965088, "kl": 0.07012939453125, "learning_rate": 5.944837587073843e-07, "loss": 0.0007021566852927208, "memory(GiB)": 38.13, "reward": 0.5518720746040344, "reward_std": 0.05512487143278122, "rewards/VisualizationJSONCombinedORM/mean": 0.5518720746040344, "rewards/VisualizationJSONCombinedORM/std": 0.1470668464899063, "step": 5192, "train_speed(iter/s)": 0.085681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 285.875, "completions/min_length": 227.0, "epoch": 4.295285359801489, "grad_norm": 0.1790972799062729, "kl": 0.05401611328125, "learning_rate": 5.931189286906358e-07, "loss": 0.0005400395020842552, "memory(GiB)": 38.13, "reward": 0.6887876987457275, "reward_std": 0.07373258471488953, "rewards/VisualizationJSONCombinedORM/mean": 0.6887876987457275, "rewards/VisualizationJSONCombinedORM/std": 0.11400869488716125, "step": 5193, "train_speed(iter/s)": 0.085662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 325.1875, "completions/min_length": 244.0, "epoch": 4.296112489660877, "grad_norm": 0.17400088906288147, "kl": 0.0802001953125, "learning_rate": 5.917555683893544e-07, "loss": 0.0008028037846088409, "memory(GiB)": 38.13, "reward": 0.5806852579116821, "reward_std": 0.06271817535161972, "rewards/VisualizationJSONCombinedORM/mean": 0.5806852579116821, "rewards/VisualizationJSONCombinedORM/std": 0.1949671357870102, "step": 5194, "train_speed(iter/s)": 0.08564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 315.75, "completions/min_length": 263.0, "epoch": 4.2969396195202645, "grad_norm": 0.1918339878320694, "kl": 0.10662841796875, "learning_rate": 5.903936782582253e-07, "loss": 0.0010662861168384552, "memory(GiB)": 38.13, "reward": 0.633742094039917, "reward_std": 0.16731640696525574, "rewards/VisualizationJSONCombinedORM/mean": 0.633742094039917, "rewards/VisualizationJSONCombinedORM/std": 0.18598754703998566, "step": 5195, "train_speed(iter/s)": 0.085623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 325.3125, "completions/min_length": 269.0, "epoch": 4.297766749379653, "grad_norm": 0.18340419232845306, "kl": 0.0550537109375, "learning_rate": 5.890332587514457e-07, "loss": 0.00054931640625, "memory(GiB)": 38.13, "reward": 0.7055981755256653, "reward_std": 0.08345289528369904, "rewards/VisualizationJSONCombinedORM/mean": 0.7055981755256653, "rewards/VisualizationJSONCombinedORM/std": 0.152944877743721, "step": 5196, "train_speed(iter/s)": 0.085601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 356.5, "completions/min_length": 243.0, "epoch": 4.29859387923904, "grad_norm": 0.20789207518100739, "kl": 0.0751953125, "learning_rate": 5.876743103227217e-07, "loss": 0.0007522329688072205, "memory(GiB)": 38.13, "reward": 0.5386656522750854, "reward_std": 0.04353372007608414, "rewards/VisualizationJSONCombinedORM/mean": 0.5386656522750854, "rewards/VisualizationJSONCombinedORM/std": 0.07983605563640594, "step": 5197, "train_speed(iter/s)": 0.085581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 298.6875, "completions/min_length": 254.0, "epoch": 4.299421009098428, "grad_norm": 0.1926046907901764, "kl": 0.05615234375, "learning_rate": 5.863168334252695e-07, "loss": 0.0005602240562438965, "memory(GiB)": 38.13, "reward": 0.6417815685272217, "reward_std": 0.03973338380455971, "rewards/VisualizationJSONCombinedORM/mean": 0.6417815685272217, "rewards/VisualizationJSONCombinedORM/std": 0.20155906677246094, "step": 5198, "train_speed(iter/s)": 0.085566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 320.375, "completions/min_length": 266.0, "epoch": 4.300248138957817, "grad_norm": 0.16589617729187012, "kl": 0.09637451171875, "learning_rate": 5.849608285118147e-07, "loss": 0.0009636655449867249, "memory(GiB)": 38.13, "reward": 0.6361158490180969, "reward_std": 0.03333652392029762, "rewards/VisualizationJSONCombinedORM/mean": 0.6361158490180969, "rewards/VisualizationJSONCombinedORM/std": 0.08969534933567047, "step": 5199, "train_speed(iter/s)": 0.085547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 315.125, "completions/min_length": 254.0, "epoch": 4.301075268817204, "grad_norm": 0.18097707629203796, "kl": 0.15771484375, "learning_rate": 5.836062960345878e-07, "loss": 0.0015791580080986023, "memory(GiB)": 38.13, "reward": 0.3854284882545471, "reward_std": 0.02746449038386345, "rewards/VisualizationJSONCombinedORM/mean": 0.3854284882545471, "rewards/VisualizationJSONCombinedORM/std": 0.07970710843801498, "step": 5200, "train_speed(iter/s)": 0.085516 }, { "epoch": 4.301075268817204, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 367.75, "eval_completions/mean_length": 311.1927083333333, "eval_completions/min_length": 259.2916666666667, "eval_kl": 0.08880615234375, "eval_loss": 0.0008836736087687314, "eval_reward": 0.45848549840350944, "eval_reward_std": 0.05917525380694618, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45848549840350944, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05917525400097171, "eval_runtime": 313.7011, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 5200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 276.5625, "completions/min_length": 217.0, "epoch": 4.301902398676592, "grad_norm": 0.21574506163597107, "kl": 0.041748046875, "learning_rate": 5.822532364453365e-07, "loss": 0.0004169698804616928, "memory(GiB)": 38.13, "reward": 0.3344666659832001, "reward_std": 0.03601952642202377, "rewards/VisualizationJSONCombinedORM/mean": 0.3344666659832001, "rewards/VisualizationJSONCombinedORM/std": 0.10942325741052628, "step": 5201, "train_speed(iter/s)": 0.085058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 314.5625, "completions/min_length": 268.0, "epoch": 4.302729528535981, "grad_norm": 0.1981251984834671, "kl": 0.05438232421875, "learning_rate": 5.80901650195309e-07, "loss": 0.0005455240607261658, "memory(GiB)": 38.13, "reward": 0.7143098711967468, "reward_std": 0.05159405618906021, "rewards/VisualizationJSONCombinedORM/mean": 0.7143098711967468, "rewards/VisualizationJSONCombinedORM/std": 0.08632229268550873, "step": 5202, "train_speed(iter/s)": 0.085042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 302.9375, "completions/min_length": 225.0, "epoch": 4.303556658395368, "grad_norm": 0.18859341740608215, "kl": 0.078369140625, "learning_rate": 5.795515377352673e-07, "loss": 0.0007853098213672638, "memory(GiB)": 38.13, "reward": 0.7219449281692505, "reward_std": 0.0715978592634201, "rewards/VisualizationJSONCombinedORM/mean": 0.7219449281692505, "rewards/VisualizationJSONCombinedORM/std": 0.08546585589647293, "step": 5203, "train_speed(iter/s)": 0.085021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 303.0, "completions/min_length": 243.0, "epoch": 4.304383788254756, "grad_norm": 0.17647987604141235, "kl": 0.1285400390625, "learning_rate": 5.782028995154792e-07, "loss": 0.0012868233025074005, "memory(GiB)": 38.13, "reward": 0.3899475336074829, "reward_std": 0.03794971853494644, "rewards/VisualizationJSONCombinedORM/mean": 0.3899475336074829, "rewards/VisualizationJSONCombinedORM/std": 0.24606280028820038, "step": 5204, "train_speed(iter/s)": 0.085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 343.375, "completions/min_length": 261.0, "epoch": 4.305210918114144, "grad_norm": 0.2570165693759918, "kl": 0.114990234375, "learning_rate": 5.768557359857241e-07, "loss": 0.0011508427560329437, "memory(GiB)": 38.13, "reward": 0.26517218351364136, "reward_std": 0.034598395228385925, "rewards/VisualizationJSONCombinedORM/mean": 0.26517218351364136, "rewards/VisualizationJSONCombinedORM/std": 0.08740060776472092, "step": 5205, "train_speed(iter/s)": 0.084979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 332.3125, "completions/min_length": 226.0, "epoch": 4.306038047973532, "grad_norm": 0.1895744651556015, "kl": 0.05078125, "learning_rate": 5.75510047595283e-07, "loss": 0.0005075447261333466, "memory(GiB)": 38.13, "reward": 0.4505007266998291, "reward_std": 0.04113544896245003, "rewards/VisualizationJSONCombinedORM/mean": 0.4505007266998291, "rewards/VisualizationJSONCombinedORM/std": 0.16645777225494385, "step": 5206, "train_speed(iter/s)": 0.084959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 300.75, "completions/min_length": 224.0, "epoch": 4.30686517783292, "grad_norm": 0.18297377228736877, "kl": 0.1011962890625, "learning_rate": 5.741658347929541e-07, "loss": 0.001010790467262268, "memory(GiB)": 38.13, "reward": 0.8232877850532532, "reward_std": 0.08343628793954849, "rewards/VisualizationJSONCombinedORM/mean": 0.8232877850532532, "rewards/VisualizationJSONCombinedORM/std": 0.10972417145967484, "step": 5207, "train_speed(iter/s)": 0.084941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 312.3125, "completions/min_length": 256.0, "epoch": 4.3076923076923075, "grad_norm": 0.21143747866153717, "kl": 0.05535888671875, "learning_rate": 5.728230980270355e-07, "loss": 0.0005537569522857666, "memory(GiB)": 38.13, "reward": 0.5693778991699219, "reward_std": 0.07059014588594437, "rewards/VisualizationJSONCombinedORM/mean": 0.5693778991699219, "rewards/VisualizationJSONCombinedORM/std": 0.11359845846891403, "step": 5208, "train_speed(iter/s)": 0.084921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 301.5, "completions/min_length": 215.0, "epoch": 4.308519437551696, "grad_norm": 0.19117236137390137, "kl": 0.05035400390625, "learning_rate": 5.714818377453374e-07, "loss": 0.0005035325884819031, "memory(GiB)": 38.13, "reward": 0.6426326036453247, "reward_std": 0.06949668377637863, "rewards/VisualizationJSONCombinedORM/mean": 0.6426326036453247, "rewards/VisualizationJSONCombinedORM/std": 0.1129423975944519, "step": 5209, "train_speed(iter/s)": 0.084901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 307.375, "completions/min_length": 252.0, "epoch": 4.309346567411083, "grad_norm": 0.2909412682056427, "kl": 0.05963134765625, "learning_rate": 5.701420543951757e-07, "loss": 0.0005950666964054108, "memory(GiB)": 38.13, "reward": 0.5033729672431946, "reward_std": 0.08069800585508347, "rewards/VisualizationJSONCombinedORM/mean": 0.5033729672431946, "rewards/VisualizationJSONCombinedORM/std": 0.08271085470914841, "step": 5210, "train_speed(iter/s)": 0.084882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 328.9375, "completions/min_length": 237.0, "epoch": 4.310173697270471, "grad_norm": 0.24950949847698212, "kl": 0.0992431640625, "learning_rate": 5.688037484233766e-07, "loss": 0.0009945370256900787, "memory(GiB)": 38.13, "reward": 0.3532721996307373, "reward_std": 0.0585874579846859, "rewards/VisualizationJSONCombinedORM/mean": 0.3532721996307373, "rewards/VisualizationJSONCombinedORM/std": 0.13439162075519562, "step": 5211, "train_speed(iter/s)": 0.084868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 288.4375, "completions/min_length": 210.0, "epoch": 4.31100082712986, "grad_norm": 0.17647188901901245, "kl": 0.06365966796875, "learning_rate": 5.674669202762684e-07, "loss": 0.0006372183561325073, "memory(GiB)": 38.13, "reward": 0.5004329085350037, "reward_std": 0.04471641033887863, "rewards/VisualizationJSONCombinedORM/mean": 0.5004329085350037, "rewards/VisualizationJSONCombinedORM/std": 0.1077096164226532, "step": 5212, "train_speed(iter/s)": 0.084849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 303.4375, "completions/min_length": 255.0, "epoch": 4.311827956989247, "grad_norm": 0.22455109655857086, "kl": 0.1004638671875, "learning_rate": 5.661315703996905e-07, "loss": 0.0010038837790489197, "memory(GiB)": 38.13, "reward": 0.6213820576667786, "reward_std": 0.0648496225476265, "rewards/VisualizationJSONCombinedORM/mean": 0.6213820576667786, "rewards/VisualizationJSONCombinedORM/std": 0.09008011966943741, "step": 5213, "train_speed(iter/s)": 0.084835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 315.3125, "completions/min_length": 243.0, "epoch": 4.312655086848635, "grad_norm": 0.21028316020965576, "kl": 0.12841796875, "learning_rate": 5.647976992389892e-07, "loss": 0.0012841075658798218, "memory(GiB)": 38.13, "reward": 0.653937816619873, "reward_std": 0.07846073806285858, "rewards/VisualizationJSONCombinedORM/mean": 0.653937816619873, "rewards/VisualizationJSONCombinedORM/std": 0.07636822760105133, "step": 5214, "train_speed(iter/s)": 0.084808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 307.125, "completions/min_length": 272.0, "epoch": 4.313482216708024, "grad_norm": 0.184696763753891, "kl": 0.05712890625, "learning_rate": 5.634653072390167e-07, "loss": 0.0005696713924407959, "memory(GiB)": 38.13, "reward": 0.4522039294242859, "reward_std": 0.028980985283851624, "rewards/VisualizationJSONCombinedORM/mean": 0.4522039294242859, "rewards/VisualizationJSONCombinedORM/std": 0.07103210687637329, "step": 5215, "train_speed(iter/s)": 0.084788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 307.125, "completions/min_length": 253.0, "epoch": 4.314309346567411, "grad_norm": 0.22427994012832642, "kl": 0.0909423828125, "learning_rate": 5.621343948441299e-07, "loss": 0.0009094774723052979, "memory(GiB)": 38.13, "reward": 0.8003705739974976, "reward_std": 0.051686421036720276, "rewards/VisualizationJSONCombinedORM/mean": 0.8003705739974976, "rewards/VisualizationJSONCombinedORM/std": 0.056070804595947266, "step": 5216, "train_speed(iter/s)": 0.08477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 330.5625, "completions/min_length": 254.0, "epoch": 4.315136476426799, "grad_norm": 0.1719503402709961, "kl": 0.079345703125, "learning_rate": 5.608049624981982e-07, "loss": 0.0007925545796751976, "memory(GiB)": 38.13, "reward": 0.49472686648368835, "reward_std": 0.04731755331158638, "rewards/VisualizationJSONCombinedORM/mean": 0.49472686648368835, "rewards/VisualizationJSONCombinedORM/std": 0.17881077527999878, "step": 5217, "train_speed(iter/s)": 0.084754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 309.4375, "completions/min_length": 256.0, "epoch": 4.315963606286187, "grad_norm": 0.1787538081407547, "kl": 0.077392578125, "learning_rate": 5.594770106445896e-07, "loss": 0.0007734298706054688, "memory(GiB)": 38.13, "reward": 0.5660102367401123, "reward_std": 0.06333136558532715, "rewards/VisualizationJSONCombinedORM/mean": 0.5660102367401123, "rewards/VisualizationJSONCombinedORM/std": 0.18234743177890778, "step": 5218, "train_speed(iter/s)": 0.084736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 329.125, "completions/min_length": 270.0, "epoch": 4.316790736145575, "grad_norm": 0.2451874315738678, "kl": 0.07757568359375, "learning_rate": 5.581505397261844e-07, "loss": 0.0007769465446472168, "memory(GiB)": 38.13, "reward": 0.5011085271835327, "reward_std": 0.04677803814411163, "rewards/VisualizationJSONCombinedORM/mean": 0.5011085271835327, "rewards/VisualizationJSONCombinedORM/std": 0.201048344373703, "step": 5219, "train_speed(iter/s)": 0.084722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 315.4375, "completions/min_length": 255.0, "epoch": 4.317617866004963, "grad_norm": 0.18829700350761414, "kl": 0.0484619140625, "learning_rate": 5.568255501853664e-07, "loss": 0.0004839003086090088, "memory(GiB)": 38.13, "reward": 0.5219789743423462, "reward_std": 0.05908765271306038, "rewards/VisualizationJSONCombinedORM/mean": 0.5219789743423462, "rewards/VisualizationJSONCombinedORM/std": 0.0729159265756607, "step": 5220, "train_speed(iter/s)": 0.084702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 342.1875, "completions/min_length": 283.0, "epoch": 4.3184449958643505, "grad_norm": 0.17684485018253326, "kl": 0.0477294921875, "learning_rate": 5.555020424640267e-07, "loss": 0.00047628581523895264, "memory(GiB)": 38.13, "reward": 0.826202929019928, "reward_std": 0.07143209874629974, "rewards/VisualizationJSONCombinedORM/mean": 0.826202929019928, "rewards/VisualizationJSONCombinedORM/std": 0.07602807879447937, "step": 5221, "train_speed(iter/s)": 0.084679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 303.75, "completions/min_length": 228.0, "epoch": 4.319272125723739, "grad_norm": 0.20050513744354248, "kl": 0.07342529296875, "learning_rate": 5.541800170035582e-07, "loss": 0.0007340610027313232, "memory(GiB)": 38.13, "reward": 0.5184363126754761, "reward_std": 0.04209689795970917, "rewards/VisualizationJSONCombinedORM/mean": 0.5184363126754761, "rewards/VisualizationJSONCombinedORM/std": 0.0847577303647995, "step": 5222, "train_speed(iter/s)": 0.084665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 296.375, "completions/min_length": 218.0, "epoch": 4.320099255583127, "grad_norm": 0.17911657691001892, "kl": 0.05377197265625, "learning_rate": 5.528594742448667e-07, "loss": 0.0005391091108322144, "memory(GiB)": 38.13, "reward": 0.4988005757331848, "reward_std": 0.05099352449178696, "rewards/VisualizationJSONCombinedORM/mean": 0.4988005757331848, "rewards/VisualizationJSONCombinedORM/std": 0.07754756510257721, "step": 5223, "train_speed(iter/s)": 0.084647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 290.6875, "completions/min_length": 239.0, "epoch": 4.320926385442514, "grad_norm": 0.2481113076210022, "kl": 0.125732421875, "learning_rate": 5.515404146283571e-07, "loss": 0.001257479190826416, "memory(GiB)": 38.13, "reward": 0.5942848324775696, "reward_std": 0.0682520866394043, "rewards/VisualizationJSONCombinedORM/mean": 0.5942848324775696, "rewards/VisualizationJSONCombinedORM/std": 0.08733291923999786, "step": 5224, "train_speed(iter/s)": 0.084622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 282.875, "completions/min_length": 229.0, "epoch": 4.321753515301903, "grad_norm": 0.20228981971740723, "kl": 0.0732421875, "learning_rate": 5.502228385939418e-07, "loss": 0.0007314179092645645, "memory(GiB)": 38.13, "reward": 0.5692470073699951, "reward_std": 0.1021791473031044, "rewards/VisualizationJSONCombinedORM/mean": 0.5692470073699951, "rewards/VisualizationJSONCombinedORM/std": 0.25998544692993164, "step": 5225, "train_speed(iter/s)": 0.084605 }, { "epoch": 4.321753515301903, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 386.4166666666667, "eval_completions/mean_length": 312.9114583333333, "eval_completions/min_length": 257.9583333333333, "eval_kl": 0.10396830240885417, "eval_loss": 0.001036314875818789, "eval_reward": 0.4556057049582402, "eval_reward_std": 0.06216793610171104, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4556057049582402, "eval_rewards/VisualizationJSONCombinedORM/std": 0.062167937537500016, "eval_runtime": 324.7302, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.009, "step": 5225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 334.5, "completions/min_length": 259.0, "epoch": 4.32258064516129, "grad_norm": 0.23619650304317474, "kl": 0.06109619140625, "learning_rate": 5.489067465810394e-07, "loss": 0.0006113424897193909, "memory(GiB)": 38.13, "reward": 0.431892454624176, "reward_std": 0.029027536511421204, "rewards/VisualizationJSONCombinedORM/mean": 0.431892454624176, "rewards/VisualizationJSONCombinedORM/std": 0.1753901243209839, "step": 5226, "train_speed(iter/s)": 0.084142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 317.125, "completions/min_length": 253.0, "epoch": 4.323407775020678, "grad_norm": 0.22771835327148438, "kl": 0.0450439453125, "learning_rate": 5.475921390285732e-07, "loss": 0.0004501808434724808, "memory(GiB)": 38.13, "reward": 0.5992292761802673, "reward_std": 0.07491633296012878, "rewards/VisualizationJSONCombinedORM/mean": 0.5992292761802673, "rewards/VisualizationJSONCombinedORM/std": 0.09165935963392258, "step": 5227, "train_speed(iter/s)": 0.084116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 319.8125, "completions/min_length": 222.0, "epoch": 4.324234904880067, "grad_norm": 0.17294850945472717, "kl": 0.08599853515625, "learning_rate": 5.462790163749693e-07, "loss": 0.0008587278425693512, "memory(GiB)": 38.13, "reward": 0.44086897373199463, "reward_std": 0.12974096834659576, "rewards/VisualizationJSONCombinedORM/mean": 0.44086897373199463, "rewards/VisualizationJSONCombinedORM/std": 0.19194622337818146, "step": 5228, "train_speed(iter/s)": 0.084099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 270.0625, "completions/min_length": 222.0, "epoch": 4.325062034739454, "grad_norm": 0.22261767089366913, "kl": 0.08624267578125, "learning_rate": 5.449673790581611e-07, "loss": 0.0008620359003543854, "memory(GiB)": 38.13, "reward": 0.30211880803108215, "reward_std": 0.05129098892211914, "rewards/VisualizationJSONCombinedORM/mean": 0.30211880803108215, "rewards/VisualizationJSONCombinedORM/std": 0.05427516996860504, "step": 5229, "train_speed(iter/s)": 0.084084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 282.5625, "completions/min_length": 223.0, "epoch": 4.325889164598842, "grad_norm": 0.2530825138092041, "kl": 0.06756591796875, "learning_rate": 5.43657227515586e-07, "loss": 0.0006757043302059174, "memory(GiB)": 38.13, "reward": 0.39104753732681274, "reward_std": 0.05396823585033417, "rewards/VisualizationJSONCombinedORM/mean": 0.39104753732681274, "rewards/VisualizationJSONCombinedORM/std": 0.08861412853002548, "step": 5230, "train_speed(iter/s)": 0.08406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 305.0625, "completions/min_length": 234.0, "epoch": 4.32671629445823, "grad_norm": 0.21160823106765747, "kl": 0.103759765625, "learning_rate": 5.423485621841862e-07, "loss": 0.001037701964378357, "memory(GiB)": 38.13, "reward": 0.5745106935501099, "reward_std": 0.047232672572135925, "rewards/VisualizationJSONCombinedORM/mean": 0.5745106935501099, "rewards/VisualizationJSONCombinedORM/std": 0.17592452466487885, "step": 5231, "train_speed(iter/s)": 0.08404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 319.375, "completions/min_length": 271.0, "epoch": 4.327543424317618, "grad_norm": 0.19759917259216309, "kl": 0.06231689453125, "learning_rate": 5.410413835004052e-07, "loss": 0.0006226152181625366, "memory(GiB)": 38.13, "reward": 0.4307665228843689, "reward_std": 0.050288476049900055, "rewards/VisualizationJSONCombinedORM/mean": 0.4307665228843689, "rewards/VisualizationJSONCombinedORM/std": 0.11071173846721649, "step": 5232, "train_speed(iter/s)": 0.08402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 337.4375, "completions/min_length": 260.0, "epoch": 4.328370554177006, "grad_norm": 0.20981940627098083, "kl": 0.174560546875, "learning_rate": 5.397356919001973e-07, "loss": 0.0017454251646995544, "memory(GiB)": 38.13, "reward": 0.5499274730682373, "reward_std": 0.07279307395219803, "rewards/VisualizationJSONCombinedORM/mean": 0.5499274730682373, "rewards/VisualizationJSONCombinedORM/std": 0.18215343356132507, "step": 5233, "train_speed(iter/s)": 0.083998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 309.0625, "completions/min_length": 250.0, "epoch": 4.3291976840363935, "grad_norm": 0.28800666332244873, "kl": 0.078857421875, "learning_rate": 5.384314878190133e-07, "loss": 0.0007879361510276794, "memory(GiB)": 38.13, "reward": 0.6023410558700562, "reward_std": 0.06286624073982239, "rewards/VisualizationJSONCombinedORM/mean": 0.6023410558700562, "rewards/VisualizationJSONCombinedORM/std": 0.06463737785816193, "step": 5234, "train_speed(iter/s)": 0.083984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 305.3125, "completions/min_length": 220.0, "epoch": 4.330024813895782, "grad_norm": 0.18833720684051514, "kl": 0.07427978515625, "learning_rate": 5.371287716918128e-07, "loss": 0.0007435865700244904, "memory(GiB)": 38.13, "reward": 0.5454850196838379, "reward_std": 0.07886102795600891, "rewards/VisualizationJSONCombinedORM/mean": 0.5454850196838379, "rewards/VisualizationJSONCombinedORM/std": 0.07774283736944199, "step": 5235, "train_speed(iter/s)": 0.083967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 301.25, "completions/min_length": 232.0, "epoch": 4.330851943755169, "grad_norm": 0.30799877643585205, "kl": 0.132568359375, "learning_rate": 5.358275439530574e-07, "loss": 0.0013254005461931229, "memory(GiB)": 38.13, "reward": 0.5343369245529175, "reward_std": 0.07326194643974304, "rewards/VisualizationJSONCombinedORM/mean": 0.5343369245529175, "rewards/VisualizationJSONCombinedORM/std": 0.20130757987499237, "step": 5236, "train_speed(iter/s)": 0.083954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 325.625, "completions/min_length": 249.0, "epoch": 4.331679073614557, "grad_norm": 0.19640704989433289, "kl": 0.0546875, "learning_rate": 5.345278050367142e-07, "loss": 0.0005449652671813965, "memory(GiB)": 38.13, "reward": 0.8152934312820435, "reward_std": 0.05170094221830368, "rewards/VisualizationJSONCombinedORM/mean": 0.8152934312820435, "rewards/VisualizationJSONCombinedORM/std": 0.051108911633491516, "step": 5237, "train_speed(iter/s)": 0.083933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 329.75, "completions/min_length": 264.0, "epoch": 4.332506203473946, "grad_norm": 0.17335344851016998, "kl": 0.1190185546875, "learning_rate": 5.332295553762479e-07, "loss": 0.001189209520816803, "memory(GiB)": 38.13, "reward": 0.57081139087677, "reward_std": 0.03991486877202988, "rewards/VisualizationJSONCombinedORM/mean": 0.57081139087677, "rewards/VisualizationJSONCombinedORM/std": 0.03982637822628021, "step": 5238, "train_speed(iter/s)": 0.083919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 310.0625, "completions/min_length": 244.0, "epoch": 4.333333333333333, "grad_norm": 0.16579638421535492, "kl": 0.074462890625, "learning_rate": 5.31932795404636e-07, "loss": 0.0007446669042110443, "memory(GiB)": 38.13, "reward": 0.565064549446106, "reward_std": 0.05372926592826843, "rewards/VisualizationJSONCombinedORM/mean": 0.565064549446106, "rewards/VisualizationJSONCombinedORM/std": 0.07412982732057571, "step": 5239, "train_speed(iter/s)": 0.083909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 333.625, "completions/min_length": 252.0, "epoch": 4.334160463192721, "grad_norm": 0.19907231628894806, "kl": 0.0426025390625, "learning_rate": 5.306375255543511e-07, "loss": 0.00042644888162612915, "memory(GiB)": 38.13, "reward": 0.6760549545288086, "reward_std": 0.0530584454536438, "rewards/VisualizationJSONCombinedORM/mean": 0.6760549545288086, "rewards/VisualizationJSONCombinedORM/std": 0.10142327100038528, "step": 5240, "train_speed(iter/s)": 0.083887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 319.125, "completions/min_length": 216.0, "epoch": 4.33498759305211, "grad_norm": 0.18626128137111664, "kl": 0.0999755859375, "learning_rate": 5.293437462573725e-07, "loss": 0.0010011233389377594, "memory(GiB)": 38.13, "reward": 0.6899363994598389, "reward_std": 0.09150323271751404, "rewards/VisualizationJSONCombinedORM/mean": 0.6899363994598389, "rewards/VisualizationJSONCombinedORM/std": 0.09648336470127106, "step": 5241, "train_speed(iter/s)": 0.083866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 305.875, "completions/min_length": 233.0, "epoch": 4.335814722911497, "grad_norm": 0.17961059510707855, "kl": 0.0380859375, "learning_rate": 5.280514579451812e-07, "loss": 0.000380493700504303, "memory(GiB)": 38.13, "reward": 0.6927772164344788, "reward_std": 0.05077254772186279, "rewards/VisualizationJSONCombinedORM/mean": 0.6927772164344788, "rewards/VisualizationJSONCombinedORM/std": 0.0844884067773819, "step": 5242, "train_speed(iter/s)": 0.083844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 303.0, "completions/min_length": 232.0, "epoch": 4.336641852770885, "grad_norm": 0.1938067376613617, "kl": 0.115234375, "learning_rate": 5.267606610487624e-07, "loss": 0.0011529773473739624, "memory(GiB)": 38.13, "reward": 0.5047380924224854, "reward_std": 0.05753753334283829, "rewards/VisualizationJSONCombinedORM/mean": 0.5047380924224854, "rewards/VisualizationJSONCombinedORM/std": 0.0707639828324318, "step": 5243, "train_speed(iter/s)": 0.083822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 333.4375, "completions/min_length": 288.0, "epoch": 4.337468982630273, "grad_norm": 0.1900278925895691, "kl": 0.06683349609375, "learning_rate": 5.254713559986019e-07, "loss": 0.0006686747074127197, "memory(GiB)": 38.13, "reward": 0.6352169513702393, "reward_std": 0.05893971771001816, "rewards/VisualizationJSONCombinedORM/mean": 0.6352169513702393, "rewards/VisualizationJSONCombinedORM/std": 0.1461675763130188, "step": 5244, "train_speed(iter/s)": 0.083804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 321.125, "completions/min_length": 250.0, "epoch": 4.338296112489661, "grad_norm": 0.19578450918197632, "kl": 0.0576171875, "learning_rate": 5.241835432246888e-07, "loss": 0.000576704740524292, "memory(GiB)": 38.13, "reward": 0.5548156499862671, "reward_std": 0.04691781848669052, "rewards/VisualizationJSONCombinedORM/mean": 0.5548156499862671, "rewards/VisualizationJSONCombinedORM/std": 0.21140453219413757, "step": 5245, "train_speed(iter/s)": 0.083783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 306.5625, "completions/min_length": 215.0, "epoch": 4.339123242349049, "grad_norm": 0.17974895238876343, "kl": 0.0885009765625, "learning_rate": 5.228972231565155e-07, "loss": 0.0008861534297466278, "memory(GiB)": 38.13, "reward": 0.6485353708267212, "reward_std": 0.08049342036247253, "rewards/VisualizationJSONCombinedORM/mean": 0.6485353708267212, "rewards/VisualizationJSONCombinedORM/std": 0.12603265047073364, "step": 5246, "train_speed(iter/s)": 0.083761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 295.375, "completions/min_length": 236.0, "epoch": 4.3399503722084365, "grad_norm": 0.20629948377609253, "kl": 0.05804443359375, "learning_rate": 5.216123962230763e-07, "loss": 0.000580132007598877, "memory(GiB)": 38.13, "reward": 0.8039494156837463, "reward_std": 0.10113844275474548, "rewards/VisualizationJSONCombinedORM/mean": 0.8039494156837463, "rewards/VisualizationJSONCombinedORM/std": 0.1282004863023758, "step": 5247, "train_speed(iter/s)": 0.083743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 307.125, "completions/min_length": 243.0, "epoch": 4.340777502067825, "grad_norm": 0.17582957446575165, "kl": 0.04498291015625, "learning_rate": 5.20329062852864e-07, "loss": 0.0004503075033426285, "memory(GiB)": 38.13, "reward": 0.7497948408126831, "reward_std": 0.04204254224896431, "rewards/VisualizationJSONCombinedORM/mean": 0.7497948408126831, "rewards/VisualizationJSONCombinedORM/std": 0.09582030028104782, "step": 5248, "train_speed(iter/s)": 0.083718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 290.0625, "completions/min_length": 227.0, "epoch": 4.341604631927213, "grad_norm": 0.17420324683189392, "kl": 0.060302734375, "learning_rate": 5.190472234738809e-07, "loss": 0.0006021559238433838, "memory(GiB)": 38.13, "reward": 0.7113610506057739, "reward_std": 0.051700253039598465, "rewards/VisualizationJSONCombinedORM/mean": 0.7113610506057739, "rewards/VisualizationJSONCombinedORM/std": 0.11094541847705841, "step": 5249, "train_speed(iter/s)": 0.083695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 293.875, "completions/min_length": 232.0, "epoch": 4.3424317617866, "grad_norm": 0.16407355666160583, "kl": 0.06500244140625, "learning_rate": 5.177668785136225e-07, "loss": 0.0006512477993965149, "memory(GiB)": 38.13, "reward": 0.6123833656311035, "reward_std": 0.05449230968952179, "rewards/VisualizationJSONCombinedORM/mean": 0.6123833656311035, "rewards/VisualizationJSONCombinedORM/std": 0.0786355510354042, "step": 5250, "train_speed(iter/s)": 0.083677 }, { "epoch": 4.3424317617866, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 368.875, "eval_completions/mean_length": 310.0, "eval_completions/min_length": 255.95833333333334, "eval_kl": 0.09327189127604167, "eval_loss": 0.0009306902065873146, "eval_reward": 0.4645705595612526, "eval_reward_std": 0.05011631750191251, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4645705595612526, "eval_rewards/VisualizationJSONCombinedORM/std": 0.050116317540717624, "eval_runtime": 314.3306, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 5250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 329.9375, "completions/min_length": 255.0, "epoch": 4.343258891645989, "grad_norm": 0.19641469419002533, "kl": 0.0809326171875, "learning_rate": 5.16488028399092e-07, "loss": 0.0008094701915979385, "memory(GiB)": 38.13, "reward": 0.5948103666305542, "reward_std": 0.0799742341041565, "rewards/VisualizationJSONCombinedORM/mean": 0.5948103666305542, "rewards/VisualizationJSONCombinedORM/std": 0.10424217581748962, "step": 5251, "train_speed(iter/s)": 0.083245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 295.6875, "completions/min_length": 241.0, "epoch": 4.344086021505376, "grad_norm": 0.1751062124967575, "kl": 0.1741943359375, "learning_rate": 5.152106735567908e-07, "loss": 0.0017413944005966187, "memory(GiB)": 38.13, "reward": 0.2717476487159729, "reward_std": 0.027242381125688553, "rewards/VisualizationJSONCombinedORM/mean": 0.2717476487159729, "rewards/VisualizationJSONCombinedORM/std": 0.09234275668859482, "step": 5252, "train_speed(iter/s)": 0.083228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 314.0625, "completions/min_length": 238.0, "epoch": 4.344913151364764, "grad_norm": 0.1584862768650055, "kl": 0.06744384765625, "learning_rate": 5.139348144127237e-07, "loss": 0.0006764531135559082, "memory(GiB)": 38.13, "reward": 0.5403362512588501, "reward_std": 0.03669646382331848, "rewards/VisualizationJSONCombinedORM/mean": 0.5403362512588501, "rewards/VisualizationJSONCombinedORM/std": 0.03897827863693237, "step": 5253, "train_speed(iter/s)": 0.083207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 282.4375, "completions/min_length": 209.0, "epoch": 4.345740281224153, "grad_norm": 0.176660418510437, "kl": 0.1468505859375, "learning_rate": 5.126604513923938e-07, "loss": 0.0014697164297103882, "memory(GiB)": 38.13, "reward": 0.6162517070770264, "reward_std": 0.06555518507957458, "rewards/VisualizationJSONCombinedORM/mean": 0.6162517070770264, "rewards/VisualizationJSONCombinedORM/std": 0.22761690616607666, "step": 5254, "train_speed(iter/s)": 0.083186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 331.9375, "completions/min_length": 288.0, "epoch": 4.34656741108354, "grad_norm": 0.19431886076927185, "kl": 0.076416015625, "learning_rate": 5.1138758492081e-07, "loss": 0.0007644221186637878, "memory(GiB)": 38.13, "reward": 0.37793970108032227, "reward_std": 0.02936946414411068, "rewards/VisualizationJSONCombinedORM/mean": 0.37793970108032227, "rewards/VisualizationJSONCombinedORM/std": 0.03324202075600624, "step": 5255, "train_speed(iter/s)": 0.083169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 309.9375, "completions/min_length": 216.0, "epoch": 4.347394540942928, "grad_norm": 0.18019378185272217, "kl": 0.03118896484375, "learning_rate": 5.10116215422477e-07, "loss": 0.00031181052327156067, "memory(GiB)": 38.13, "reward": 0.5991865396499634, "reward_std": 0.04876177757978439, "rewards/VisualizationJSONCombinedORM/mean": 0.5991865396499634, "rewards/VisualizationJSONCombinedORM/std": 0.25481024384498596, "step": 5256, "train_speed(iter/s)": 0.083152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 290.5625, "completions/min_length": 226.0, "epoch": 4.348221670802316, "grad_norm": 0.18477997183799744, "kl": 0.10693359375, "learning_rate": 5.08846343321403e-07, "loss": 0.0010709911584854126, "memory(GiB)": 38.13, "reward": 0.7112419605255127, "reward_std": 0.056597255170345306, "rewards/VisualizationJSONCombinedORM/mean": 0.7112419605255127, "rewards/VisualizationJSONCombinedORM/std": 0.13603830337524414, "step": 5257, "train_speed(iter/s)": 0.083128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 304.875, "completions/min_length": 222.0, "epoch": 4.349048800661704, "grad_norm": 0.1753135472536087, "kl": 0.07415771484375, "learning_rate": 5.07577969041097e-07, "loss": 0.0007421951740980148, "memory(GiB)": 38.13, "reward": 0.5687987804412842, "reward_std": 0.0635833740234375, "rewards/VisualizationJSONCombinedORM/mean": 0.5687987804412842, "rewards/VisualizationJSONCombinedORM/std": 0.0792325884103775, "step": 5258, "train_speed(iter/s)": 0.083116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 325.875, "completions/min_length": 256.0, "epoch": 4.349875930521092, "grad_norm": 0.2025226652622223, "kl": 0.0965576171875, "learning_rate": 5.063110930045679e-07, "loss": 0.0009662136435508728, "memory(GiB)": 38.13, "reward": 0.6426202058792114, "reward_std": 0.04786810278892517, "rewards/VisualizationJSONCombinedORM/mean": 0.6426202058792114, "rewards/VisualizationJSONCombinedORM/std": 0.192559152841568, "step": 5259, "train_speed(iter/s)": 0.083096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 304.25, "completions/min_length": 233.0, "epoch": 4.3507030603804795, "grad_norm": 0.3967421352863312, "kl": 0.064697265625, "learning_rate": 5.050457156343225e-07, "loss": 0.0006472095847129822, "memory(GiB)": 38.13, "reward": 0.4909161627292633, "reward_std": 0.04312751442193985, "rewards/VisualizationJSONCombinedORM/mean": 0.4909161627292633, "rewards/VisualizationJSONCombinedORM/std": 0.06531817466020584, "step": 5260, "train_speed(iter/s)": 0.083083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 279.125, "completions/min_length": 230.0, "epoch": 4.351530190239868, "grad_norm": 0.1642298400402069, "kl": 0.04754638671875, "learning_rate": 5.037818373523723e-07, "loss": 0.00047537684440612793, "memory(GiB)": 38.13, "reward": 0.6465467214584351, "reward_std": 0.08806087076663971, "rewards/VisualizationJSONCombinedORM/mean": 0.6465467214584351, "rewards/VisualizationJSONCombinedORM/std": 0.10649334639310837, "step": 5261, "train_speed(iter/s)": 0.083066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 292.3125, "completions/min_length": 254.0, "epoch": 4.352357320099255, "grad_norm": 0.19886666536331177, "kl": 0.1214599609375, "learning_rate": 5.025194585802262e-07, "loss": 0.0012129731476306915, "memory(GiB)": 38.13, "reward": 0.4744802713394165, "reward_std": 0.04963085800409317, "rewards/VisualizationJSONCombinedORM/mean": 0.4744802713394165, "rewards/VisualizationJSONCombinedORM/std": 0.2646452486515045, "step": 5262, "train_speed(iter/s)": 0.083048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 278.5, "completions/min_length": 213.0, "epoch": 4.353184449958643, "grad_norm": 0.17931519448757172, "kl": 0.02667236328125, "learning_rate": 5.012585797388936e-07, "loss": 0.0002673529088497162, "memory(GiB)": 38.13, "reward": 0.665982723236084, "reward_std": 0.03564681112766266, "rewards/VisualizationJSONCombinedORM/mean": 0.665982723236084, "rewards/VisualizationJSONCombinedORM/std": 0.13308440148830414, "step": 5263, "train_speed(iter/s)": 0.083029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 314.3125, "completions/min_length": 243.0, "epoch": 4.354011579818032, "grad_norm": 0.23317241668701172, "kl": 0.0806884765625, "learning_rate": 4.999992012488813e-07, "loss": 0.0008066780865192413, "memory(GiB)": 38.13, "reward": 0.3470534682273865, "reward_std": 0.033647868782281876, "rewards/VisualizationJSONCombinedORM/mean": 0.3470534682273865, "rewards/VisualizationJSONCombinedORM/std": 0.062184352427721024, "step": 5264, "train_speed(iter/s)": 0.083011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 325.4375, "completions/min_length": 269.0, "epoch": 4.354838709677419, "grad_norm": 0.17366757988929749, "kl": 0.054443359375, "learning_rate": 4.987413235302025e-07, "loss": 0.0005453154444694519, "memory(GiB)": 38.13, "reward": 0.593158483505249, "reward_std": 0.03651920706033707, "rewards/VisualizationJSONCombinedORM/mean": 0.593158483505249, "rewards/VisualizationJSONCombinedORM/std": 0.056393906474113464, "step": 5265, "train_speed(iter/s)": 0.082984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 295.8125, "completions/min_length": 241.0, "epoch": 4.355665839536807, "grad_norm": 0.14674164354801178, "kl": 0.04656982421875, "learning_rate": 4.974849470023607e-07, "loss": 0.00046597421169281006, "memory(GiB)": 38.13, "reward": 0.8277549147605896, "reward_std": 0.06521856784820557, "rewards/VisualizationJSONCombinedORM/mean": 0.8277549147605896, "rewards/VisualizationJSONCombinedORM/std": 0.07276815921068192, "step": 5266, "train_speed(iter/s)": 0.082969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 328.1875, "completions/min_length": 215.0, "epoch": 4.356492969396196, "grad_norm": 0.17277202010154724, "kl": 0.08056640625, "learning_rate": 4.962300720843655e-07, "loss": 0.0008070804178714752, "memory(GiB)": 38.13, "reward": 0.32504868507385254, "reward_std": 0.026140721514821053, "rewards/VisualizationJSONCombinedORM/mean": 0.32504868507385254, "rewards/VisualizationJSONCombinedORM/std": 0.029607610777020454, "step": 5267, "train_speed(iter/s)": 0.082942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 334.625, "completions/min_length": 253.0, "epoch": 4.357320099255583, "grad_norm": 0.21840141713619232, "kl": 0.1259765625, "learning_rate": 4.949766991947235e-07, "loss": 0.0012580081820487976, "memory(GiB)": 38.13, "reward": 0.42717427015304565, "reward_std": 0.023487096652388573, "rewards/VisualizationJSONCombinedORM/mean": 0.42717427015304565, "rewards/VisualizationJSONCombinedORM/std": 0.0883345752954483, "step": 5268, "train_speed(iter/s)": 0.082926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 298.4375, "completions/min_length": 226.0, "epoch": 4.358147229114971, "grad_norm": 0.19670364260673523, "kl": 0.04815673828125, "learning_rate": 4.937248287514407e-07, "loss": 0.00048120319843292236, "memory(GiB)": 38.13, "reward": 0.7296962738037109, "reward_std": 0.05394161492586136, "rewards/VisualizationJSONCombinedORM/mean": 0.7296962738037109, "rewards/VisualizationJSONCombinedORM/std": 0.1288454532623291, "step": 5269, "train_speed(iter/s)": 0.08291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 324.4375, "completions/min_length": 223.0, "epoch": 4.358974358974359, "grad_norm": 0.21904310584068298, "kl": 0.1070556640625, "learning_rate": 4.924744611720201e-07, "loss": 0.0010694842785596848, "memory(GiB)": 38.13, "reward": 0.5323029160499573, "reward_std": 0.04355234280228615, "rewards/VisualizationJSONCombinedORM/mean": 0.5323029160499573, "rewards/VisualizationJSONCombinedORM/std": 0.3181694447994232, "step": 5270, "train_speed(iter/s)": 0.082894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 326.0625, "completions/min_length": 268.0, "epoch": 4.359801488833747, "grad_norm": 0.19179052114486694, "kl": 0.110595703125, "learning_rate": 4.912255968734675e-07, "loss": 0.001104995608329773, "memory(GiB)": 38.13, "reward": 0.44171231985092163, "reward_std": 0.0645563006401062, "rewards/VisualizationJSONCombinedORM/mean": 0.44171231985092163, "rewards/VisualizationJSONCombinedORM/std": 0.11690478026866913, "step": 5271, "train_speed(iter/s)": 0.082876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 307.25, "completions/min_length": 228.0, "epoch": 4.360628618693135, "grad_norm": 0.19686222076416016, "kl": 0.04351806640625, "learning_rate": 4.899782362722827e-07, "loss": 0.0004355311393737793, "memory(GiB)": 38.13, "reward": 0.8069074749946594, "reward_std": 0.08986532688140869, "rewards/VisualizationJSONCombinedORM/mean": 0.8069074749946594, "rewards/VisualizationJSONCombinedORM/std": 0.10661319643259048, "step": 5272, "train_speed(iter/s)": 0.082855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 289.875, "completions/min_length": 219.0, "epoch": 4.3614557485525225, "grad_norm": 0.22449859976768494, "kl": 0.083984375, "learning_rate": 4.887323797844678e-07, "loss": 0.0008391477167606354, "memory(GiB)": 38.13, "reward": 0.49956998229026794, "reward_std": 0.09168482571840286, "rewards/VisualizationJSONCombinedORM/mean": 0.49956998229026794, "rewards/VisualizationJSONCombinedORM/std": 0.10980268567800522, "step": 5273, "train_speed(iter/s)": 0.082832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 308.6875, "completions/min_length": 241.0, "epoch": 4.362282878411911, "grad_norm": 0.1804114580154419, "kl": 0.203857421875, "learning_rate": 4.874880278255218e-07, "loss": 0.0020399242639541626, "memory(GiB)": 38.13, "reward": 0.4616504907608032, "reward_std": 0.04695790633559227, "rewards/VisualizationJSONCombinedORM/mean": 0.4616504907608032, "rewards/VisualizationJSONCombinedORM/std": 0.08123268932104111, "step": 5274, "train_speed(iter/s)": 0.082815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 272.75, "completions/min_length": 214.0, "epoch": 4.363110008271299, "grad_norm": 0.22101077437400818, "kl": 0.1064453125, "learning_rate": 4.862451808104419e-07, "loss": 0.0010645166039466858, "memory(GiB)": 38.13, "reward": 0.42735621333122253, "reward_std": 0.07441898435354233, "rewards/VisualizationJSONCombinedORM/mean": 0.42735621333122253, "rewards/VisualizationJSONCombinedORM/std": 0.18218573927879333, "step": 5275, "train_speed(iter/s)": 0.082791 }, { "epoch": 4.363110008271299, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 365.875, "eval_completions/mean_length": 310.203125, "eval_completions/min_length": 259.2916666666667, "eval_kl": 0.1086273193359375, "eval_loss": 0.0011104667792096734, "eval_reward": 0.46572874424358207, "eval_reward_std": 0.0529890451580286, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46572874424358207, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05298904605054607, "eval_runtime": 312.5094, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 5275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 288.0, "completions/min_length": 242.0, "epoch": 4.363937138130686, "grad_norm": 0.2076399177312851, "kl": 0.1068115234375, "learning_rate": 4.850038391537215e-07, "loss": 0.0010690391063690186, "memory(GiB)": 38.13, "reward": 0.41846269369125366, "reward_std": 0.058939773589372635, "rewards/VisualizationJSONCombinedORM/mean": 0.41846269369125366, "rewards/VisualizationJSONCombinedORM/std": 0.20796823501586914, "step": 5276, "train_speed(iter/s)": 0.082365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 313.875, "completions/min_length": 255.0, "epoch": 4.364764267990075, "grad_norm": 0.21042244136333466, "kl": 0.04364013671875, "learning_rate": 4.837640032693558e-07, "loss": 0.00043686479330062866, "memory(GiB)": 38.13, "reward": 0.42432692646980286, "reward_std": 0.05139390006661415, "rewards/VisualizationJSONCombinedORM/mean": 0.42432692646980286, "rewards/VisualizationJSONCombinedORM/std": 0.13275153934955597, "step": 5277, "train_speed(iter/s)": 0.082348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 317.875, "completions/min_length": 233.0, "epoch": 4.365591397849462, "grad_norm": 0.1774301826953888, "kl": 0.05157470703125, "learning_rate": 4.825256735708356e-07, "loss": 0.0005154833197593689, "memory(GiB)": 38.13, "reward": 0.3650537431240082, "reward_std": 0.032992295920848846, "rewards/VisualizationJSONCombinedORM/mean": 0.3650537431240082, "rewards/VisualizationJSONCombinedORM/std": 0.041410062462091446, "step": 5278, "train_speed(iter/s)": 0.082331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 333.9375, "completions/min_length": 278.0, "epoch": 4.36641852770885, "grad_norm": 0.21747606992721558, "kl": 0.05621337890625, "learning_rate": 4.812888504711505e-07, "loss": 0.0005610398948192596, "memory(GiB)": 38.13, "reward": 0.5447860956192017, "reward_std": 0.06183788925409317, "rewards/VisualizationJSONCombinedORM/mean": 0.5447860956192017, "rewards/VisualizationJSONCombinedORM/std": 0.1291772425174713, "step": 5279, "train_speed(iter/s)": 0.082313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 303.875, "completions/min_length": 239.0, "epoch": 4.367245657568239, "grad_norm": 0.16960622370243073, "kl": 0.1585693359375, "learning_rate": 4.800535343827834e-07, "loss": 0.0015837326645851135, "memory(GiB)": 38.13, "reward": 0.4412556290626526, "reward_std": 0.04275473952293396, "rewards/VisualizationJSONCombinedORM/mean": 0.4412556290626526, "rewards/VisualizationJSONCombinedORM/std": 0.21563242375850677, "step": 5280, "train_speed(iter/s)": 0.082295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 285.9375, "completions/min_length": 226.0, "epoch": 4.368072787427626, "grad_norm": 0.14461158215999603, "kl": 0.0440673828125, "learning_rate": 4.788197257177212e-07, "loss": 0.0004408247768878937, "memory(GiB)": 38.13, "reward": 0.682636022567749, "reward_std": 0.06733033806085587, "rewards/VisualizationJSONCombinedORM/mean": 0.682636022567749, "rewards/VisualizationJSONCombinedORM/std": 0.0702764019370079, "step": 5281, "train_speed(iter/s)": 0.082277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 298.625, "completions/min_length": 248.0, "epoch": 4.368899917287014, "grad_norm": 0.20523272454738617, "kl": 0.07659912109375, "learning_rate": 4.775874248874451e-07, "loss": 0.0007656514644622803, "memory(GiB)": 38.13, "reward": 0.6121044158935547, "reward_std": 0.05453869700431824, "rewards/VisualizationJSONCombinedORM/mean": 0.6121044158935547, "rewards/VisualizationJSONCombinedORM/std": 0.25657448172569275, "step": 5282, "train_speed(iter/s)": 0.082265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 293.5, "completions/min_length": 234.0, "epoch": 4.369727047146402, "grad_norm": 0.4966806471347809, "kl": 0.457763671875, "learning_rate": 4.7635663230293114e-07, "loss": 0.00460401177406311, "memory(GiB)": 38.13, "reward": 0.5089176893234253, "reward_std": 0.056600891053676605, "rewards/VisualizationJSONCombinedORM/mean": 0.5089176893234253, "rewards/VisualizationJSONCombinedORM/std": 0.12125537544488907, "step": 5283, "train_speed(iter/s)": 0.082249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 321.6875, "completions/min_length": 273.0, "epoch": 4.37055417700579, "grad_norm": 0.17098940908908844, "kl": 0.069091796875, "learning_rate": 4.7512734837465544e-07, "loss": 0.0006924048066139221, "memory(GiB)": 38.13, "reward": 0.5589470267295837, "reward_std": 0.04904758930206299, "rewards/VisualizationJSONCombinedORM/mean": 0.5589470267295837, "rewards/VisualizationJSONCombinedORM/std": 0.1512659788131714, "step": 5284, "train_speed(iter/s)": 0.082233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 266.4375, "completions/min_length": 227.0, "epoch": 4.371381306865178, "grad_norm": 0.21026842296123505, "kl": 0.05078125, "learning_rate": 4.738995735125895e-07, "loss": 0.0005084574222564697, "memory(GiB)": 38.13, "reward": 0.6298928260803223, "reward_std": 0.06730771064758301, "rewards/VisualizationJSONCombinedORM/mean": 0.6298928260803223, "rewards/VisualizationJSONCombinedORM/std": 0.16434936225414276, "step": 5285, "train_speed(iter/s)": 0.082221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 306.4375, "completions/min_length": 224.0, "epoch": 4.3722084367245655, "grad_norm": 0.2775593101978302, "kl": 0.1224365234375, "learning_rate": 4.726733081262036e-07, "loss": 0.0012241974472999573, "memory(GiB)": 38.13, "reward": 0.5644890069961548, "reward_std": 0.07535897195339203, "rewards/VisualizationJSONCombinedORM/mean": 0.5644890069961548, "rewards/VisualizationJSONCombinedORM/std": 0.09962331503629684, "step": 5286, "train_speed(iter/s)": 0.082204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 296.625, "completions/min_length": 226.0, "epoch": 4.373035566583954, "grad_norm": 0.17277273535728455, "kl": 0.03839111328125, "learning_rate": 4.7144855262446e-07, "loss": 0.00038370490074157715, "memory(GiB)": 38.13, "reward": 0.6507934331893921, "reward_std": 0.06581025570631027, "rewards/VisualizationJSONCombinedORM/mean": 0.6507934331893921, "rewards/VisualizationJSONCombinedORM/std": 0.13724708557128906, "step": 5287, "train_speed(iter/s)": 0.082187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 320.6875, "completions/min_length": 247.0, "epoch": 4.373862696443341, "grad_norm": 0.28112202882766724, "kl": 0.1002197265625, "learning_rate": 4.7022530741582363e-07, "loss": 0.0010022260248661041, "memory(GiB)": 38.13, "reward": 0.6768853664398193, "reward_std": 0.052816249430179596, "rewards/VisualizationJSONCombinedORM/mean": 0.6768853664398193, "rewards/VisualizationJSONCombinedORM/std": 0.13094834983348846, "step": 5288, "train_speed(iter/s)": 0.082166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 338.0, "completions/min_length": 255.0, "epoch": 4.374689826302729, "grad_norm": 0.2226291000843048, "kl": 0.0892333984375, "learning_rate": 4.6900357290825037e-07, "loss": 0.0008914675563573837, "memory(GiB)": 38.13, "reward": 0.5689287185668945, "reward_std": 0.05293108522891998, "rewards/VisualizationJSONCombinedORM/mean": 0.5689287185668945, "rewards/VisualizationJSONCombinedORM/std": 0.26194241642951965, "step": 5289, "train_speed(iter/s)": 0.082145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 282.125, "completions/min_length": 241.0, "epoch": 4.375516956162118, "grad_norm": 0.23676103353500366, "kl": 0.04296875, "learning_rate": 4.677833495091949e-07, "loss": 0.0004299059510231018, "memory(GiB)": 38.13, "reward": 0.4600873589515686, "reward_std": 0.0591546855866909, "rewards/VisualizationJSONCombinedORM/mean": 0.4600873589515686, "rewards/VisualizationJSONCombinedORM/std": 0.07141433656215668, "step": 5290, "train_speed(iter/s)": 0.082132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 314.6875, "completions/min_length": 253.0, "epoch": 4.376344086021505, "grad_norm": 0.19056160748004913, "kl": 0.07867431640625, "learning_rate": 4.665646376256078e-07, "loss": 0.0007865689694881439, "memory(GiB)": 38.13, "reward": 0.5325105786323547, "reward_std": 0.07473938167095184, "rewards/VisualizationJSONCombinedORM/mean": 0.5325105786323547, "rewards/VisualizationJSONCombinedORM/std": 0.08710575103759766, "step": 5291, "train_speed(iter/s)": 0.082113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 340.0625, "completions/min_length": 268.0, "epoch": 4.377171215880893, "grad_norm": 0.1824214905500412, "kl": 0.0526123046875, "learning_rate": 4.6534743766393554e-07, "loss": 0.0005255639553070068, "memory(GiB)": 38.13, "reward": 0.4349254369735718, "reward_std": 0.03182332590222359, "rewards/VisualizationJSONCombinedORM/mean": 0.4349254369735718, "rewards/VisualizationJSONCombinedORM/std": 0.18066824972629547, "step": 5292, "train_speed(iter/s)": 0.082089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 326.0, "completions/min_length": 249.0, "epoch": 4.377998345740282, "grad_norm": 0.3209565281867981, "kl": 0.07177734375, "learning_rate": 4.641317500301173e-07, "loss": 0.0007188841700553894, "memory(GiB)": 38.13, "reward": 0.5571117997169495, "reward_std": 0.06385037302970886, "rewards/VisualizationJSONCombinedORM/mean": 0.5571117997169495, "rewards/VisualizationJSONCombinedORM/std": 0.11580389738082886, "step": 5293, "train_speed(iter/s)": 0.082069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 301.1875, "completions/min_length": 239.0, "epoch": 4.378825475599669, "grad_norm": 0.1868482232093811, "kl": 0.0640869140625, "learning_rate": 4.62917575129595e-07, "loss": 0.000639788806438446, "memory(GiB)": 38.13, "reward": 0.6876273155212402, "reward_std": 0.06045927107334137, "rewards/VisualizationJSONCombinedORM/mean": 0.6876273155212402, "rewards/VisualizationJSONCombinedORM/std": 0.067911297082901, "step": 5294, "train_speed(iter/s)": 0.082045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 312.25, "completions/min_length": 242.0, "epoch": 4.379652605459057, "grad_norm": 0.26541274785995483, "kl": 0.255126953125, "learning_rate": 4.6170491336729794e-07, "loss": 0.002555467188358307, "memory(GiB)": 38.13, "reward": 0.5837898850440979, "reward_std": 0.07590268552303314, "rewards/VisualizationJSONCombinedORM/mean": 0.5837898850440979, "rewards/VisualizationJSONCombinedORM/std": 0.16392676532268524, "step": 5295, "train_speed(iter/s)": 0.082025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 323.875, "completions/min_length": 237.0, "epoch": 4.380479735318445, "grad_norm": 0.184256911277771, "kl": 0.0513916015625, "learning_rate": 4.6049376514765655e-07, "loss": 0.0005142763257026672, "memory(GiB)": 38.13, "reward": 0.5751951932907104, "reward_std": 0.15855765342712402, "rewards/VisualizationJSONCombinedORM/mean": 0.5751951932907104, "rewards/VisualizationJSONCombinedORM/std": 0.2084149718284607, "step": 5296, "train_speed(iter/s)": 0.082001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 307.125, "completions/min_length": 241.0, "epoch": 4.381306865177833, "grad_norm": 0.18282784521579742, "kl": 0.043212890625, "learning_rate": 4.5928413087459325e-07, "loss": 0.00043176859617233276, "memory(GiB)": 38.13, "reward": 0.42944055795669556, "reward_std": 0.030413120985031128, "rewards/VisualizationJSONCombinedORM/mean": 0.42944055795669556, "rewards/VisualizationJSONCombinedORM/std": 0.09295250475406647, "step": 5297, "train_speed(iter/s)": 0.08198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 281.8125, "completions/min_length": 227.0, "epoch": 4.382133995037221, "grad_norm": 0.1527702957391739, "kl": 0.06024169921875, "learning_rate": 4.5807601095152866e-07, "loss": 0.0006018728017807007, "memory(GiB)": 38.13, "reward": 0.5921862125396729, "reward_std": 0.03172028064727783, "rewards/VisualizationJSONCombinedORM/mean": 0.5921862125396729, "rewards/VisualizationJSONCombinedORM/std": 0.08224961906671524, "step": 5298, "train_speed(iter/s)": 0.081972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 321.0625, "completions/min_length": 234.0, "epoch": 4.3829611248966085, "grad_norm": 0.22018873691558838, "kl": 0.091552734375, "learning_rate": 4.5686940578137386e-07, "loss": 0.0009145140647888184, "memory(GiB)": 38.13, "reward": 0.5592012405395508, "reward_std": 0.05016176402568817, "rewards/VisualizationJSONCombinedORM/mean": 0.5592012405395508, "rewards/VisualizationJSONCombinedORM/std": 0.05186201632022858, "step": 5299, "train_speed(iter/s)": 0.081953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 317.5, "completions/min_length": 241.0, "epoch": 4.383788254755997, "grad_norm": 0.1973009556531906, "kl": 0.0821533203125, "learning_rate": 4.55664315766538e-07, "loss": 0.0008224025368690491, "memory(GiB)": 38.13, "reward": 0.6128647327423096, "reward_std": 0.06916210800409317, "rewards/VisualizationJSONCombinedORM/mean": 0.6128647327423096, "rewards/VisualizationJSONCombinedORM/std": 0.07474270462989807, "step": 5300, "train_speed(iter/s)": 0.081932 }, { "epoch": 4.383788254755997, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 375.0, "eval_completions/mean_length": 307.8489583333333, "eval_completions/min_length": 254.75, "eval_kl": 0.099212646484375, "eval_loss": 0.0009915943956002593, "eval_reward": 0.4647857540597518, "eval_reward_std": 0.05080500881498059, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4647857540597518, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05080500926123932, "eval_runtime": 317.3323, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 5300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 287.0, "completions/min_length": 226.0, "epoch": 4.384615384615385, "grad_norm": 0.22549661993980408, "kl": 0.0408935546875, "learning_rate": 4.5446074130892525e-07, "loss": 0.00040903128683567047, "memory(GiB)": 38.13, "reward": 0.4908829927444458, "reward_std": 0.07652540504932404, "rewards/VisualizationJSONCombinedORM/mean": 0.4908829927444458, "rewards/VisualizationJSONCombinedORM/std": 0.08866731822490692, "step": 5301, "train_speed(iter/s)": 0.081516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 301.4375, "completions/min_length": 249.0, "epoch": 4.385442514474772, "grad_norm": 0.21028801798820496, "kl": 0.052734375, "learning_rate": 4.532586828099339e-07, "loss": 0.0005297958850860596, "memory(GiB)": 38.13, "reward": 0.5540112257003784, "reward_std": 0.053729645907878876, "rewards/VisualizationJSONCombinedORM/mean": 0.5540112257003784, "rewards/VisualizationJSONCombinedORM/std": 0.1840217560529709, "step": 5302, "train_speed(iter/s)": 0.081495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 302.8125, "completions/min_length": 235.0, "epoch": 4.386269644334161, "grad_norm": 0.2682081460952759, "kl": 0.069580078125, "learning_rate": 4.5205814067045275e-07, "loss": 0.0006958208978176117, "memory(GiB)": 38.13, "reward": 0.281310498714447, "reward_std": 0.014020578004419804, "rewards/VisualizationJSONCombinedORM/mean": 0.281310498714447, "rewards/VisualizationJSONCombinedORM/std": 0.04404982924461365, "step": 5303, "train_speed(iter/s)": 0.081483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 322.375, "completions/min_length": 263.0, "epoch": 4.387096774193548, "grad_norm": 0.1947554349899292, "kl": 0.0333251953125, "learning_rate": 4.5085911529087256e-07, "loss": 0.00033330172300338745, "memory(GiB)": 38.13, "reward": 0.44546881318092346, "reward_std": 0.0422721803188324, "rewards/VisualizationJSONCombinedORM/mean": 0.44546881318092346, "rewards/VisualizationJSONCombinedORM/std": 0.1591123342514038, "step": 5304, "train_speed(iter/s)": 0.081467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 316.1875, "completions/min_length": 241.0, "epoch": 4.387923904052936, "grad_norm": 0.3135612905025482, "kl": 0.1494140625, "learning_rate": 4.4966160707107075e-07, "loss": 0.0014911293983459473, "memory(GiB)": 38.13, "reward": 0.7609665393829346, "reward_std": 0.06215915083885193, "rewards/VisualizationJSONCombinedORM/mean": 0.7609665393829346, "rewards/VisualizationJSONCombinedORM/std": 0.06482841074466705, "step": 5305, "train_speed(iter/s)": 0.081453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 280.5, "completions/min_length": 236.0, "epoch": 4.388751033912325, "grad_norm": 0.16546422243118286, "kl": 0.04034423828125, "learning_rate": 4.4846561641042343e-07, "loss": 0.00040312856435775757, "memory(GiB)": 38.13, "reward": 0.5403414368629456, "reward_std": 0.03620342165231705, "rewards/VisualizationJSONCombinedORM/mean": 0.5403414368629456, "rewards/VisualizationJSONCombinedORM/std": 0.2826133966445923, "step": 5306, "train_speed(iter/s)": 0.081432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 311.5625, "completions/min_length": 245.0, "epoch": 4.389578163771712, "grad_norm": 0.17296618223190308, "kl": 0.060546875, "learning_rate": 4.472711437077981e-07, "loss": 0.000606105662882328, "memory(GiB)": 38.13, "reward": 0.5330933332443237, "reward_std": 0.0449705645442009, "rewards/VisualizationJSONCombinedORM/mean": 0.5330933332443237, "rewards/VisualizationJSONCombinedORM/std": 0.07256291061639786, "step": 5307, "train_speed(iter/s)": 0.081412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 310.125, "completions/min_length": 242.0, "epoch": 4.3904052936311, "grad_norm": 0.19713905453681946, "kl": 0.10272216796875, "learning_rate": 4.460781893615584e-07, "loss": 0.001026161015033722, "memory(GiB)": 38.13, "reward": 0.5164295434951782, "reward_std": 0.1001124158501625, "rewards/VisualizationJSONCombinedORM/mean": 0.5164295434951782, "rewards/VisualizationJSONCombinedORM/std": 0.10608077794313431, "step": 5308, "train_speed(iter/s)": 0.081389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 301.0625, "completions/min_length": 233.0, "epoch": 4.391232423490488, "grad_norm": 0.16364048421382904, "kl": 0.039306640625, "learning_rate": 4.448867537695578e-07, "loss": 0.0003928951919078827, "memory(GiB)": 38.13, "reward": 0.5440046191215515, "reward_std": 0.037212859839200974, "rewards/VisualizationJSONCombinedORM/mean": 0.5440046191215515, "rewards/VisualizationJSONCombinedORM/std": 0.18587863445281982, "step": 5309, "train_speed(iter/s)": 0.081372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 322.25, "completions/min_length": 256.0, "epoch": 4.392059553349876, "grad_norm": 0.15251024067401886, "kl": 0.0411376953125, "learning_rate": 4.436968373291489e-07, "loss": 0.00041203200817108154, "memory(GiB)": 38.13, "reward": 0.4411923885345459, "reward_std": 0.04021007567644119, "rewards/VisualizationJSONCombinedORM/mean": 0.4411923885345459, "rewards/VisualizationJSONCombinedORM/std": 0.03921285271644592, "step": 5310, "train_speed(iter/s)": 0.08135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 319.3125, "completions/min_length": 251.0, "epoch": 4.392886683209264, "grad_norm": 0.18167732656002045, "kl": 0.0361328125, "learning_rate": 4.425084404371721e-07, "loss": 0.0003634020686149597, "memory(GiB)": 38.13, "reward": 0.541515588760376, "reward_std": 0.020869512110948563, "rewards/VisualizationJSONCombinedORM/mean": 0.541515588760376, "rewards/VisualizationJSONCombinedORM/std": 0.2508449852466583, "step": 5311, "train_speed(iter/s)": 0.081331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 328.6875, "completions/min_length": 248.0, "epoch": 4.3937138130686515, "grad_norm": 0.16422206163406372, "kl": 0.0426025390625, "learning_rate": 4.413215634899637e-07, "loss": 0.0004267096519470215, "memory(GiB)": 38.13, "reward": 0.47410494089126587, "reward_std": 0.05453384667634964, "rewards/VisualizationJSONCombinedORM/mean": 0.47410494089126587, "rewards/VisualizationJSONCombinedORM/std": 0.19827616214752197, "step": 5312, "train_speed(iter/s)": 0.081313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 323.625, "completions/min_length": 217.0, "epoch": 4.39454094292804, "grad_norm": 0.22454753518104553, "kl": 0.07012939453125, "learning_rate": 4.401362068833526e-07, "loss": 0.00070185586810112, "memory(GiB)": 38.13, "reward": 0.3855297565460205, "reward_std": 0.035593919456005096, "rewards/VisualizationJSONCombinedORM/mean": 0.3855297565460205, "rewards/VisualizationJSONCombinedORM/std": 0.03485891968011856, "step": 5313, "train_speed(iter/s)": 0.081299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 306.125, "completions/min_length": 244.0, "epoch": 4.395368072787428, "grad_norm": 0.20735587179660797, "kl": 0.193359375, "learning_rate": 4.3895237101266195e-07, "loss": 0.0019329823553562164, "memory(GiB)": 38.13, "reward": 0.5068665146827698, "reward_std": 0.03943609818816185, "rewards/VisualizationJSONCombinedORM/mean": 0.5068665146827698, "rewards/VisualizationJSONCombinedORM/std": 0.2723672389984131, "step": 5314, "train_speed(iter/s)": 0.081276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 303.4375, "completions/min_length": 242.0, "epoch": 4.396195202646815, "grad_norm": 0.2024778425693512, "kl": 0.093017578125, "learning_rate": 4.377700562727055e-07, "loss": 0.0009271726012229919, "memory(GiB)": 38.13, "reward": 0.5286691188812256, "reward_std": 0.06317012012004852, "rewards/VisualizationJSONCombinedORM/mean": 0.5286691188812256, "rewards/VisualizationJSONCombinedORM/std": 0.07161752879619598, "step": 5315, "train_speed(iter/s)": 0.081257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 320.5625, "completions/min_length": 249.0, "epoch": 4.397022332506204, "grad_norm": 0.177999809384346, "kl": 0.05108642578125, "learning_rate": 4.3658926305779003e-07, "loss": 0.0005109161138534546, "memory(GiB)": 38.13, "reward": 0.623410165309906, "reward_std": 0.031056277453899384, "rewards/VisualizationJSONCombinedORM/mean": 0.623410165309906, "rewards/VisualizationJSONCombinedORM/std": 0.23443856835365295, "step": 5316, "train_speed(iter/s)": 0.081236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 291.4375, "completions/min_length": 251.0, "epoch": 4.397849462365591, "grad_norm": 0.20862075686454773, "kl": 0.05364990234375, "learning_rate": 4.3540999176171717e-07, "loss": 0.0005373433232307434, "memory(GiB)": 38.13, "reward": 0.503731369972229, "reward_std": 0.046902935951948166, "rewards/VisualizationJSONCombinedORM/mean": 0.503731369972229, "rewards/VisualizationJSONCombinedORM/std": 0.04730372130870819, "step": 5317, "train_speed(iter/s)": 0.081222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 321.5, "completions/min_length": 255.0, "epoch": 4.398676592224979, "grad_norm": 0.1925603300333023, "kl": 0.08514404296875, "learning_rate": 4.3423224277777956e-07, "loss": 0.0008493810892105103, "memory(GiB)": 38.13, "reward": 0.631415605545044, "reward_std": 0.0725792869925499, "rewards/VisualizationJSONCombinedORM/mean": 0.631415605545044, "rewards/VisualizationJSONCombinedORM/std": 0.17797842621803284, "step": 5318, "train_speed(iter/s)": 0.081209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 300.125, "completions/min_length": 234.0, "epoch": 4.399503722084368, "grad_norm": 0.2445024847984314, "kl": 0.07196044921875, "learning_rate": 4.330560164987596e-07, "loss": 0.0007200911641120911, "memory(GiB)": 38.13, "reward": 0.7981826066970825, "reward_std": 0.05218835920095444, "rewards/VisualizationJSONCombinedORM/mean": 0.7981826066970825, "rewards/VisualizationJSONCombinedORM/std": 0.13731591403484344, "step": 5319, "train_speed(iter/s)": 0.081195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 303.8125, "completions/min_length": 260.0, "epoch": 4.400330851943755, "grad_norm": 0.17385347187519073, "kl": 0.035888671875, "learning_rate": 4.318813133169375e-07, "loss": 0.00035950541496276855, "memory(GiB)": 38.13, "reward": 0.5609915852546692, "reward_std": 0.01938472129404545, "rewards/VisualizationJSONCombinedORM/mean": 0.5609915852546692, "rewards/VisualizationJSONCombinedORM/std": 0.30782973766326904, "step": 5320, "train_speed(iter/s)": 0.08118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 291.125, "completions/min_length": 221.0, "epoch": 4.401157981803143, "grad_norm": 0.20700703561306, "kl": 0.060791015625, "learning_rate": 4.3070813362407924e-07, "loss": 0.0006076842546463013, "memory(GiB)": 38.13, "reward": 0.6939926147460938, "reward_std": 0.08036346733570099, "rewards/VisualizationJSONCombinedORM/mean": 0.6939926147460938, "rewards/VisualizationJSONCombinedORM/std": 0.09003210812807083, "step": 5321, "train_speed(iter/s)": 0.081164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 310.5, "completions/min_length": 236.0, "epoch": 4.4019851116625315, "grad_norm": 0.21779310703277588, "kl": 0.115478515625, "learning_rate": 4.2953647781144803e-07, "loss": 0.0011546462774276733, "memory(GiB)": 38.13, "reward": 0.4552963674068451, "reward_std": 0.05379649996757507, "rewards/VisualizationJSONCombinedORM/mean": 0.4552963674068451, "rewards/VisualizationJSONCombinedORM/std": 0.18843698501586914, "step": 5322, "train_speed(iter/s)": 0.081146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 316.125, "completions/min_length": 242.0, "epoch": 4.402812241521919, "grad_norm": 0.20728939771652222, "kl": 0.0814208984375, "learning_rate": 4.2836634626979513e-07, "loss": 0.0008136555552482605, "memory(GiB)": 38.13, "reward": 0.5236045718193054, "reward_std": 0.07273939251899719, "rewards/VisualizationJSONCombinedORM/mean": 0.5236045718193054, "rewards/VisualizationJSONCombinedORM/std": 0.10137218236923218, "step": 5323, "train_speed(iter/s)": 0.081124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 292.375, "completions/min_length": 258.0, "epoch": 4.403639371381307, "grad_norm": 0.40110400319099426, "kl": 0.06402587890625, "learning_rate": 4.2719773938936613e-07, "loss": 0.0006396137177944183, "memory(GiB)": 38.13, "reward": 0.4765681028366089, "reward_std": 0.055028460919857025, "rewards/VisualizationJSONCombinedORM/mean": 0.4765681028366089, "rewards/VisualizationJSONCombinedORM/std": 0.1665169596672058, "step": 5324, "train_speed(iter/s)": 0.081105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 294.5625, "completions/min_length": 224.0, "epoch": 4.4044665012406945, "grad_norm": 0.21392697095870972, "kl": 0.0523681640625, "learning_rate": 4.2603065755989493e-07, "loss": 0.0005233802367001772, "memory(GiB)": 38.13, "reward": 0.46918854117393494, "reward_std": 0.056893471628427505, "rewards/VisualizationJSONCombinedORM/mean": 0.46918854117393494, "rewards/VisualizationJSONCombinedORM/std": 0.07018597424030304, "step": 5325, "train_speed(iter/s)": 0.081089 }, { "epoch": 4.4044665012406945, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 379.125, "eval_completions/mean_length": 315.7239583333333, "eval_completions/min_length": 259.9166666666667, "eval_kl": 0.08197021484375, "eval_loss": 0.0008238402078859508, "eval_reward": 0.4615664066125949, "eval_reward_std": 0.05264943294848005, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4615664066125949, "eval_rewards/VisualizationJSONCombinedORM/std": 0.052649433084297925, "eval_runtime": 320.7273, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 290.6875, "completions/min_length": 239.0, "epoch": 4.405293631100083, "grad_norm": 0.2026377171278, "kl": 0.11181640625, "learning_rate": 4.2486510117061174e-07, "loss": 0.0011178180575370789, "memory(GiB)": 38.13, "reward": 0.5123368501663208, "reward_std": 0.053345806896686554, "rewards/VisualizationJSONCombinedORM/mean": 0.5123368501663208, "rewards/VisualizationJSONCombinedORM/std": 0.08121467381715775, "step": 5326, "train_speed(iter/s)": 0.080683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 312.9375, "completions/min_length": 250.0, "epoch": 4.406120760959471, "grad_norm": 0.22227074205875397, "kl": 0.1649169921875, "learning_rate": 4.2370107061023237e-07, "loss": 0.0016496367752552032, "memory(GiB)": 38.13, "reward": 0.48148635029792786, "reward_std": 0.06773396581411362, "rewards/VisualizationJSONCombinedORM/mean": 0.48148635029792786, "rewards/VisualizationJSONCombinedORM/std": 0.17207522690296173, "step": 5327, "train_speed(iter/s)": 0.080666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 299.6875, "completions/min_length": 235.0, "epoch": 4.406947890818858, "grad_norm": 0.2445417195558548, "kl": 0.1533203125, "learning_rate": 4.225385662669673e-07, "loss": 0.0015359790995717049, "memory(GiB)": 38.13, "reward": 0.38959723711013794, "reward_std": 0.0412486270070076, "rewards/VisualizationJSONCombinedORM/mean": 0.38959723711013794, "rewards/VisualizationJSONCombinedORM/std": 0.041490696370601654, "step": 5328, "train_speed(iter/s)": 0.080646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 325.0625, "completions/min_length": 222.0, "epoch": 4.407775020678247, "grad_norm": 0.3432800769805908, "kl": 0.2353515625, "learning_rate": 4.213775885285176e-07, "loss": 0.0023609474301338196, "memory(GiB)": 38.13, "reward": 0.5846249461174011, "reward_std": 0.05167552828788757, "rewards/VisualizationJSONCombinedORM/mean": 0.5846249461174011, "rewards/VisualizationJSONCombinedORM/std": 0.19821950793266296, "step": 5329, "train_speed(iter/s)": 0.080628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 313.875, "completions/min_length": 249.0, "epoch": 4.408602150537634, "grad_norm": 0.1760527789592743, "kl": 0.0994873046875, "learning_rate": 4.202181377820752e-07, "loss": 0.0009964779019355774, "memory(GiB)": 38.13, "reward": 0.6982088088989258, "reward_std": 0.04321128875017166, "rewards/VisualizationJSONCombinedORM/mean": 0.6982088088989258, "rewards/VisualizationJSONCombinedORM/std": 0.17287680506706238, "step": 5330, "train_speed(iter/s)": 0.08061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 247.75, "completions/min_length": 210.0, "epoch": 4.409429280397022, "grad_norm": 0.1505778431892395, "kl": 0.11358642578125, "learning_rate": 4.1906021441432074e-07, "loss": 0.0011346936225891113, "memory(GiB)": 38.13, "reward": 0.5998039841651917, "reward_std": 0.06921690702438354, "rewards/VisualizationJSONCombinedORM/mean": 0.5998039841651917, "rewards/VisualizationJSONCombinedORM/std": 0.07171618193387985, "step": 5331, "train_speed(iter/s)": 0.080596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 275.9375, "completions/min_length": 227.0, "epoch": 4.410256410256411, "grad_norm": 0.20672805607318878, "kl": 0.06915283203125, "learning_rate": 4.1790381881142816e-07, "loss": 0.0006916001439094543, "memory(GiB)": 38.13, "reward": 0.40855270624160767, "reward_std": 0.032015834003686905, "rewards/VisualizationJSONCombinedORM/mean": 0.40855270624160767, "rewards/VisualizationJSONCombinedORM/std": 0.1062045469880104, "step": 5332, "train_speed(iter/s)": 0.080579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 336.6875, "completions/min_length": 267.0, "epoch": 4.411083540115798, "grad_norm": 0.1653612107038498, "kl": 0.114013671875, "learning_rate": 4.167489513590611e-07, "loss": 0.0011416003108024597, "memory(GiB)": 38.13, "reward": 0.5917416214942932, "reward_std": 0.05379781126976013, "rewards/VisualizationJSONCombinedORM/mean": 0.5917416214942932, "rewards/VisualizationJSONCombinedORM/std": 0.10896793752908707, "step": 5333, "train_speed(iter/s)": 0.080561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 276.0625, "completions/min_length": 216.0, "epoch": 4.411910669975186, "grad_norm": 0.2925615906715393, "kl": 0.113525390625, "learning_rate": 4.1559561244237323e-07, "loss": 0.0011392058804631233, "memory(GiB)": 38.13, "reward": 0.5470857620239258, "reward_std": 0.04664314165711403, "rewards/VisualizationJSONCombinedORM/mean": 0.5470857620239258, "rewards/VisualizationJSONCombinedORM/std": 0.22677360475063324, "step": 5334, "train_speed(iter/s)": 0.080542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 289.5625, "completions/min_length": 245.0, "epoch": 4.412737799834574, "grad_norm": 0.26068243384361267, "kl": 0.0714111328125, "learning_rate": 4.1444380244600623e-07, "loss": 0.0007132813334465027, "memory(GiB)": 38.13, "reward": 0.6693285703659058, "reward_std": 0.07378427684307098, "rewards/VisualizationJSONCombinedORM/mean": 0.6693285703659058, "rewards/VisualizationJSONCombinedORM/std": 0.10588113218545914, "step": 5335, "train_speed(iter/s)": 0.080525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 298.8125, "completions/min_length": 237.0, "epoch": 4.413564929693962, "grad_norm": 0.1645195335149765, "kl": 0.06866455078125, "learning_rate": 4.1329352175409785e-07, "loss": 0.0006869100034236908, "memory(GiB)": 38.13, "reward": 0.5324236750602722, "reward_std": 0.04598316550254822, "rewards/VisualizationJSONCombinedORM/mean": 0.5324236750602722, "rewards/VisualizationJSONCombinedORM/std": 0.11780118197202682, "step": 5336, "train_speed(iter/s)": 0.080508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 315.9375, "completions/min_length": 220.0, "epoch": 4.41439205955335, "grad_norm": 0.19915573298931122, "kl": 0.0836181640625, "learning_rate": 4.1214477075026957e-07, "loss": 0.0008358359336853027, "memory(GiB)": 38.13, "reward": 0.562608540058136, "reward_std": 0.03023330122232437, "rewards/VisualizationJSONCombinedORM/mean": 0.562608540058136, "rewards/VisualizationJSONCombinedORM/std": 0.19514591991901398, "step": 5337, "train_speed(iter/s)": 0.080498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 307.8125, "completions/min_length": 228.0, "epoch": 4.4152191894127375, "grad_norm": 0.1828337013721466, "kl": 0.0540771484375, "learning_rate": 4.109975498176355e-07, "loss": 0.0005384385585784912, "memory(GiB)": 38.13, "reward": 0.5061800479888916, "reward_std": 0.05453949421644211, "rewards/VisualizationJSONCombinedORM/mean": 0.5061800479888916, "rewards/VisualizationJSONCombinedORM/std": 0.22006340324878693, "step": 5338, "train_speed(iter/s)": 0.080482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 307.8125, "completions/min_length": 257.0, "epoch": 4.416046319272126, "grad_norm": 0.1908445805311203, "kl": 0.076171875, "learning_rate": 4.098518593388001e-07, "loss": 0.0007628053426742554, "memory(GiB)": 38.13, "reward": 0.4979529082775116, "reward_std": 0.05979783087968826, "rewards/VisualizationJSONCombinedORM/mean": 0.4979529082775116, "rewards/VisualizationJSONCombinedORM/std": 0.13860131800174713, "step": 5339, "train_speed(iter/s)": 0.080465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 322.25, "completions/min_length": 261.0, "epoch": 4.416873449131514, "grad_norm": 0.19101472198963165, "kl": 0.0953369140625, "learning_rate": 4.087076996958561e-07, "loss": 0.0009526070207357407, "memory(GiB)": 38.13, "reward": 0.5367751121520996, "reward_std": 0.07258997857570648, "rewards/VisualizationJSONCombinedORM/mean": 0.5367751121520996, "rewards/VisualizationJSONCombinedORM/std": 0.14778824150562286, "step": 5340, "train_speed(iter/s)": 0.080441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 354.625, "completions/min_length": 296.0, "epoch": 4.417700578990901, "grad_norm": 0.21684639155864716, "kl": 0.0721435546875, "learning_rate": 4.0756507127038494e-07, "loss": 0.000719945877790451, "memory(GiB)": 38.13, "reward": 0.5938600301742554, "reward_std": 0.034125931560993195, "rewards/VisualizationJSONCombinedORM/mean": 0.5938600301742554, "rewards/VisualizationJSONCombinedORM/std": 0.1598658263683319, "step": 5341, "train_speed(iter/s)": 0.080424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 310.75, "completions/min_length": 226.0, "epoch": 4.41852770885029, "grad_norm": 0.24513652920722961, "kl": 0.0755615234375, "learning_rate": 4.064239744434606e-07, "loss": 0.0007535815238952637, "memory(GiB)": 38.13, "reward": 0.31884559988975525, "reward_std": 0.02504155971109867, "rewards/VisualizationJSONCombinedORM/mean": 0.31884559988975525, "rewards/VisualizationJSONCombinedORM/std": 0.08777065575122833, "step": 5342, "train_speed(iter/s)": 0.080408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 334.25, "completions/min_length": 262.0, "epoch": 4.419354838709677, "grad_norm": 0.1851864606142044, "kl": 0.18359375, "learning_rate": 4.052844095956426e-07, "loss": 0.0018341261893510818, "memory(GiB)": 38.13, "reward": 0.3276394009590149, "reward_std": 0.03258858621120453, "rewards/VisualizationJSONCombinedORM/mean": 0.3276394009590149, "rewards/VisualizationJSONCombinedORM/std": 0.15695340931415558, "step": 5343, "train_speed(iter/s)": 0.080384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 279.25, "completions/min_length": 232.0, "epoch": 4.420181968569065, "grad_norm": 0.19117850065231323, "kl": 0.06353759765625, "learning_rate": 4.04146377106982e-07, "loss": 0.0006361864507198334, "memory(GiB)": 38.13, "reward": 0.44859832525253296, "reward_std": 0.0492151640355587, "rewards/VisualizationJSONCombinedORM/mean": 0.44859832525253296, "rewards/VisualizationJSONCombinedORM/std": 0.2269541174173355, "step": 5344, "train_speed(iter/s)": 0.080368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 315.25, "completions/min_length": 267.0, "epoch": 4.421009098428454, "grad_norm": 0.20256246626377106, "kl": 0.047332763671875, "learning_rate": 4.0300987735701733e-07, "loss": 0.0004736706614494324, "memory(GiB)": 38.13, "reward": 0.37452149391174316, "reward_std": 0.041891761124134064, "rewards/VisualizationJSONCombinedORM/mean": 0.37452149391174316, "rewards/VisualizationJSONCombinedORM/std": 0.06700189411640167, "step": 5345, "train_speed(iter/s)": 0.080352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 333.9375, "completions/min_length": 252.0, "epoch": 4.421836228287841, "grad_norm": 0.26070520281791687, "kl": 0.0819091796875, "learning_rate": 4.0187491072477825e-07, "loss": 0.0008207038044929504, "memory(GiB)": 38.13, "reward": 0.4868777096271515, "reward_std": 0.06858183443546295, "rewards/VisualizationJSONCombinedORM/mean": 0.4868777096271515, "rewards/VisualizationJSONCombinedORM/std": 0.07879646867513657, "step": 5346, "train_speed(iter/s)": 0.080337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 309.75, "completions/min_length": 229.0, "epoch": 4.422663358147229, "grad_norm": 0.19476734101772308, "kl": 0.059539794921875, "learning_rate": 4.0074147758877915e-07, "loss": 0.0005957931280136108, "memory(GiB)": 38.13, "reward": 0.528063952922821, "reward_std": 0.06774057447910309, "rewards/VisualizationJSONCombinedORM/mean": 0.528063952922821, "rewards/VisualizationJSONCombinedORM/std": 0.09384837746620178, "step": 5347, "train_speed(iter/s)": 0.080323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 324.9375, "completions/min_length": 250.0, "epoch": 4.4234904880066175, "grad_norm": 0.20386435091495514, "kl": 0.1451416015625, "learning_rate": 3.9960957832702594e-07, "loss": 0.0014573857188224792, "memory(GiB)": 38.13, "reward": 0.5247443914413452, "reward_std": 0.0948716551065445, "rewards/VisualizationJSONCombinedORM/mean": 0.5247443914413452, "rewards/VisualizationJSONCombinedORM/std": 0.11836837977170944, "step": 5348, "train_speed(iter/s)": 0.080301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 300.875, "completions/min_length": 244.0, "epoch": 4.424317617866005, "grad_norm": 0.22051827609539032, "kl": 0.0440673828125, "learning_rate": 3.984792133170129e-07, "loss": 0.0004414040595293045, "memory(GiB)": 38.13, "reward": 0.7481342554092407, "reward_std": 0.054736461490392685, "rewards/VisualizationJSONCombinedORM/mean": 0.7481342554092407, "rewards/VisualizationJSONCombinedORM/std": 0.094966359436512, "step": 5349, "train_speed(iter/s)": 0.080287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 316.8125, "completions/min_length": 249.0, "epoch": 4.425144747725393, "grad_norm": 0.2903353273868561, "kl": 0.111328125, "learning_rate": 3.973503829357223e-07, "loss": 0.0011140629649162292, "memory(GiB)": 38.13, "reward": 0.5474956035614014, "reward_std": 0.0994715690612793, "rewards/VisualizationJSONCombinedORM/mean": 0.5474956035614014, "rewards/VisualizationJSONCombinedORM/std": 0.11330430209636688, "step": 5350, "train_speed(iter/s)": 0.08027 }, { "epoch": 4.425144747725393, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 379.1666666666667, "eval_completions/mean_length": 313.0572916666667, "eval_completions/min_length": 258.7916666666667, "eval_kl": 0.10044352213541667, "eval_loss": 0.0010137433419004083, "eval_reward": 0.45947464369237423, "eval_reward_std": 0.055215665216868125, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45947464369237423, "eval_rewards/VisualizationJSONCombinedORM/std": 0.055215667467564344, "eval_runtime": 320.4819, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 307.0625, "completions/min_length": 266.0, "epoch": 4.4259718775847805, "grad_norm": 0.21516287326812744, "kl": 0.06494140625, "learning_rate": 3.9622308755962234e-07, "loss": 0.0006497912108898163, "memory(GiB)": 38.13, "reward": 0.31589749455451965, "reward_std": 0.040244750678539276, "rewards/VisualizationJSONCombinedORM/mean": 0.31589749455451965, "rewards/VisualizationJSONCombinedORM/std": 0.1383494883775711, "step": 5351, "train_speed(iter/s)": 0.07987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 269.4375, "completions/min_length": 237.0, "epoch": 4.426799007444169, "grad_norm": 0.2279704064130783, "kl": 0.07293701171875, "learning_rate": 3.9509732756467465e-07, "loss": 0.0007295403629541397, "memory(GiB)": 38.13, "reward": 0.7694242000579834, "reward_std": 0.08662714064121246, "rewards/VisualizationJSONCombinedORM/mean": 0.7694242000579834, "rewards/VisualizationJSONCombinedORM/std": 0.117198646068573, "step": 5352, "train_speed(iter/s)": 0.07985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 321.25, "completions/min_length": 285.0, "epoch": 4.427626137303557, "grad_norm": 0.23863881826400757, "kl": 0.0596923828125, "learning_rate": 3.939731033263228e-07, "loss": 0.0005960352718830109, "memory(GiB)": 38.13, "reward": 0.606820821762085, "reward_std": 0.06986051052808762, "rewards/VisualizationJSONCombinedORM/mean": 0.606820821762085, "rewards/VisualizationJSONCombinedORM/std": 0.20812322199344635, "step": 5353, "train_speed(iter/s)": 0.079835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 336.3125, "completions/min_length": 267.0, "epoch": 4.428453267162944, "grad_norm": 0.16545724868774414, "kl": 0.05126953125, "learning_rate": 3.9285041521950176e-07, "loss": 0.0005121119320392609, "memory(GiB)": 38.13, "reward": 0.7062239646911621, "reward_std": 0.04737592861056328, "rewards/VisualizationJSONCombinedORM/mean": 0.7062239646911621, "rewards/VisualizationJSONCombinedORM/std": 0.06709335744380951, "step": 5354, "train_speed(iter/s)": 0.079821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 295.4375, "completions/min_length": 233.0, "epoch": 4.429280397022333, "grad_norm": 0.19747154414653778, "kl": 0.0955810546875, "learning_rate": 3.9172926361863316e-07, "loss": 0.0009563639760017395, "memory(GiB)": 38.13, "reward": 0.4342848062515259, "reward_std": 0.03830623999238014, "rewards/VisualizationJSONCombinedORM/mean": 0.4342848062515259, "rewards/VisualizationJSONCombinedORM/std": 0.11574793606996536, "step": 5355, "train_speed(iter/s)": 0.079801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 321.25, "completions/min_length": 271.0, "epoch": 4.43010752688172, "grad_norm": 0.19350846111774445, "kl": 0.07073974609375, "learning_rate": 3.906096488976269e-07, "loss": 0.0007081441581249237, "memory(GiB)": 38.13, "reward": 0.5366219878196716, "reward_std": 0.062153272330760956, "rewards/VisualizationJSONCombinedORM/mean": 0.5366219878196716, "rewards/VisualizationJSONCombinedORM/std": 0.061390701681375504, "step": 5356, "train_speed(iter/s)": 0.079786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/mean_length": 361.125, "completions/min_length": 267.0, "epoch": 4.430934656741108, "grad_norm": 0.24853835999965668, "kl": 0.05096435546875, "learning_rate": 3.894915714298775e-07, "loss": 0.0005091149359941483, "memory(GiB)": 38.13, "reward": 0.6460591554641724, "reward_std": 0.06693343818187714, "rewards/VisualizationJSONCombinedORM/mean": 0.6460591554641724, "rewards/VisualizationJSONCombinedORM/std": 0.06495118141174316, "step": 5357, "train_speed(iter/s)": 0.079767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 325.9375, "completions/min_length": 258.0, "epoch": 4.431761786600497, "grad_norm": 0.21928371489048004, "kl": 0.04046630859375, "learning_rate": 3.883750315882728e-07, "loss": 0.00040454044938087463, "memory(GiB)": 38.13, "reward": 0.5705132484436035, "reward_std": 0.06197758764028549, "rewards/VisualizationJSONCombinedORM/mean": 0.5705132484436035, "rewards/VisualizationJSONCombinedORM/std": 0.06537239253520966, "step": 5358, "train_speed(iter/s)": 0.079751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 299.375, "completions/min_length": 230.0, "epoch": 4.432588916459884, "grad_norm": 0.1924351155757904, "kl": 0.05926513671875, "learning_rate": 3.872600297451806e-07, "loss": 0.0005940943956375122, "memory(GiB)": 38.13, "reward": 0.5760006904602051, "reward_std": 0.04787447676062584, "rewards/VisualizationJSONCombinedORM/mean": 0.5760006904602051, "rewards/VisualizationJSONCombinedORM/std": 0.1745365858078003, "step": 5359, "train_speed(iter/s)": 0.079736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 267.125, "completions/min_length": 223.0, "epoch": 4.433416046319272, "grad_norm": 0.19339832663536072, "kl": 0.09130859375, "learning_rate": 3.8614656627246115e-07, "loss": 0.0009155049920082092, "memory(GiB)": 38.13, "reward": 0.5342353582382202, "reward_std": 0.06805133074522018, "rewards/VisualizationJSONCombinedORM/mean": 0.5342353582382202, "rewards/VisualizationJSONCombinedORM/std": 0.22409379482269287, "step": 5360, "train_speed(iter/s)": 0.079715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 307.375, "completions/min_length": 240.0, "epoch": 4.43424317617866, "grad_norm": 0.21521733701229095, "kl": 0.1016845703125, "learning_rate": 3.850346415414585e-07, "loss": 0.0010170415043830872, "memory(GiB)": 38.13, "reward": 0.40630945563316345, "reward_std": 0.025547439232468605, "rewards/VisualizationJSONCombinedORM/mean": 0.40630945563316345, "rewards/VisualizationJSONCombinedORM/std": 0.08221583813428879, "step": 5361, "train_speed(iter/s)": 0.079701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 320.4375, "completions/min_length": 253.0, "epoch": 4.435070306038048, "grad_norm": 0.2328965812921524, "kl": 0.130859375, "learning_rate": 3.8392425592300666e-07, "loss": 0.0013086283579468727, "memory(GiB)": 38.13, "reward": 0.5564453601837158, "reward_std": 0.03356845676898956, "rewards/VisualizationJSONCombinedORM/mean": 0.5564453601837158, "rewards/VisualizationJSONCombinedORM/std": 0.25303104519844055, "step": 5362, "train_speed(iter/s)": 0.079685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 297.875, "completions/min_length": 231.0, "epoch": 4.435897435897436, "grad_norm": 0.24521194398403168, "kl": 0.187255859375, "learning_rate": 3.8281540978742206e-07, "loss": 0.001871451735496521, "memory(GiB)": 38.13, "reward": 0.26857897639274597, "reward_std": 0.036749113351106644, "rewards/VisualizationJSONCombinedORM/mean": 0.26857897639274597, "rewards/VisualizationJSONCombinedORM/std": 0.042532309889793396, "step": 5363, "train_speed(iter/s)": 0.07967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 304.9375, "completions/min_length": 238.0, "epoch": 4.4367245657568235, "grad_norm": 0.16182196140289307, "kl": 0.03814697265625, "learning_rate": 3.817081035045117e-07, "loss": 0.0003817826509475708, "memory(GiB)": 38.13, "reward": 0.4896100163459778, "reward_std": 0.034824054688215256, "rewards/VisualizationJSONCombinedORM/mean": 0.4896100163459778, "rewards/VisualizationJSONCombinedORM/std": 0.1462431401014328, "step": 5364, "train_speed(iter/s)": 0.079652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 305.625, "completions/min_length": 226.0, "epoch": 4.437551695616212, "grad_norm": 0.2145892083644867, "kl": 0.05615234375, "learning_rate": 3.8060233744356634e-07, "loss": 0.0005615893751382828, "memory(GiB)": 38.13, "reward": 0.6410056352615356, "reward_std": 0.0672130435705185, "rewards/VisualizationJSONCombinedORM/mean": 0.6410056352615356, "rewards/VisualizationJSONCombinedORM/std": 0.1476869434118271, "step": 5365, "train_speed(iter/s)": 0.079636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 295.75, "completions/min_length": 264.0, "epoch": 4.4383788254756, "grad_norm": 0.22106273472309113, "kl": 0.048583984375, "learning_rate": 3.7949811197336485e-07, "loss": 0.00048602744936943054, "memory(GiB)": 38.13, "reward": 0.6562182903289795, "reward_std": 0.0879083126783371, "rewards/VisualizationJSONCombinedORM/mean": 0.6562182903289795, "rewards/VisualizationJSONCombinedORM/std": 0.08578279614448547, "step": 5366, "train_speed(iter/s)": 0.079619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 277.8125, "completions/min_length": 218.0, "epoch": 4.439205955334987, "grad_norm": 0.1746637523174286, "kl": 0.0888671875, "learning_rate": 3.783954274621721e-07, "loss": 0.0008897781372070312, "memory(GiB)": 38.13, "reward": 0.5385036468505859, "reward_std": 0.058365970849990845, "rewards/VisualizationJSONCombinedORM/mean": 0.5385036468505859, "rewards/VisualizationJSONCombinedORM/std": 0.13274531066417694, "step": 5367, "train_speed(iter/s)": 0.079608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 308.0625, "completions/min_length": 223.0, "epoch": 4.440033085194376, "grad_norm": 0.16504324972629547, "kl": 0.135009765625, "learning_rate": 3.7729428427773717e-07, "loss": 0.0013482198119163513, "memory(GiB)": 38.13, "reward": 0.5652102828025818, "reward_std": 0.07147151976823807, "rewards/VisualizationJSONCombinedORM/mean": 0.5652102828025818, "rewards/VisualizationJSONCombinedORM/std": 0.12111927568912506, "step": 5368, "train_speed(iter/s)": 0.079589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 299.1875, "completions/min_length": 232.0, "epoch": 4.440860215053763, "grad_norm": 0.260538250207901, "kl": 0.046630859375, "learning_rate": 3.761946827872981e-07, "loss": 0.0004668682813644409, "memory(GiB)": 38.13, "reward": 0.6452752351760864, "reward_std": 0.064762644469738, "rewards/VisualizationJSONCombinedORM/mean": 0.6452752351760864, "rewards/VisualizationJSONCombinedORM/std": 0.06472770124673843, "step": 5369, "train_speed(iter/s)": 0.079576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 306.1875, "completions/min_length": 212.0, "epoch": 4.441687344913151, "grad_norm": 0.18037143349647522, "kl": 0.0528564453125, "learning_rate": 3.750966233575753e-07, "loss": 0.0005290452390909195, "memory(GiB)": 38.13, "reward": 0.7186825275421143, "reward_std": 0.05996387451887131, "rewards/VisualizationJSONCombinedORM/mean": 0.7186825275421143, "rewards/VisualizationJSONCombinedORM/std": 0.07651858031749725, "step": 5370, "train_speed(iter/s)": 0.079559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 307.125, "completions/min_length": 246.0, "epoch": 4.44251447477254, "grad_norm": 0.1807573437690735, "kl": 0.0823974609375, "learning_rate": 3.740001063547782e-07, "loss": 0.0008233971893787384, "memory(GiB)": 38.13, "reward": 0.6426022052764893, "reward_std": 0.07755245268344879, "rewards/VisualizationJSONCombinedORM/mean": 0.6426022052764893, "rewards/VisualizationJSONCombinedORM/std": 0.09651850163936615, "step": 5371, "train_speed(iter/s)": 0.079545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 325.25, "completions/min_length": 267.0, "epoch": 4.443341604631927, "grad_norm": 0.2115166336297989, "kl": 0.058349609375, "learning_rate": 3.729051321445992e-07, "loss": 0.0005838871002197266, "memory(GiB)": 38.13, "reward": 0.46032655239105225, "reward_std": 0.05902639031410217, "rewards/VisualizationJSONCombinedORM/mean": 0.46032655239105225, "rewards/VisualizationJSONCombinedORM/std": 0.0879594162106514, "step": 5372, "train_speed(iter/s)": 0.079529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 306.5, "completions/min_length": 268.0, "epoch": 4.444168734491315, "grad_norm": 0.20889416337013245, "kl": 0.1378173828125, "learning_rate": 3.71811701092219e-07, "loss": 0.0013781078159809113, "memory(GiB)": 38.13, "reward": 0.5618293881416321, "reward_std": 0.049276795238256454, "rewards/VisualizationJSONCombinedORM/mean": 0.5618293881416321, "rewards/VisualizationJSONCombinedORM/std": 0.2112744301557541, "step": 5373, "train_speed(iter/s)": 0.079513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 288.8125, "completions/min_length": 226.0, "epoch": 4.4449958643507035, "grad_norm": 0.3221009373664856, "kl": 0.0677490234375, "learning_rate": 3.707198135622997e-07, "loss": 0.0006756209768354893, "memory(GiB)": 38.13, "reward": 0.5591988563537598, "reward_std": 0.10628975927829742, "rewards/VisualizationJSONCombinedORM/mean": 0.5591988563537598, "rewards/VisualizationJSONCombinedORM/std": 0.12116622924804688, "step": 5374, "train_speed(iter/s)": 0.079498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 323.625, "completions/min_length": 255.0, "epoch": 4.445822994210091, "grad_norm": 0.2092239260673523, "kl": 0.0736083984375, "learning_rate": 3.696294699189934e-07, "loss": 0.0007348842918872833, "memory(GiB)": 38.13, "reward": 0.5406885147094727, "reward_std": 0.02765841968357563, "rewards/VisualizationJSONCombinedORM/mean": 0.5406885147094727, "rewards/VisualizationJSONCombinedORM/std": 0.07003360986709595, "step": 5375, "train_speed(iter/s)": 0.079476 }, { "epoch": 4.445822994210091, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 368.375, "eval_completions/mean_length": 305.9270833333333, "eval_completions/min_length": 253.91666666666666, "eval_kl": 0.081787109375, "eval_loss": 0.0008324707741849124, "eval_reward": 0.45508256182074547, "eval_reward_std": 0.05266423348803073, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45508256182074547, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05266423457457373, "eval_runtime": 313.7073, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 5375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 290.125, "completions/min_length": 222.0, "epoch": 4.446650124069479, "grad_norm": 0.23555059731006622, "kl": 0.1207275390625, "learning_rate": 3.685406705259326e-07, "loss": 0.0012067873030900955, "memory(GiB)": 38.13, "reward": 0.5349772572517395, "reward_std": 0.08584265410900116, "rewards/VisualizationJSONCombinedORM/mean": 0.5349772572517395, "rewards/VisualizationJSONCombinedORM/std": 0.10211027413606644, "step": 5376, "train_speed(iter/s)": 0.079089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 299.4375, "completions/min_length": 232.0, "epoch": 4.4474772539288665, "grad_norm": 0.20514456927776337, "kl": 0.0616455078125, "learning_rate": 3.6745341574623785e-07, "loss": 0.0006155595183372498, "memory(GiB)": 38.13, "reward": 0.6384141445159912, "reward_std": 0.04542510583996773, "rewards/VisualizationJSONCombinedORM/mean": 0.6384141445159912, "rewards/VisualizationJSONCombinedORM/std": 0.149898961186409, "step": 5377, "train_speed(iter/s)": 0.079076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 301.5, "completions/min_length": 226.0, "epoch": 4.448304383788255, "grad_norm": 0.16382308304309845, "kl": 0.029205322265625, "learning_rate": 3.663677059425136e-07, "loss": 0.0002923794090747833, "memory(GiB)": 38.13, "reward": 0.5254051685333252, "reward_std": 0.06714966148138046, "rewards/VisualizationJSONCombinedORM/mean": 0.5254051685333252, "rewards/VisualizationJSONCombinedORM/std": 0.07438596338033676, "step": 5378, "train_speed(iter/s)": 0.079059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 287.4375, "completions/min_length": 235.0, "epoch": 4.449131513647643, "grad_norm": 0.1634562462568283, "kl": 0.05804443359375, "learning_rate": 3.652835414768502e-07, "loss": 0.000580312917008996, "memory(GiB)": 38.13, "reward": 0.5327486991882324, "reward_std": 0.04617907106876373, "rewards/VisualizationJSONCombinedORM/mean": 0.5327486991882324, "rewards/VisualizationJSONCombinedORM/std": 0.11176495254039764, "step": 5379, "train_speed(iter/s)": 0.079045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 319.75, "completions/min_length": 255.0, "epoch": 4.44995864350703, "grad_norm": 0.1981227993965149, "kl": 0.06512451171875, "learning_rate": 3.642009227108195e-07, "loss": 0.0006516464054584503, "memory(GiB)": 38.13, "reward": 0.5907485485076904, "reward_std": 0.07042355090379715, "rewards/VisualizationJSONCombinedORM/mean": 0.5907485485076904, "rewards/VisualizationJSONCombinedORM/std": 0.09516946971416473, "step": 5380, "train_speed(iter/s)": 0.079031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 309.625, "completions/min_length": 232.0, "epoch": 4.450785773366419, "grad_norm": 0.24482464790344238, "kl": 0.05828857421875, "learning_rate": 3.6311985000548223e-07, "loss": 0.0005820393562316895, "memory(GiB)": 38.13, "reward": 0.4777950942516327, "reward_std": 0.048938460648059845, "rewards/VisualizationJSONCombinedORM/mean": 0.4777950942516327, "rewards/VisualizationJSONCombinedORM/std": 0.13211789727210999, "step": 5381, "train_speed(iter/s)": 0.079015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 280.8125, "completions/min_length": 229.0, "epoch": 4.451612903225806, "grad_norm": 0.29725682735443115, "kl": 0.04022216796875, "learning_rate": 3.620403237213799e-07, "loss": 0.0004023611545562744, "memory(GiB)": 38.13, "reward": 0.4841720461845398, "reward_std": 0.06572949886322021, "rewards/VisualizationJSONCombinedORM/mean": 0.4841720461845398, "rewards/VisualizationJSONCombinedORM/std": 0.1840437352657318, "step": 5382, "train_speed(iter/s)": 0.079002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 304.6875, "completions/min_length": 253.0, "epoch": 4.452440033085194, "grad_norm": 0.20557668805122375, "kl": 0.0843505859375, "learning_rate": 3.609623442185395e-07, "loss": 0.0008437931537628174, "memory(GiB)": 38.13, "reward": 0.6120681762695312, "reward_std": 0.023303460329771042, "rewards/VisualizationJSONCombinedORM/mean": 0.6120681762695312, "rewards/VisualizationJSONCombinedORM/std": 0.109037846326828, "step": 5383, "train_speed(iter/s)": 0.078981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 293.4375, "completions/min_length": 233.0, "epoch": 4.453267162944583, "grad_norm": 0.28360965847969055, "kl": 0.31640625, "learning_rate": 3.5988591185647235e-07, "loss": 0.003165982663631439, "memory(GiB)": 38.13, "reward": 0.6919276714324951, "reward_std": 0.083184614777565, "rewards/VisualizationJSONCombinedORM/mean": 0.6919276714324951, "rewards/VisualizationJSONCombinedORM/std": 0.08104449510574341, "step": 5384, "train_speed(iter/s)": 0.078963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 288.0625, "completions/min_length": 220.0, "epoch": 4.45409429280397, "grad_norm": 0.16926085948944092, "kl": 0.05908203125, "learning_rate": 3.588110269941747e-07, "loss": 0.000591987743973732, "memory(GiB)": 38.13, "reward": 0.36196470260620117, "reward_std": 0.0314708910882473, "rewards/VisualizationJSONCombinedORM/mean": 0.36196470260620117, "rewards/VisualizationJSONCombinedORM/std": 0.13797344267368317, "step": 5385, "train_speed(iter/s)": 0.078951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 310.6875, "completions/min_length": 256.0, "epoch": 4.454921422663358, "grad_norm": 0.1953382045030594, "kl": 0.02508544921875, "learning_rate": 3.577376899901236e-07, "loss": 0.00025043264031410217, "memory(GiB)": 38.13, "reward": 0.5641522407531738, "reward_std": 0.053196363151073456, "rewards/VisualizationJSONCombinedORM/mean": 0.5641522407531738, "rewards/VisualizationJSONCombinedORM/std": 0.18144206702709198, "step": 5386, "train_speed(iter/s)": 0.078936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/mean_length": 329.75, "completions/min_length": 228.0, "epoch": 4.4557485525227465, "grad_norm": 0.2332996428012848, "kl": 0.08148193359375, "learning_rate": 3.5666590120228384e-07, "loss": 0.000815611332654953, "memory(GiB)": 38.13, "reward": 0.6382853984832764, "reward_std": 0.0559295117855072, "rewards/VisualizationJSONCombinedORM/mean": 0.6382853984832764, "rewards/VisualizationJSONCombinedORM/std": 0.169671893119812, "step": 5387, "train_speed(iter/s)": 0.078918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 301.625, "completions/min_length": 250.0, "epoch": 4.456575682382134, "grad_norm": 0.1967635303735733, "kl": 0.064697265625, "learning_rate": 3.5559566098810116e-07, "loss": 0.0006460286676883698, "memory(GiB)": 38.13, "reward": 0.513853907585144, "reward_std": 0.05093942582607269, "rewards/VisualizationJSONCombinedORM/mean": 0.513853907585144, "rewards/VisualizationJSONCombinedORM/std": 0.16813145577907562, "step": 5388, "train_speed(iter/s)": 0.078904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/mean_length": 335.375, "completions/min_length": 258.0, "epoch": 4.457402812241522, "grad_norm": 0.18584498763084412, "kl": 0.086669921875, "learning_rate": 3.5452696970450674e-07, "loss": 0.0008659586310386658, "memory(GiB)": 38.13, "reward": 0.4078143239021301, "reward_std": 0.028165549039840698, "rewards/VisualizationJSONCombinedORM/mean": 0.4078143239021301, "rewards/VisualizationJSONCombinedORM/std": 0.10476154834032059, "step": 5389, "train_speed(iter/s)": 0.078885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 322.9375, "completions/min_length": 273.0, "epoch": 4.4582299421009095, "grad_norm": 0.17250487208366394, "kl": 0.068115234375, "learning_rate": 3.5345982770791096e-07, "loss": 0.0006812363862991333, "memory(GiB)": 38.13, "reward": 0.7196593284606934, "reward_std": 0.04970823973417282, "rewards/VisualizationJSONCombinedORM/mean": 0.7196593284606934, "rewards/VisualizationJSONCombinedORM/std": 0.12494216859340668, "step": 5390, "train_speed(iter/s)": 0.078867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 314.5, "completions/min_length": 256.0, "epoch": 4.459057071960298, "grad_norm": 0.23246170580387115, "kl": 0.0450439453125, "learning_rate": 3.5239423535421537e-07, "loss": 0.00044968724250793457, "memory(GiB)": 38.13, "reward": 0.6162046194076538, "reward_std": 0.0518009215593338, "rewards/VisualizationJSONCombinedORM/mean": 0.6162046194076538, "rewards/VisualizationJSONCombinedORM/std": 0.19006365537643433, "step": 5391, "train_speed(iter/s)": 0.078857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 293.0, "completions/min_length": 235.0, "epoch": 4.459884201819686, "grad_norm": 0.21358130872249603, "kl": 0.11761474609375, "learning_rate": 3.5133019299879665e-07, "loss": 0.001178254373371601, "memory(GiB)": 38.13, "reward": 0.43189799785614014, "reward_std": 0.06330651789903641, "rewards/VisualizationJSONCombinedORM/mean": 0.43189799785614014, "rewards/VisualizationJSONCombinedORM/std": 0.11140107363462448, "step": 5392, "train_speed(iter/s)": 0.078842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 311.25, "completions/min_length": 243.0, "epoch": 4.460711331679073, "grad_norm": 0.2013665735721588, "kl": 0.04656982421875, "learning_rate": 3.5026770099651885e-07, "loss": 0.0004654712975025177, "memory(GiB)": 38.13, "reward": 0.6261125802993774, "reward_std": 0.0420970544219017, "rewards/VisualizationJSONCombinedORM/mean": 0.6261125802993774, "rewards/VisualizationJSONCombinedORM/std": 0.14903098344802856, "step": 5393, "train_speed(iter/s)": 0.078825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 331.5625, "completions/min_length": 252.0, "epoch": 4.461538461538462, "grad_norm": 0.224024698138237, "kl": 0.14654541015625, "learning_rate": 3.492067597017279e-07, "loss": 0.0014629010111093521, "memory(GiB)": 38.13, "reward": 0.5471301674842834, "reward_std": 0.059649087488651276, "rewards/VisualizationJSONCombinedORM/mean": 0.5471301674842834, "rewards/VisualizationJSONCombinedORM/std": 0.06539336591959, "step": 5394, "train_speed(iter/s)": 0.078814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 319.5625, "completions/min_length": 268.0, "epoch": 4.462365591397849, "grad_norm": 0.23730319738388062, "kl": 0.072998046875, "learning_rate": 3.4814736946825357e-07, "loss": 0.0007295794785022736, "memory(GiB)": 38.13, "reward": 0.4786994159221649, "reward_std": 0.02245069108903408, "rewards/VisualizationJSONCombinedORM/mean": 0.4786994159221649, "rewards/VisualizationJSONCombinedORM/std": 0.13661503791809082, "step": 5395, "train_speed(iter/s)": 0.078799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 294.1875, "completions/min_length": 234.0, "epoch": 4.463192721257237, "grad_norm": 0.1658691167831421, "kl": 0.07049560546875, "learning_rate": 3.4708953064940487e-07, "loss": 0.0007053837180137634, "memory(GiB)": 38.13, "reward": 0.5890011787414551, "reward_std": 0.017590750008821487, "rewards/VisualizationJSONCombinedORM/mean": 0.5890011787414551, "rewards/VisualizationJSONCombinedORM/std": 0.2816520929336548, "step": 5396, "train_speed(iter/s)": 0.078781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 292.9375, "completions/min_length": 239.0, "epoch": 4.464019851116626, "grad_norm": 0.17814740538597107, "kl": 0.04754638671875, "learning_rate": 3.4603324359798016e-07, "loss": 0.0004768744111061096, "memory(GiB)": 38.13, "reward": 0.6009869575500488, "reward_std": 0.07199671119451523, "rewards/VisualizationJSONCombinedORM/mean": 0.6009869575500488, "rewards/VisualizationJSONCombinedORM/std": 0.10401077568531036, "step": 5397, "train_speed(iter/s)": 0.078768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 284.1875, "completions/min_length": 225.0, "epoch": 4.464846980976013, "grad_norm": 0.19544638693332672, "kl": 0.05938720703125, "learning_rate": 3.4497850866625213e-07, "loss": 0.0005945339798927307, "memory(GiB)": 38.13, "reward": 0.6424591541290283, "reward_std": 0.06836824119091034, "rewards/VisualizationJSONCombinedORM/mean": 0.6424591541290283, "rewards/VisualizationJSONCombinedORM/std": 0.06784358620643616, "step": 5398, "train_speed(iter/s)": 0.078747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 296.0, "completions/min_length": 233.0, "epoch": 4.465674110835401, "grad_norm": 0.1592486947774887, "kl": 0.0408935546875, "learning_rate": 3.439253262059822e-07, "loss": 0.00040905922651290894, "memory(GiB)": 38.13, "reward": 0.6615196466445923, "reward_std": 0.049277640879154205, "rewards/VisualizationJSONCombinedORM/mean": 0.6615196466445923, "rewards/VisualizationJSONCombinedORM/std": 0.168157696723938, "step": 5399, "train_speed(iter/s)": 0.07873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 305.125, "completions/min_length": 231.0, "epoch": 4.4665012406947895, "grad_norm": 0.18046934902668, "kl": 0.02777099609375, "learning_rate": 3.4287369656841095e-07, "loss": 0.0002777464687824249, "memory(GiB)": 38.13, "reward": 0.5486313700675964, "reward_std": 0.04103350639343262, "rewards/VisualizationJSONCombinedORM/mean": 0.5486313700675964, "rewards/VisualizationJSONCombinedORM/std": 0.23022636771202087, "step": 5400, "train_speed(iter/s)": 0.078714 }, { "epoch": 4.4665012406947895, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 379.5833333333333, "eval_completions/mean_length": 312.4270833333333, "eval_completions/min_length": 258.2916666666667, "eval_kl": 0.08394368489583333, "eval_loss": 0.0008493487839587033, "eval_reward": 0.448137441650033, "eval_reward_std": 0.053473912606326245, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.448137441650033, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05347391175261388, "eval_runtime": 320.4714, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 319.0625, "completions/min_length": 268.0, "epoch": 4.467328370554177, "grad_norm": 0.1767505556344986, "kl": 0.1181640625, "learning_rate": 3.4182362010426184e-07, "loss": 0.001180201768875122, "memory(GiB)": 38.13, "reward": 0.528969407081604, "reward_std": 0.053006719797849655, "rewards/VisualizationJSONCombinedORM/mean": 0.528969407081604, "rewards/VisualizationJSONCombinedORM/std": 0.10976404696702957, "step": 5401, "train_speed(iter/s)": 0.078334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 317.6875, "completions/min_length": 242.0, "epoch": 4.468155500413565, "grad_norm": 0.1637835055589676, "kl": 0.06134033203125, "learning_rate": 3.407750971637402e-07, "loss": 0.0006128512322902679, "memory(GiB)": 38.13, "reward": 0.7567466497421265, "reward_std": 0.0844956785440445, "rewards/VisualizationJSONCombinedORM/mean": 0.7567466497421265, "rewards/VisualizationJSONCombinedORM/std": 0.09275505691766739, "step": 5402, "train_speed(iter/s)": 0.078319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 328.5, "completions/min_length": 266.0, "epoch": 4.4689826302729525, "grad_norm": 0.2806827127933502, "kl": 0.0550537109375, "learning_rate": 3.397281280965331e-07, "loss": 0.0005499571561813354, "memory(GiB)": 38.13, "reward": 0.42924177646636963, "reward_std": 0.04780256748199463, "rewards/VisualizationJSONCombinedORM/mean": 0.42924177646636963, "rewards/VisualizationJSONCombinedORM/std": 0.09888535737991333, "step": 5403, "train_speed(iter/s)": 0.078308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 312.1875, "completions/min_length": 259.0, "epoch": 4.469809760132341, "grad_norm": 0.20849263668060303, "kl": 0.06536865234375, "learning_rate": 3.3868271325180946e-07, "loss": 0.0006535500288009644, "memory(GiB)": 38.13, "reward": 0.5191291570663452, "reward_std": 0.041866734623909, "rewards/VisualizationJSONCombinedORM/mean": 0.5191291570663452, "rewards/VisualizationJSONCombinedORM/std": 0.08372347801923752, "step": 5404, "train_speed(iter/s)": 0.078295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 312.0, "completions/min_length": 250.0, "epoch": 4.470636889991729, "grad_norm": 0.17601992189884186, "kl": 0.05438232421875, "learning_rate": 3.3763885297822153e-07, "loss": 0.0005427896976470947, "memory(GiB)": 38.13, "reward": 0.6965210437774658, "reward_std": 0.0640026181936264, "rewards/VisualizationJSONCombinedORM/mean": 0.6965210437774658, "rewards/VisualizationJSONCombinedORM/std": 0.09139162302017212, "step": 5405, "train_speed(iter/s)": 0.078282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 318.125, "completions/min_length": 239.0, "epoch": 4.471464019851116, "grad_norm": 0.18458078801631927, "kl": 0.1312255859375, "learning_rate": 3.3659654762389917e-07, "loss": 0.0013119075447320938, "memory(GiB)": 38.13, "reward": 0.5138651132583618, "reward_std": 0.10757695138454437, "rewards/VisualizationJSONCombinedORM/mean": 0.5138651132583618, "rewards/VisualizationJSONCombinedORM/std": 0.10929273068904877, "step": 5406, "train_speed(iter/s)": 0.078271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 306.0625, "completions/min_length": 273.0, "epoch": 4.472291149710505, "grad_norm": 0.21836039423942566, "kl": 0.0889892578125, "learning_rate": 3.355557975364587e-07, "loss": 0.0008908407762646675, "memory(GiB)": 38.13, "reward": 0.660438597202301, "reward_std": 0.052929677069187164, "rewards/VisualizationJSONCombinedORM/mean": 0.660438597202301, "rewards/VisualizationJSONCombinedORM/std": 0.11933441460132599, "step": 5407, "train_speed(iter/s)": 0.07826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 314.5, "completions/min_length": 235.0, "epoch": 4.473118279569892, "grad_norm": 0.29201000928878784, "kl": 0.05755615234375, "learning_rate": 3.3451660306299317e-07, "loss": 0.0005751773715019226, "memory(GiB)": 38.13, "reward": 0.603468120098114, "reward_std": 0.0706147849559784, "rewards/VisualizationJSONCombinedORM/mean": 0.603468120098114, "rewards/VisualizationJSONCombinedORM/std": 0.210265651345253, "step": 5408, "train_speed(iter/s)": 0.078244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 315.25, "completions/min_length": 261.0, "epoch": 4.47394540942928, "grad_norm": 0.1805313229560852, "kl": 0.0460205078125, "learning_rate": 3.3347896455007965e-07, "loss": 0.0004594549536705017, "memory(GiB)": 38.13, "reward": 0.5099266767501831, "reward_std": 0.04029332101345062, "rewards/VisualizationJSONCombinedORM/mean": 0.5099266767501831, "rewards/VisualizationJSONCombinedORM/std": 0.06221950426697731, "step": 5409, "train_speed(iter/s)": 0.078227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 326.6875, "completions/min_length": 261.0, "epoch": 4.474772539288669, "grad_norm": 0.2001909762620926, "kl": 0.1142578125, "learning_rate": 3.324428823437753e-07, "loss": 0.0011425912380218506, "memory(GiB)": 38.13, "reward": 0.5871041417121887, "reward_std": 0.07898016273975372, "rewards/VisualizationJSONCombinedORM/mean": 0.5871041417121887, "rewards/VisualizationJSONCombinedORM/std": 0.18950983881950378, "step": 5410, "train_speed(iter/s)": 0.078207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 293.8125, "completions/min_length": 225.0, "epoch": 4.475599669148056, "grad_norm": 0.23546355962753296, "kl": 0.060302734375, "learning_rate": 3.3140835678961925e-07, "loss": 0.0006036385893821716, "memory(GiB)": 38.13, "reward": 0.6762239933013916, "reward_std": 0.05743129551410675, "rewards/VisualizationJSONCombinedORM/mean": 0.6762239933013916, "rewards/VisualizationJSONCombinedORM/std": 0.07285840809345245, "step": 5411, "train_speed(iter/s)": 0.078194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 288.375, "completions/min_length": 236.0, "epoch": 4.476426799007444, "grad_norm": 0.23486103117465973, "kl": 0.05255126953125, "learning_rate": 3.303753882326277e-07, "loss": 0.0005247872322797775, "memory(GiB)": 38.13, "reward": 0.4747845232486725, "reward_std": 0.06568937003612518, "rewards/VisualizationJSONCombinedORM/mean": 0.4747845232486725, "rewards/VisualizationJSONCombinedORM/std": 0.1462189108133316, "step": 5412, "train_speed(iter/s)": 0.078182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 348.0, "completions/min_length": 237.0, "epoch": 4.4772539288668325, "grad_norm": 0.20761315524578094, "kl": 0.06976318359375, "learning_rate": 3.293439770173046e-07, "loss": 0.0006975587457418442, "memory(GiB)": 38.13, "reward": 0.5713033080101013, "reward_std": 0.04499698430299759, "rewards/VisualizationJSONCombinedORM/mean": 0.5713033080101013, "rewards/VisualizationJSONCombinedORM/std": 0.2197026014328003, "step": 5413, "train_speed(iter/s)": 0.078167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 326.8125, "completions/min_length": 237.0, "epoch": 4.47808105872622, "grad_norm": 0.18023458123207092, "kl": 0.062255859375, "learning_rate": 3.283141234876275e-07, "loss": 0.0006221532821655273, "memory(GiB)": 38.13, "reward": 0.6686723828315735, "reward_std": 0.05225205048918724, "rewards/VisualizationJSONCombinedORM/mean": 0.6686723828315735, "rewards/VisualizationJSONCombinedORM/std": 0.0661405548453331, "step": 5414, "train_speed(iter/s)": 0.078155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 311.0625, "completions/min_length": 251.0, "epoch": 4.478908188585608, "grad_norm": 0.45326879620552063, "kl": 0.1158447265625, "learning_rate": 3.272858279870583e-07, "loss": 0.001156143844127655, "memory(GiB)": 38.13, "reward": 0.3370455801486969, "reward_std": 0.037507735192775726, "rewards/VisualizationJSONCombinedORM/mean": 0.3370455801486969, "rewards/VisualizationJSONCombinedORM/std": 0.1503012776374817, "step": 5415, "train_speed(iter/s)": 0.078144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 320.5, "completions/min_length": 253.0, "epoch": 4.4797353184449955, "grad_norm": 0.1802968829870224, "kl": 0.030181884765625, "learning_rate": 3.262590908585378e-07, "loss": 0.00030131638050079346, "memory(GiB)": 38.13, "reward": 0.5541001558303833, "reward_std": 0.03809241205453873, "rewards/VisualizationJSONCombinedORM/mean": 0.5541001558303833, "rewards/VisualizationJSONCombinedORM/std": 0.1535445898771286, "step": 5416, "train_speed(iter/s)": 0.078129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 295.875, "completions/min_length": 241.0, "epoch": 4.480562448304384, "grad_norm": 0.21723410487174988, "kl": 0.11944580078125, "learning_rate": 3.2523391244448923e-07, "loss": 0.0011916439980268478, "memory(GiB)": 38.13, "reward": 0.4506461024284363, "reward_std": 0.05295554921030998, "rewards/VisualizationJSONCombinedORM/mean": 0.4506461024284363, "rewards/VisualizationJSONCombinedORM/std": 0.2612293064594269, "step": 5417, "train_speed(iter/s)": 0.078113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/mean_length": 283.9375, "completions/min_length": 225.0, "epoch": 4.481389578163772, "grad_norm": 0.20563441514968872, "kl": 0.04925537109375, "learning_rate": 3.2421029308681296e-07, "loss": 0.0004931539297103882, "memory(GiB)": 38.13, "reward": 0.5741821527481079, "reward_std": 0.07395502924919128, "rewards/VisualizationJSONCombinedORM/mean": 0.5741821527481079, "rewards/VisualizationJSONCombinedORM/std": 0.12202435731887817, "step": 5418, "train_speed(iter/s)": 0.078101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 343.125, "completions/min_length": 246.0, "epoch": 4.482216708023159, "grad_norm": 0.19645653665065765, "kl": 0.040985107421875, "learning_rate": 3.231882331268904e-07, "loss": 0.00040966644883155823, "memory(GiB)": 38.13, "reward": 0.6811164617538452, "reward_std": 0.0671609565615654, "rewards/VisualizationJSONCombinedORM/mean": 0.6811164617538452, "rewards/VisualizationJSONCombinedORM/std": 0.10972867906093597, "step": 5419, "train_speed(iter/s)": 0.07808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 308.0625, "completions/min_length": 235.0, "epoch": 4.483043837882548, "grad_norm": 0.16626207530498505, "kl": 0.1226806640625, "learning_rate": 3.22167732905585e-07, "loss": 0.0012267008423805237, "memory(GiB)": 38.13, "reward": 0.4701698422431946, "reward_std": 0.07282106578350067, "rewards/VisualizationJSONCombinedORM/mean": 0.4701698422431946, "rewards/VisualizationJSONCombinedORM/std": 0.07324282824993134, "step": 5420, "train_speed(iter/s)": 0.078065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 334.875, "completions/min_length": 253.0, "epoch": 4.483870967741936, "grad_norm": 0.2009957879781723, "kl": 0.0458984375, "learning_rate": 3.2114879276323783e-07, "loss": 0.00045893527567386627, "memory(GiB)": 38.13, "reward": 0.6394999027252197, "reward_std": 0.07761191576719284, "rewards/VisualizationJSONCombinedORM/mean": 0.6394999027252197, "rewards/VisualizationJSONCombinedORM/std": 0.07541736960411072, "step": 5421, "train_speed(iter/s)": 0.078052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 312.6875, "completions/min_length": 217.0, "epoch": 4.484698097601323, "grad_norm": 0.15268348157405853, "kl": 0.05615234375, "learning_rate": 3.2013141303966823e-07, "loss": 0.0005619227886199951, "memory(GiB)": 38.13, "reward": 0.76315838098526, "reward_std": 0.16484200954437256, "rewards/VisualizationJSONCombinedORM/mean": 0.76315838098526, "rewards/VisualizationJSONCombinedORM/std": 0.23229408264160156, "step": 5422, "train_speed(iter/s)": 0.078036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 300.5625, "completions/min_length": 226.0, "epoch": 4.485525227460712, "grad_norm": 0.22808688879013062, "kl": 0.03424072265625, "learning_rate": 3.1911559407418025e-07, "loss": 0.0003428161144256592, "memory(GiB)": 38.13, "reward": 0.5073162317276001, "reward_std": 0.05865573137998581, "rewards/VisualizationJSONCombinedORM/mean": 0.5073162317276001, "rewards/VisualizationJSONCombinedORM/std": 0.14072132110595703, "step": 5423, "train_speed(iter/s)": 0.078022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 304.5625, "completions/min_length": 259.0, "epoch": 4.486352357320099, "grad_norm": 0.22006630897521973, "kl": 0.142822265625, "learning_rate": 3.181013362055518e-07, "loss": 0.0014290884137153625, "memory(GiB)": 38.13, "reward": 0.4590626358985901, "reward_std": 0.04362247884273529, "rewards/VisualizationJSONCombinedORM/mean": 0.4590626358985901, "rewards/VisualizationJSONCombinedORM/std": 0.27963870763778687, "step": 5424, "train_speed(iter/s)": 0.078012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 294.0625, "completions/min_length": 229.0, "epoch": 4.487179487179487, "grad_norm": 0.18237072229385376, "kl": 0.079345703125, "learning_rate": 3.170886397720435e-07, "loss": 0.0007938221096992493, "memory(GiB)": 38.13, "reward": 0.7571544051170349, "reward_std": 0.08271558582782745, "rewards/VisualizationJSONCombinedORM/mean": 0.7571544051170349, "rewards/VisualizationJSONCombinedORM/std": 0.10509029030799866, "step": 5425, "train_speed(iter/s)": 0.077991 }, { "epoch": 4.487179487179487, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 363.9166666666667, "eval_completions/mean_length": 309.0729166666667, "eval_completions/min_length": 263.0833333333333, "eval_kl": 0.07872517903645833, "eval_loss": 0.0007907543331384659, "eval_reward": 0.4529078919440508, "eval_reward_std": 0.04837121919263154, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4529078919440508, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04837121960008517, "eval_runtime": 311.3012, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 5425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 277.5, "completions/min_length": 240.0, "epoch": 4.4880066170388755, "grad_norm": 0.27526313066482544, "kl": 0.05279541015625, "learning_rate": 3.160775051113951e-07, "loss": 0.0005279034376144409, "memory(GiB)": 38.13, "reward": 0.7675086259841919, "reward_std": 0.06870848685503006, "rewards/VisualizationJSONCombinedORM/mean": 0.7675086259841919, "rewards/VisualizationJSONCombinedORM/std": 0.06947732716798782, "step": 5426, "train_speed(iter/s)": 0.077637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 321.125, "completions/min_length": 228.0, "epoch": 4.488833746898263, "grad_norm": 0.16761073470115662, "kl": 0.053955078125, "learning_rate": 3.150679325608241e-07, "loss": 0.000538703054189682, "memory(GiB)": 38.13, "reward": 0.8137392401695251, "reward_std": 0.03578059375286102, "rewards/VisualizationJSONCombinedORM/mean": 0.8137392401695251, "rewards/VisualizationJSONCombinedORM/std": 0.058221131563186646, "step": 5427, "train_speed(iter/s)": 0.077619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 256.8125, "completions/min_length": 218.0, "epoch": 4.489660876757651, "grad_norm": 0.16995635628700256, "kl": 0.097412109375, "learning_rate": 3.1405992245702624e-07, "loss": 0.0009745694696903229, "memory(GiB)": 38.13, "reward": 0.46354854106903076, "reward_std": 0.05855240672826767, "rewards/VisualizationJSONCombinedORM/mean": 0.46354854106903076, "rewards/VisualizationJSONCombinedORM/std": 0.06276210397481918, "step": 5428, "train_speed(iter/s)": 0.077604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 305.625, "completions/min_length": 265.0, "epoch": 4.4904880066170385, "grad_norm": 0.19739790260791779, "kl": 0.0943603515625, "learning_rate": 3.130534751361808e-07, "loss": 0.000943697988986969, "memory(GiB)": 38.13, "reward": 0.31578829884529114, "reward_std": 0.026792805641889572, "rewards/VisualizationJSONCombinedORM/mean": 0.31578829884529114, "rewards/VisualizationJSONCombinedORM/std": 0.08916978538036346, "step": 5429, "train_speed(iter/s)": 0.077588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 336.75, "completions/min_length": 264.0, "epoch": 4.491315136476427, "grad_norm": 0.18985135853290558, "kl": 0.1334228515625, "learning_rate": 3.120485909339399e-07, "loss": 0.0013338718563318253, "memory(GiB)": 38.13, "reward": 0.4977602958679199, "reward_std": 0.06843239068984985, "rewards/VisualizationJSONCombinedORM/mean": 0.4977602958679199, "rewards/VisualizationJSONCombinedORM/std": 0.3121030330657959, "step": 5430, "train_speed(iter/s)": 0.077571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 319.375, "completions/min_length": 262.0, "epoch": 4.492142266335815, "grad_norm": 0.23728355765342712, "kl": 0.03887939453125, "learning_rate": 3.110452701854383e-07, "loss": 0.00038833916187286377, "memory(GiB)": 38.13, "reward": 0.6105309724807739, "reward_std": 0.04990380257368088, "rewards/VisualizationJSONCombinedORM/mean": 0.6105309724807739, "rewards/VisualizationJSONCombinedORM/std": 0.18733566999435425, "step": 5431, "train_speed(iter/s)": 0.07756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 337.4375, "completions/min_length": 266.0, "epoch": 4.492969396195202, "grad_norm": 0.18491432070732117, "kl": 0.06207275390625, "learning_rate": 3.100435132252877e-07, "loss": 0.0006204582750797272, "memory(GiB)": 38.13, "reward": 0.5872284173965454, "reward_std": 0.06404119729995728, "rewards/VisualizationJSONCombinedORM/mean": 0.5872284173965454, "rewards/VisualizationJSONCombinedORM/std": 0.12908223271369934, "step": 5432, "train_speed(iter/s)": 0.077542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 315.625, "completions/min_length": 250.0, "epoch": 4.493796526054591, "grad_norm": 0.20040971040725708, "kl": 0.26458740234375, "learning_rate": 3.0904332038757977e-07, "loss": 0.0026368126273155212, "memory(GiB)": 38.13, "reward": 0.502685546875, "reward_std": 0.08540455996990204, "rewards/VisualizationJSONCombinedORM/mean": 0.502685546875, "rewards/VisualizationJSONCombinedORM/std": 0.18321013450622559, "step": 5433, "train_speed(iter/s)": 0.077524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 302.3125, "completions/min_length": 261.0, "epoch": 4.494623655913978, "grad_norm": 0.16329781711101532, "kl": 0.02471923828125, "learning_rate": 3.0804469200588214e-07, "loss": 0.0002473890781402588, "memory(GiB)": 38.13, "reward": 0.4912008047103882, "reward_std": 0.049185752868652344, "rewards/VisualizationJSONCombinedORM/mean": 0.4912008047103882, "rewards/VisualizationJSONCombinedORM/std": 0.25907620787620544, "step": 5434, "train_speed(iter/s)": 0.077511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 353.0, "completions/min_length": 281.0, "epoch": 4.495450785773366, "grad_norm": 0.24880467355251312, "kl": 0.0889892578125, "learning_rate": 3.070476284132429e-07, "loss": 0.0008895210921764374, "memory(GiB)": 38.13, "reward": 0.5583651065826416, "reward_std": 0.09420302510261536, "rewards/VisualizationJSONCombinedORM/mean": 0.5583651065826416, "rewards/VisualizationJSONCombinedORM/std": 0.1520017385482788, "step": 5435, "train_speed(iter/s)": 0.077494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 323.125, "completions/min_length": 285.0, "epoch": 4.496277915632755, "grad_norm": 0.25117069482803345, "kl": 0.0526123046875, "learning_rate": 3.0605212994218647e-07, "loss": 0.0005272757261991501, "memory(GiB)": 38.13, "reward": 0.6501270532608032, "reward_std": 0.06259393692016602, "rewards/VisualizationJSONCombinedORM/mean": 0.6501270532608032, "rewards/VisualizationJSONCombinedORM/std": 0.09002231806516647, "step": 5436, "train_speed(iter/s)": 0.077481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 283.875, "completions/min_length": 234.0, "epoch": 4.497105045492142, "grad_norm": 0.18648409843444824, "kl": 0.07696533203125, "learning_rate": 3.0505819692471797e-07, "loss": 0.0007694531232118607, "memory(GiB)": 38.13, "reward": 0.729077935218811, "reward_std": 0.07703761756420135, "rewards/VisualizationJSONCombinedORM/mean": 0.729077935218811, "rewards/VisualizationJSONCombinedORM/std": 0.1131611093878746, "step": 5437, "train_speed(iter/s)": 0.077468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 315.0625, "completions/min_length": 245.0, "epoch": 4.49793217535153, "grad_norm": 0.16963531076908112, "kl": 0.05889892578125, "learning_rate": 3.0406582969231656e-07, "loss": 0.0005886554718017578, "memory(GiB)": 38.13, "reward": 0.5123496055603027, "reward_std": 0.03298996761441231, "rewards/VisualizationJSONCombinedORM/mean": 0.5123496055603027, "rewards/VisualizationJSONCombinedORM/std": 0.04243484511971474, "step": 5438, "train_speed(iter/s)": 0.077454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 323.625, "completions/min_length": 248.0, "epoch": 4.4987593052109185, "grad_norm": 0.16574804484844208, "kl": 0.0462646484375, "learning_rate": 3.030750285759432e-07, "loss": 0.00046218931674957275, "memory(GiB)": 38.13, "reward": 0.3600866198539734, "reward_std": 0.02717534825205803, "rewards/VisualizationJSONCombinedORM/mean": 0.3600866198539734, "rewards/VisualizationJSONCombinedORM/std": 0.02988598495721817, "step": 5439, "train_speed(iter/s)": 0.077439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 298.75, "completions/min_length": 247.0, "epoch": 4.499586435070306, "grad_norm": 0.16467945277690887, "kl": 0.03472900390625, "learning_rate": 3.02085793906034e-07, "loss": 0.00034762918949127197, "memory(GiB)": 38.13, "reward": 0.48265284299850464, "reward_std": 0.034474700689315796, "rewards/VisualizationJSONCombinedORM/mean": 0.48265284299850464, "rewards/VisualizationJSONCombinedORM/std": 0.2275390326976776, "step": 5440, "train_speed(iter/s)": 0.077423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 310.75, "completions/min_length": 241.0, "epoch": 4.500413564929694, "grad_norm": 0.2944883108139038, "kl": 0.08251953125, "learning_rate": 3.010981260125029e-07, "loss": 0.0008240751922130585, "memory(GiB)": 38.13, "reward": 0.4020000994205475, "reward_std": 0.038802895694971085, "rewards/VisualizationJSONCombinedORM/mean": 0.4020000994205475, "rewards/VisualizationJSONCombinedORM/std": 0.0897543728351593, "step": 5441, "train_speed(iter/s)": 0.077408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 323.375, "completions/min_length": 253.0, "epoch": 4.5012406947890815, "grad_norm": 0.18603923916816711, "kl": 0.031219482421875, "learning_rate": 3.0011202522474245e-07, "loss": 0.0003132075071334839, "memory(GiB)": 38.13, "reward": 0.6266121864318848, "reward_std": 0.05806712806224823, "rewards/VisualizationJSONCombinedORM/mean": 0.6266121864318848, "rewards/VisualizationJSONCombinedORM/std": 0.10325375199317932, "step": 5442, "train_speed(iter/s)": 0.077392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 330.1875, "completions/min_length": 265.0, "epoch": 4.50206782464847, "grad_norm": 0.19211922585964203, "kl": 0.073974609375, "learning_rate": 2.9912749187162195e-07, "loss": 0.0007410570979118347, "memory(GiB)": 38.13, "reward": 0.6290093660354614, "reward_std": 0.061247482895851135, "rewards/VisualizationJSONCombinedORM/mean": 0.6290093660354614, "rewards/VisualizationJSONCombinedORM/std": 0.06602019816637039, "step": 5443, "train_speed(iter/s)": 0.077379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 328.8125, "completions/min_length": 251.0, "epoch": 4.502894954507858, "grad_norm": 0.2775888741016388, "kl": 0.0860595703125, "learning_rate": 2.9814452628148636e-07, "loss": 0.0008596405386924744, "memory(GiB)": 38.13, "reward": 0.7195985317230225, "reward_std": 0.09107516705989838, "rewards/VisualizationJSONCombinedORM/mean": 0.7195985317230225, "rewards/VisualizationJSONCombinedORM/std": 0.0913936123251915, "step": 5444, "train_speed(iter/s)": 0.077361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 280.1875, "completions/min_length": 227.0, "epoch": 4.503722084367245, "grad_norm": 0.14497685432434082, "kl": 0.04638671875, "learning_rate": 2.9716312878216194e-07, "loss": 0.0004619881510734558, "memory(GiB)": 38.13, "reward": 0.6109260320663452, "reward_std": 0.027925513684749603, "rewards/VisualizationJSONCombinedORM/mean": 0.6109260320663452, "rewards/VisualizationJSONCombinedORM/std": 0.07478052377700806, "step": 5445, "train_speed(iter/s)": 0.077347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 307.3125, "completions/min_length": 239.0, "epoch": 4.504549214226634, "grad_norm": 0.24545127153396606, "kl": 0.152099609375, "learning_rate": 2.961832997009473e-07, "loss": 0.0015222281217575073, "memory(GiB)": 38.13, "reward": 0.6372890472412109, "reward_std": 0.029443470761179924, "rewards/VisualizationJSONCombinedORM/mean": 0.6372890472412109, "rewards/VisualizationJSONCombinedORM/std": 0.08787275850772858, "step": 5446, "train_speed(iter/s)": 0.077332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 286.5, "completions/min_length": 225.0, "epoch": 4.505376344086022, "grad_norm": 0.1833561658859253, "kl": 0.04901123046875, "learning_rate": 2.9520503936462054e-07, "loss": 0.0004891082644462585, "memory(GiB)": 38.13, "reward": 0.7273392081260681, "reward_std": 0.04926231503486633, "rewards/VisualizationJSONCombinedORM/mean": 0.7273392081260681, "rewards/VisualizationJSONCombinedORM/std": 0.07141974568367004, "step": 5447, "train_speed(iter/s)": 0.077317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/mean_length": 314.8125, "completions/min_length": 217.0, "epoch": 4.506203473945409, "grad_norm": 0.20769080519676208, "kl": 0.19970703125, "learning_rate": 2.942283480994362e-07, "loss": 0.001996864564716816, "memory(GiB)": 38.13, "reward": 0.6210399866104126, "reward_std": 0.0638008862733841, "rewards/VisualizationJSONCombinedORM/mean": 0.6210399866104126, "rewards/VisualizationJSONCombinedORM/std": 0.1404181718826294, "step": 5448, "train_speed(iter/s)": 0.077299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 302.8125, "completions/min_length": 217.0, "epoch": 4.507030603804798, "grad_norm": 0.22644047439098358, "kl": 0.07666015625, "learning_rate": 2.932532262311261e-07, "loss": 0.0007646083831787109, "memory(GiB)": 38.13, "reward": 0.2643474042415619, "reward_std": 0.033941518515348434, "rewards/VisualizationJSONCombinedORM/mean": 0.2643474042415619, "rewards/VisualizationJSONCombinedORM/std": 0.08151652663946152, "step": 5449, "train_speed(iter/s)": 0.077289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 332.1875, "completions/min_length": 225.0, "epoch": 4.507857733664185, "grad_norm": 0.15220753848552704, "kl": 0.0540771484375, "learning_rate": 2.9227967408489653e-07, "loss": 0.0005417987704277039, "memory(GiB)": 38.13, "reward": 0.27592936158180237, "reward_std": 0.015471098944544792, "rewards/VisualizationJSONCombinedORM/mean": 0.27592936158180237, "rewards/VisualizationJSONCombinedORM/std": 0.1478196233510971, "step": 5450, "train_speed(iter/s)": 0.077272 }, { "epoch": 4.507857733664185, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 366.875, "eval_completions/mean_length": 306.9739583333333, "eval_completions/min_length": 259.0, "eval_kl": 0.08369954427083333, "eval_loss": 0.0008344787056557834, "eval_reward": 0.4529715571552515, "eval_reward_std": 0.054362548127149544, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4529715571552515, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05436254886444658, "eval_runtime": 313.2816, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 5450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 312.625, "completions/min_length": 235.0, "epoch": 4.508684863523573, "grad_norm": 0.2412988543510437, "kl": 0.125732421875, "learning_rate": 2.9130769198543185e-07, "loss": 0.0012546032667160034, "memory(GiB)": 38.13, "reward": 0.43302738666534424, "reward_std": 0.02122325636446476, "rewards/VisualizationJSONCombinedORM/mean": 0.43302738666534424, "rewards/VisualizationJSONCombinedORM/std": 0.2540457248687744, "step": 5451, "train_speed(iter/s)": 0.076918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 329.125, "completions/min_length": 254.0, "epoch": 4.5095119933829615, "grad_norm": 0.2117965817451477, "kl": 0.1473388671875, "learning_rate": 2.903372802568938e-07, "loss": 0.0014730915427207947, "memory(GiB)": 38.13, "reward": 0.5576625466346741, "reward_std": 0.06308680772781372, "rewards/VisualizationJSONCombinedORM/mean": 0.5576625466346741, "rewards/VisualizationJSONCombinedORM/std": 0.23864473402500153, "step": 5452, "train_speed(iter/s)": 0.076902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 310.8125, "completions/min_length": 258.0, "epoch": 4.510339123242349, "grad_norm": 0.1700630784034729, "kl": 0.03472900390625, "learning_rate": 2.893684392229185e-07, "loss": 0.00034700334072113037, "memory(GiB)": 38.13, "reward": 0.5881195068359375, "reward_std": 0.019874300807714462, "rewards/VisualizationJSONCombinedORM/mean": 0.5881195068359375, "rewards/VisualizationJSONCombinedORM/std": 0.15873154997825623, "step": 5453, "train_speed(iter/s)": 0.076888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 299.9375, "completions/min_length": 261.0, "epoch": 4.511166253101737, "grad_norm": 0.1558041125535965, "kl": 0.04400634765625, "learning_rate": 2.884011692066191e-07, "loss": 0.00044113118201494217, "memory(GiB)": 38.13, "reward": 0.4672030210494995, "reward_std": 0.02642299421131611, "rewards/VisualizationJSONCombinedORM/mean": 0.4672030210494995, "rewards/VisualizationJSONCombinedORM/std": 0.02554686740040779, "step": 5454, "train_speed(iter/s)": 0.076876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 308.75, "completions/min_length": 236.0, "epoch": 4.5119933829611245, "grad_norm": 0.17459522187709808, "kl": 0.05267333984375, "learning_rate": 2.874354705305843e-07, "loss": 0.000527847558259964, "memory(GiB)": 38.13, "reward": 0.6302142143249512, "reward_std": 0.05894378572702408, "rewards/VisualizationJSONCombinedORM/mean": 0.6302142143249512, "rewards/VisualizationJSONCombinedORM/std": 0.2288220226764679, "step": 5455, "train_speed(iter/s)": 0.07686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/mean_length": 327.4375, "completions/min_length": 248.0, "epoch": 4.512820512820513, "grad_norm": 0.15735061466693878, "kl": 0.11798095703125, "learning_rate": 2.864713435168803e-07, "loss": 0.0011788606643676758, "memory(GiB)": 38.13, "reward": 0.7876018285751343, "reward_std": 0.05740652233362198, "rewards/VisualizationJSONCombinedORM/mean": 0.7876018285751343, "rewards/VisualizationJSONCombinedORM/std": 0.06592757999897003, "step": 5456, "train_speed(iter/s)": 0.076849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 334.75, "completions/min_length": 290.0, "epoch": 4.513647642679901, "grad_norm": 0.156574085354805, "kl": 0.08770751953125, "learning_rate": 2.8550878848704666e-07, "loss": 0.0008770152926445007, "memory(GiB)": 38.13, "reward": 0.6060711145401001, "reward_std": 0.042562153190374374, "rewards/VisualizationJSONCombinedORM/mean": 0.6060711145401001, "rewards/VisualizationJSONCombinedORM/std": 0.09136775135993958, "step": 5457, "train_speed(iter/s)": 0.076833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 335.0, "completions/min_length": 262.0, "epoch": 4.514474772539288, "grad_norm": 0.17712002992630005, "kl": 0.060791015625, "learning_rate": 2.8454780576209986e-07, "loss": 0.0006061941385269165, "memory(GiB)": 38.13, "reward": 0.594286322593689, "reward_std": 0.02184401825070381, "rewards/VisualizationJSONCombinedORM/mean": 0.594286322593689, "rewards/VisualizationJSONCombinedORM/std": 0.27970007061958313, "step": 5458, "train_speed(iter/s)": 0.076824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 311.25, "completions/min_length": 264.0, "epoch": 4.515301902398677, "grad_norm": 0.24614883959293365, "kl": 0.0428466796875, "learning_rate": 2.8358839566253346e-07, "loss": 0.0004297494888305664, "memory(GiB)": 38.13, "reward": 0.617264986038208, "reward_std": 0.05075431615114212, "rewards/VisualizationJSONCombinedORM/mean": 0.617264986038208, "rewards/VisualizationJSONCombinedORM/std": 0.10509118437767029, "step": 5459, "train_speed(iter/s)": 0.076808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 303.0, "completions/min_length": 248.0, "epoch": 4.516129032258064, "grad_norm": 0.19013744592666626, "kl": 0.06512451171875, "learning_rate": 2.826305585083144e-07, "loss": 0.0006508566439151764, "memory(GiB)": 38.13, "reward": 0.7031391263008118, "reward_std": 0.0786128118634224, "rewards/VisualizationJSONCombinedORM/mean": 0.7031391263008118, "rewards/VisualizationJSONCombinedORM/std": 0.08195934444665909, "step": 5460, "train_speed(iter/s)": 0.076794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 279.875, "completions/min_length": 232.0, "epoch": 4.516956162117452, "grad_norm": 0.22879670560359955, "kl": 0.06365966796875, "learning_rate": 2.8167429461888496e-07, "loss": 0.0006366521120071411, "memory(GiB)": 38.13, "reward": 0.7585881352424622, "reward_std": 0.0659792423248291, "rewards/VisualizationJSONCombinedORM/mean": 0.7585881352424622, "rewards/VisualizationJSONCombinedORM/std": 0.0665869414806366, "step": 5461, "train_speed(iter/s)": 0.07678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 278.125, "completions/min_length": 231.0, "epoch": 4.517783291976841, "grad_norm": 0.220554381608963, "kl": 0.11572265625, "learning_rate": 2.80719604313166e-07, "loss": 0.0011601299047470093, "memory(GiB)": 38.13, "reward": 0.6727988123893738, "reward_std": 0.07809747755527496, "rewards/VisualizationJSONCombinedORM/mean": 0.6727988123893738, "rewards/VisualizationJSONCombinedORM/std": 0.10491127520799637, "step": 5462, "train_speed(iter/s)": 0.076761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 316.9375, "completions/min_length": 261.0, "epoch": 4.518610421836228, "grad_norm": 0.19822075963020325, "kl": 0.08587646484375, "learning_rate": 2.7976648790954963e-07, "loss": 0.0008573047816753387, "memory(GiB)": 38.13, "reward": 0.5384103655815125, "reward_std": 0.0675884559750557, "rewards/VisualizationJSONCombinedORM/mean": 0.5384103655815125, "rewards/VisualizationJSONCombinedORM/std": 0.08267000317573547, "step": 5463, "train_speed(iter/s)": 0.076746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 294.4375, "completions/min_length": 248.0, "epoch": 4.519437551695616, "grad_norm": 0.22198934853076935, "kl": 0.0755615234375, "learning_rate": 2.788149457259043e-07, "loss": 0.0007550790905952454, "memory(GiB)": 38.13, "reward": 0.5304455161094666, "reward_std": 0.04690771549940109, "rewards/VisualizationJSONCombinedORM/mean": 0.5304455161094666, "rewards/VisualizationJSONCombinedORM/std": 0.26983803510665894, "step": 5464, "train_speed(iter/s)": 0.076729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 332.8125, "completions/min_length": 238.0, "epoch": 4.5202646815550045, "grad_norm": 0.19102491438388824, "kl": 0.1365966796875, "learning_rate": 2.778649780795739e-07, "loss": 0.0013657696545124054, "memory(GiB)": 38.13, "reward": 0.6150861382484436, "reward_std": 0.07020573318004608, "rewards/VisualizationJSONCombinedORM/mean": 0.6150861382484436, "rewards/VisualizationJSONCombinedORM/std": 0.22477376461029053, "step": 5465, "train_speed(iter/s)": 0.076718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 319.0, "completions/min_length": 245.0, "epoch": 4.521091811414392, "grad_norm": 0.19822679460048676, "kl": 0.115966796875, "learning_rate": 2.76916585287379e-07, "loss": 0.0011594295501708984, "memory(GiB)": 38.13, "reward": 0.4764785170555115, "reward_std": 0.06864052265882492, "rewards/VisualizationJSONCombinedORM/mean": 0.4764785170555115, "rewards/VisualizationJSONCombinedORM/std": 0.06910547614097595, "step": 5466, "train_speed(iter/s)": 0.076702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 292.375, "completions/min_length": 227.0, "epoch": 4.52191894127378, "grad_norm": 0.21595771610736847, "kl": 0.06121826171875, "learning_rate": 2.7596976766560977e-07, "loss": 0.0006122253835201263, "memory(GiB)": 38.13, "reward": 0.5947651863098145, "reward_std": 0.031391166150569916, "rewards/VisualizationJSONCombinedORM/mean": 0.5947651863098145, "rewards/VisualizationJSONCombinedORM/std": 0.0716906413435936, "step": 5467, "train_speed(iter/s)": 0.076689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 292.0625, "completions/min_length": 230.0, "epoch": 4.522746071133168, "grad_norm": 0.1775786429643631, "kl": 0.0938720703125, "learning_rate": 2.750245255300371e-07, "loss": 0.0009395591914653778, "memory(GiB)": 38.13, "reward": 0.6913102865219116, "reward_std": 0.06033874303102493, "rewards/VisualizationJSONCombinedORM/mean": 0.6913102865219116, "rewards/VisualizationJSONCombinedORM/std": 0.10956905037164688, "step": 5468, "train_speed(iter/s)": 0.07668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 288.4375, "completions/min_length": 242.0, "epoch": 4.523573200992556, "grad_norm": 0.2541828155517578, "kl": 0.04278564453125, "learning_rate": 2.7408085919590265e-07, "loss": 0.0004282332956790924, "memory(GiB)": 38.13, "reward": 0.4267652630805969, "reward_std": 0.06144358962774277, "rewards/VisualizationJSONCombinedORM/mean": 0.4267652630805969, "rewards/VisualizationJSONCombinedORM/std": 0.06588239222764969, "step": 5469, "train_speed(iter/s)": 0.076671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 324.3125, "completions/min_length": 253.0, "epoch": 4.524400330851944, "grad_norm": 0.30066820979118347, "kl": 0.1138916015625, "learning_rate": 2.7313876897792304e-07, "loss": 0.00113750621676445, "memory(GiB)": 38.13, "reward": 0.5681911110877991, "reward_std": 0.06415142118930817, "rewards/VisualizationJSONCombinedORM/mean": 0.5681911110877991, "rewards/VisualizationJSONCombinedORM/std": 0.22775377333164215, "step": 5470, "train_speed(iter/s)": 0.07665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 295.3125, "completions/min_length": 229.0, "epoch": 4.525227460711331, "grad_norm": 0.20806632936000824, "kl": 0.07293701171875, "learning_rate": 2.7219825519029017e-07, "loss": 0.0007322356104850769, "memory(GiB)": 38.13, "reward": 0.6595327258110046, "reward_std": 0.049079529941082, "rewards/VisualizationJSONCombinedORM/mean": 0.6595327258110046, "rewards/VisualizationJSONCombinedORM/std": 0.10016249120235443, "step": 5471, "train_speed(iter/s)": 0.076637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 332.75, "completions/min_length": 247.0, "epoch": 4.52605459057072, "grad_norm": 0.17008019983768463, "kl": 0.0362548828125, "learning_rate": 2.712593181466711e-07, "loss": 0.0003636721521615982, "memory(GiB)": 38.13, "reward": 0.5762096643447876, "reward_std": 0.04195629805326462, "rewards/VisualizationJSONCombinedORM/mean": 0.5762096643447876, "rewards/VisualizationJSONCombinedORM/std": 0.0475987084209919, "step": 5472, "train_speed(iter/s)": 0.076624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 334.375, "completions/min_length": 234.0, "epoch": 4.526881720430108, "grad_norm": 0.2171286642551422, "kl": 0.08935546875, "learning_rate": 2.703219581602035e-07, "loss": 0.0008941888809204102, "memory(GiB)": 38.13, "reward": 0.3468121886253357, "reward_std": 0.03571662679314613, "rewards/VisualizationJSONCombinedORM/mean": 0.3468121886253357, "rewards/VisualizationJSONCombinedORM/std": 0.0892680287361145, "step": 5473, "train_speed(iter/s)": 0.076609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 338.3125, "completions/min_length": 262.0, "epoch": 4.527708850289495, "grad_norm": 0.2310227006673813, "kl": 0.0843505859375, "learning_rate": 2.6938617554350234e-07, "loss": 0.000844273716211319, "memory(GiB)": 38.13, "reward": 0.5543628931045532, "reward_std": 0.028071045875549316, "rewards/VisualizationJSONCombinedORM/mean": 0.5543628931045532, "rewards/VisualizationJSONCombinedORM/std": 0.20990270376205444, "step": 5474, "train_speed(iter/s)": 0.076594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 342.25, "completions/min_length": 258.0, "epoch": 4.528535980148884, "grad_norm": 0.18132992088794708, "kl": 0.028564453125, "learning_rate": 2.684519706086558e-07, "loss": 0.0002851709723472595, "memory(GiB)": 38.13, "reward": 0.728030800819397, "reward_std": 0.047160688787698746, "rewards/VisualizationJSONCombinedORM/mean": 0.728030800819397, "rewards/VisualizationJSONCombinedORM/std": 0.06459015607833862, "step": 5475, "train_speed(iter/s)": 0.076576 }, { "epoch": 4.528535980148884, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 379.8333333333333, "eval_completions/mean_length": 313.0677083333333, "eval_completions/min_length": 254.79166666666666, "eval_kl": 0.08425394694010417, "eval_loss": 0.0008586191688664258, "eval_reward": 0.44807523613174755, "eval_reward_std": 0.05065743182785809, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44807523613174755, "eval_rewards/VisualizationJSONCombinedORM/std": 0.050657432118896395, "eval_runtime": 320.3703, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 304.3125, "completions/min_length": 240.0, "epoch": 4.529363110008271, "grad_norm": 0.25574326515197754, "kl": 0.04046630859375, "learning_rate": 2.6751934366722575e-07, "loss": 0.00040576979517936707, "memory(GiB)": 38.13, "reward": 0.6636042594909668, "reward_std": 0.07494436204433441, "rewards/VisualizationJSONCombinedORM/mean": 0.6636042594909668, "rewards/VisualizationJSONCombinedORM/std": 0.09755731374025345, "step": 5476, "train_speed(iter/s)": 0.076219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 322.6875, "completions/min_length": 265.0, "epoch": 4.530190239867659, "grad_norm": 0.16699078679084778, "kl": 0.038818359375, "learning_rate": 2.6658829503024566e-07, "loss": 0.000388462096452713, "memory(GiB)": 38.13, "reward": 0.4074063301086426, "reward_std": 0.04574720561504364, "rewards/VisualizationJSONCombinedORM/mean": 0.4074063301086426, "rewards/VisualizationJSONCombinedORM/std": 0.07679806649684906, "step": 5477, "train_speed(iter/s)": 0.076208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 285.5, "completions/min_length": 222.0, "epoch": 4.5310173697270475, "grad_norm": 0.16353264451026917, "kl": 0.037353515625, "learning_rate": 2.6565882500822817e-07, "loss": 0.00037403404712677, "memory(GiB)": 38.13, "reward": 0.7473615407943726, "reward_std": 0.03873355686664581, "rewards/VisualizationJSONCombinedORM/mean": 0.7473615407943726, "rewards/VisualizationJSONCombinedORM/std": 0.07727581262588501, "step": 5478, "train_speed(iter/s)": 0.076194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 320.5, "completions/min_length": 265.0, "epoch": 4.531844499586435, "grad_norm": 0.18914996087551117, "kl": 0.05859375, "learning_rate": 2.6473093391115256e-07, "loss": 0.0005850456655025482, "memory(GiB)": 38.13, "reward": 0.6594562530517578, "reward_std": 0.041003528982400894, "rewards/VisualizationJSONCombinedORM/mean": 0.6594562530517578, "rewards/VisualizationJSONCombinedORM/std": 0.1152045875787735, "step": 5479, "train_speed(iter/s)": 0.076181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 290.375, "completions/min_length": 233.0, "epoch": 4.532671629445823, "grad_norm": 0.23151156306266785, "kl": 0.095703125, "learning_rate": 2.6380462204847633e-07, "loss": 0.0009556412696838379, "memory(GiB)": 38.13, "reward": 0.5195680856704712, "reward_std": 0.0893947035074234, "rewards/VisualizationJSONCombinedORM/mean": 0.5195680856704712, "rewards/VisualizationJSONCombinedORM/std": 0.09259622544050217, "step": 5480, "train_speed(iter/s)": 0.076169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 293.8125, "completions/min_length": 244.0, "epoch": 4.5334987593052105, "grad_norm": 0.1801082342863083, "kl": 0.04754638671875, "learning_rate": 2.628798897291285e-07, "loss": 0.0004761572927236557, "memory(GiB)": 38.13, "reward": 0.4868336319923401, "reward_std": 0.06292049586772919, "rewards/VisualizationJSONCombinedORM/mean": 0.4868336319923401, "rewards/VisualizationJSONCombinedORM/std": 0.06871999055147171, "step": 5481, "train_speed(iter/s)": 0.076157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 312.0, "completions/min_length": 244.0, "epoch": 4.534325889164599, "grad_norm": 0.16415339708328247, "kl": 0.04541015625, "learning_rate": 2.619567372615123e-07, "loss": 0.00045353174209594727, "memory(GiB)": 38.13, "reward": 0.6468327045440674, "reward_std": 0.0281454399228096, "rewards/VisualizationJSONCombinedORM/mean": 0.6468327045440674, "rewards/VisualizationJSONCombinedORM/std": 0.07969682663679123, "step": 5482, "train_speed(iter/s)": 0.076142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 330.75, "completions/min_length": 263.0, "epoch": 4.535153019023987, "grad_norm": 0.1917356699705124, "kl": 0.05810546875, "learning_rate": 2.61035164953502e-07, "loss": 0.0005815587937831879, "memory(GiB)": 38.13, "reward": 0.7198340892791748, "reward_std": 0.07177305221557617, "rewards/VisualizationJSONCombinedORM/mean": 0.7198340892791748, "rewards/VisualizationJSONCombinedORM/std": 0.08713264018297195, "step": 5483, "train_speed(iter/s)": 0.07613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 321.9375, "completions/min_length": 238.0, "epoch": 4.535980148883374, "grad_norm": 0.18147237598896027, "kl": 0.03594970703125, "learning_rate": 2.601151731124485e-07, "loss": 0.0003593862056732178, "memory(GiB)": 38.13, "reward": 0.7479323148727417, "reward_std": 0.07465974241495132, "rewards/VisualizationJSONCombinedORM/mean": 0.7479323148727417, "rewards/VisualizationJSONCombinedORM/std": 0.08022255450487137, "step": 5484, "train_speed(iter/s)": 0.076118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 289.5, "completions/min_length": 223.0, "epoch": 4.536807278742763, "grad_norm": 0.23439845442771912, "kl": 0.05206298828125, "learning_rate": 2.5919676204517073e-07, "loss": 0.0005198195576667786, "memory(GiB)": 38.13, "reward": 0.31258636713027954, "reward_std": 0.02904697135090828, "rewards/VisualizationJSONCombinedORM/mean": 0.31258636713027954, "rewards/VisualizationJSONCombinedORM/std": 0.04857455939054489, "step": 5485, "train_speed(iter/s)": 0.076105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 311.3125, "completions/min_length": 259.0, "epoch": 4.53763440860215, "grad_norm": 0.14116649329662323, "kl": 0.03985595703125, "learning_rate": 2.5827993205796487e-07, "loss": 0.00039696943713352084, "memory(GiB)": 38.13, "reward": 0.5269132256507874, "reward_std": 0.03128398582339287, "rewards/VisualizationJSONCombinedORM/mean": 0.5269132256507874, "rewards/VisualizationJSONCombinedORM/std": 0.2849138379096985, "step": 5486, "train_speed(iter/s)": 0.07609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 307.125, "completions/min_length": 259.0, "epoch": 4.538461538461538, "grad_norm": 0.21402327716350555, "kl": 0.038818359375, "learning_rate": 2.5736468345659736e-07, "loss": 0.00038880109786987305, "memory(GiB)": 38.13, "reward": 0.7104145288467407, "reward_std": 0.06747336685657501, "rewards/VisualizationJSONCombinedORM/mean": 0.7104145288467407, "rewards/VisualizationJSONCombinedORM/std": 0.1080017015337944, "step": 5487, "train_speed(iter/s)": 0.076078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 315.875, "completions/min_length": 227.0, "epoch": 4.539288668320927, "grad_norm": 0.1879482865333557, "kl": 0.06719970703125, "learning_rate": 2.564510165463091e-07, "loss": 0.0006738007068634033, "memory(GiB)": 38.13, "reward": 0.6218981742858887, "reward_std": 0.04875178262591362, "rewards/VisualizationJSONCombinedORM/mean": 0.6218981742858887, "rewards/VisualizationJSONCombinedORM/std": 0.15556980669498444, "step": 5488, "train_speed(iter/s)": 0.076067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 302.625, "completions/min_length": 239.0, "epoch": 4.540115798180314, "grad_norm": 0.22040894627571106, "kl": 0.15655517578125, "learning_rate": 2.555389316318102e-07, "loss": 0.0015651006251573563, "memory(GiB)": 38.13, "reward": 0.5456185936927795, "reward_std": 0.0240153931081295, "rewards/VisualizationJSONCombinedORM/mean": 0.5456185936927795, "rewards/VisualizationJSONCombinedORM/std": 0.2734712064266205, "step": 5489, "train_speed(iter/s)": 0.076052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 307.4375, "completions/min_length": 253.0, "epoch": 4.540942928039702, "grad_norm": 0.21901051700115204, "kl": 0.06884765625, "learning_rate": 2.546284290172862e-07, "loss": 0.0006873682141304016, "memory(GiB)": 38.13, "reward": 0.4598124027252197, "reward_std": 0.03942563757300377, "rewards/VisualizationJSONCombinedORM/mean": 0.4598124027252197, "rewards/VisualizationJSONCombinedORM/std": 0.2082814872264862, "step": 5490, "train_speed(iter/s)": 0.076042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 297.375, "completions/min_length": 248.0, "epoch": 4.5417700578990905, "grad_norm": 0.25402316451072693, "kl": 0.05810546875, "learning_rate": 2.537195090063943e-07, "loss": 0.0005814339965581894, "memory(GiB)": 38.13, "reward": 0.624756932258606, "reward_std": 0.06944844126701355, "rewards/VisualizationJSONCombinedORM/mean": 0.624756932258606, "rewards/VisualizationJSONCombinedORM/std": 0.14041948318481445, "step": 5491, "train_speed(iter/s)": 0.076032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 331.75, "completions/min_length": 259.0, "epoch": 4.542597187758478, "grad_norm": 0.16684919595718384, "kl": 0.07373046875, "learning_rate": 2.5281217190226414e-07, "loss": 0.0007380694150924683, "memory(GiB)": 38.13, "reward": 0.48193472623825073, "reward_std": 0.0494413785636425, "rewards/VisualizationJSONCombinedORM/mean": 0.48193472623825073, "rewards/VisualizationJSONCombinedORM/std": 0.16110771894454956, "step": 5492, "train_speed(iter/s)": 0.076012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 299.75, "completions/min_length": 244.0, "epoch": 4.543424317617866, "grad_norm": 0.17160455882549286, "kl": 0.05389404296875, "learning_rate": 2.5190641800749424e-07, "loss": 0.0005397498607635498, "memory(GiB)": 38.13, "reward": 0.39849188923835754, "reward_std": 0.03995262086391449, "rewards/VisualizationJSONCombinedORM/mean": 0.39849188923835754, "rewards/VisualizationJSONCombinedORM/std": 0.24737700819969177, "step": 5493, "train_speed(iter/s)": 0.076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 315.6875, "completions/min_length": 253.0, "epoch": 4.544251447477254, "grad_norm": 0.18016491830348969, "kl": 0.10552978515625, "learning_rate": 2.510022476241614e-07, "loss": 0.0010585002601146698, "memory(GiB)": 38.13, "reward": 0.6643559336662292, "reward_std": 0.053502827882766724, "rewards/VisualizationJSONCombinedORM/mean": 0.6643559336662292, "rewards/VisualizationJSONCombinedORM/std": 0.2200760543346405, "step": 5494, "train_speed(iter/s)": 0.075991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 322.0, "completions/min_length": 267.0, "epoch": 4.545078577336642, "grad_norm": 0.5457996129989624, "kl": 0.06634521484375, "learning_rate": 2.500996610538081e-07, "loss": 0.0006648115813732147, "memory(GiB)": 38.13, "reward": 0.7988858222961426, "reward_std": 0.06059501692652702, "rewards/VisualizationJSONCombinedORM/mean": 0.7988858222961426, "rewards/VisualizationJSONCombinedORM/std": 0.06208599731326103, "step": 5495, "train_speed(iter/s)": 0.075982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 286.5, "completions/min_length": 233.0, "epoch": 4.54590570719603, "grad_norm": 0.20005550980567932, "kl": 0.0782470703125, "learning_rate": 2.491986585974521e-07, "loss": 0.0007821787148714066, "memory(GiB)": 38.13, "reward": 0.5675075650215149, "reward_std": 0.05797567218542099, "rewards/VisualizationJSONCombinedORM/mean": 0.5675075650215149, "rewards/VisualizationJSONCombinedORM/std": 0.1949371099472046, "step": 5496, "train_speed(iter/s)": 0.07597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 302.375, "completions/min_length": 237.0, "epoch": 4.546732837055417, "grad_norm": 0.19824537634849548, "kl": 0.055908203125, "learning_rate": 2.4829924055558174e-07, "loss": 0.0005581863224506378, "memory(GiB)": 38.13, "reward": 0.5466251373291016, "reward_std": 0.07081616669893265, "rewards/VisualizationJSONCombinedORM/mean": 0.5466251373291016, "rewards/VisualizationJSONCombinedORM/std": 0.20476815104484558, "step": 5497, "train_speed(iter/s)": 0.075958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 337.0, "completions/min_length": 247.0, "epoch": 4.547559966914806, "grad_norm": 0.19283173978328705, "kl": 0.07574462890625, "learning_rate": 2.474014072281578e-07, "loss": 0.0007570423185825348, "memory(GiB)": 38.13, "reward": 0.5984092950820923, "reward_std": 0.07767154276371002, "rewards/VisualizationJSONCombinedORM/mean": 0.5984092950820923, "rewards/VisualizationJSONCombinedORM/std": 0.09958873689174652, "step": 5498, "train_speed(iter/s)": 0.075945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 306.25, "completions/min_length": 248.0, "epoch": 4.548387096774194, "grad_norm": 0.18078966438770294, "kl": 0.0592041015625, "learning_rate": 2.4650515891461004e-07, "loss": 0.0005906671285629272, "memory(GiB)": 38.13, "reward": 0.5305308103561401, "reward_std": 0.05102970451116562, "rewards/VisualizationJSONCombinedORM/mean": 0.5305308103561401, "rewards/VisualizationJSONCombinedORM/std": 0.18559126555919647, "step": 5499, "train_speed(iter/s)": 0.075932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 300.0625, "completions/min_length": 249.0, "epoch": 4.549214226633581, "grad_norm": 0.2838890850543976, "kl": 0.086181640625, "learning_rate": 2.4561049591384387e-07, "loss": 0.0008642561733722687, "memory(GiB)": 38.13, "reward": 0.4034978151321411, "reward_std": 0.03010186180472374, "rewards/VisualizationJSONCombinedORM/mean": 0.4034978151321411, "rewards/VisualizationJSONCombinedORM/std": 0.03904189541935921, "step": 5500, "train_speed(iter/s)": 0.07592 }, { "epoch": 4.549214226633581, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 374.875, "eval_completions/mean_length": 309.734375, "eval_completions/min_length": 259.7083333333333, "eval_kl": 0.08403523763020833, "eval_loss": 0.0008546735043637455, "eval_reward": 0.45325052303572494, "eval_reward_std": 0.04970862662109236, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45325052303572494, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04970862860015283, "eval_runtime": 317.8177, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 5500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 340.875, "completions/min_length": 269.0, "epoch": 4.55004135649297, "grad_norm": 0.1821385771036148, "kl": 0.0804443359375, "learning_rate": 2.447174185242324e-07, "loss": 0.000805094838142395, "memory(GiB)": 38.13, "reward": 0.32633110880851746, "reward_std": 0.022249121218919754, "rewards/VisualizationJSONCombinedORM/mean": 0.32633110880851746, "rewards/VisualizationJSONCombinedORM/std": 0.10685966908931732, "step": 5501, "train_speed(iter/s)": 0.075572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 294.875, "completions/min_length": 251.0, "epoch": 4.550868486352357, "grad_norm": 0.15981853008270264, "kl": 0.0565185546875, "learning_rate": 2.4382592704362053e-07, "loss": 0.0005662441253662109, "memory(GiB)": 38.13, "reward": 0.7384833097457886, "reward_std": 0.03841191902756691, "rewards/VisualizationJSONCombinedORM/mean": 0.7384833097457886, "rewards/VisualizationJSONCombinedORM/std": 0.10037350654602051, "step": 5502, "train_speed(iter/s)": 0.075563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 308.3125, "completions/min_length": 235.0, "epoch": 4.551695616211745, "grad_norm": 0.23441652953624725, "kl": 0.06689453125, "learning_rate": 2.429360217693261e-07, "loss": 0.0006688311696052551, "memory(GiB)": 38.13, "reward": 0.5698537826538086, "reward_std": 0.06381198763847351, "rewards/VisualizationJSONCombinedORM/mean": 0.5698537826538086, "rewards/VisualizationJSONCombinedORM/std": 0.1889217495918274, "step": 5503, "train_speed(iter/s)": 0.075552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 341.5, "completions/min_length": 288.0, "epoch": 4.5525227460711335, "grad_norm": 0.21648778021335602, "kl": 0.05694580078125, "learning_rate": 2.4204770299813664e-07, "loss": 0.0005694516003131866, "memory(GiB)": 38.13, "reward": 0.571898341178894, "reward_std": 0.044992949813604355, "rewards/VisualizationJSONCombinedORM/mean": 0.571898341178894, "rewards/VisualizationJSONCombinedORM/std": 0.24232569336891174, "step": 5504, "train_speed(iter/s)": 0.075541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 309.0625, "completions/min_length": 246.0, "epoch": 4.553349875930521, "grad_norm": 0.21631962060928345, "kl": 0.0582275390625, "learning_rate": 2.411609710263091e-07, "loss": 0.0005813203752040863, "memory(GiB)": 38.13, "reward": 0.41995394229888916, "reward_std": 0.042050864547491074, "rewards/VisualizationJSONCombinedORM/mean": 0.41995394229888916, "rewards/VisualizationJSONCombinedORM/std": 0.0667077973484993, "step": 5505, "train_speed(iter/s)": 0.075526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 307.0, "completions/min_length": 251.0, "epoch": 4.554177005789909, "grad_norm": 0.2553776502609253, "kl": 0.05206298828125, "learning_rate": 2.4027582614957414e-07, "loss": 0.0005202107131481171, "memory(GiB)": 38.13, "reward": 0.5110085606575012, "reward_std": 0.060332298278808594, "rewards/VisualizationJSONCombinedORM/mean": 0.5110085606575012, "rewards/VisualizationJSONCombinedORM/std": 0.05959147959947586, "step": 5506, "train_speed(iter/s)": 0.075516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 318.75, "completions/min_length": 264.0, "epoch": 4.5550041356492965, "grad_norm": 0.19541293382644653, "kl": 0.039520263671875, "learning_rate": 2.393922686631306e-07, "loss": 0.0003952197730541229, "memory(GiB)": 38.13, "reward": 0.5951718091964722, "reward_std": 0.06276512145996094, "rewards/VisualizationJSONCombinedORM/mean": 0.5951718091964722, "rewards/VisualizationJSONCombinedORM/std": 0.12918493151664734, "step": 5507, "train_speed(iter/s)": 0.075503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 314.5625, "completions/min_length": 263.0, "epoch": 4.555831265508685, "grad_norm": 0.201792910695076, "kl": 0.0545654296875, "learning_rate": 2.385102988616511e-07, "loss": 0.0005438420921564102, "memory(GiB)": 38.13, "reward": 0.48244181275367737, "reward_std": 0.05964726209640503, "rewards/VisualizationJSONCombinedORM/mean": 0.48244181275367737, "rewards/VisualizationJSONCombinedORM/std": 0.22017575800418854, "step": 5508, "train_speed(iter/s)": 0.075491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 297.3125, "completions/min_length": 238.0, "epoch": 4.556658395368073, "grad_norm": 0.2080654501914978, "kl": 0.248779296875, "learning_rate": 2.3762991703927375e-07, "loss": 0.0024882666766643524, "memory(GiB)": 38.13, "reward": 0.38291406631469727, "reward_std": 0.05975683033466339, "rewards/VisualizationJSONCombinedORM/mean": 0.38291406631469727, "rewards/VisualizationJSONCombinedORM/std": 0.11298103630542755, "step": 5509, "train_speed(iter/s)": 0.075473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 304.75, "completions/min_length": 254.0, "epoch": 4.55748552522746, "grad_norm": 0.16230064630508423, "kl": 0.1265869140625, "learning_rate": 2.367511234896125e-07, "loss": 0.0012665074318647385, "memory(GiB)": 38.13, "reward": 0.44303178787231445, "reward_std": 0.04203405976295471, "rewards/VisualizationJSONCombinedORM/mean": 0.44303178787231445, "rewards/VisualizationJSONCombinedORM/std": 0.05792326480150223, "step": 5510, "train_speed(iter/s)": 0.075456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 312.9375, "completions/min_length": 252.0, "epoch": 4.558312655086849, "grad_norm": 0.18134039640426636, "kl": 0.04132080078125, "learning_rate": 2.3587391850574792e-07, "loss": 0.00041317567229270935, "memory(GiB)": 38.13, "reward": 0.7814825177192688, "reward_std": 0.08803558349609375, "rewards/VisualizationJSONCombinedORM/mean": 0.7814825177192688, "rewards/VisualizationJSONCombinedORM/std": 0.09272037446498871, "step": 5511, "train_speed(iter/s)": 0.075442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 267.75, "completions/min_length": 220.0, "epoch": 4.559139784946236, "grad_norm": 0.23532357811927795, "kl": 0.08489990234375, "learning_rate": 2.3499830238023215e-07, "loss": 0.0008478313684463501, "memory(GiB)": 38.13, "reward": 0.3320695459842682, "reward_std": 0.05234619230031967, "rewards/VisualizationJSONCombinedORM/mean": 0.3320695459842682, "rewards/VisualizationJSONCombinedORM/std": 0.05438859015703201, "step": 5512, "train_speed(iter/s)": 0.075427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 315.0625, "completions/min_length": 238.0, "epoch": 4.559966914805624, "grad_norm": 0.1567046344280243, "kl": 0.06121826171875, "learning_rate": 2.3412427540508763e-07, "loss": 0.0006137266755104065, "memory(GiB)": 38.13, "reward": 0.5795005559921265, "reward_std": 0.03349619358778, "rewards/VisualizationJSONCombinedORM/mean": 0.5795005559921265, "rewards/VisualizationJSONCombinedORM/std": 0.2396371215581894, "step": 5513, "train_speed(iter/s)": 0.075412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 304.4375, "completions/min_length": 241.0, "epoch": 4.560794044665013, "grad_norm": 0.2375091016292572, "kl": 0.0755615234375, "learning_rate": 2.3325183787180683e-07, "loss": 0.000756584107875824, "memory(GiB)": 38.13, "reward": 0.32087939977645874, "reward_std": 0.04284632205963135, "rewards/VisualizationJSONCombinedORM/mean": 0.32087939977645874, "rewards/VisualizationJSONCombinedORM/std": 0.07375301420688629, "step": 5514, "train_speed(iter/s)": 0.0754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 266.0625, "completions/min_length": 211.0, "epoch": 4.5616211745244, "grad_norm": 0.2112143188714981, "kl": 0.0478515625, "learning_rate": 2.3238099007134973e-07, "loss": 0.0004789307713508606, "memory(GiB)": 38.13, "reward": 0.5855568647384644, "reward_std": 0.10385725647211075, "rewards/VisualizationJSONCombinedORM/mean": 0.5855568647384644, "rewards/VisualizationJSONCombinedORM/std": 0.10273455828428268, "step": 5515, "train_speed(iter/s)": 0.07539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 272.0, "completions/min_length": 227.0, "epoch": 4.562448304383788, "grad_norm": 0.18912801146507263, "kl": 0.09033203125, "learning_rate": 2.315117322941507e-07, "loss": 0.0009026974439620972, "memory(GiB)": 38.13, "reward": 0.6027696132659912, "reward_std": 0.07301943749189377, "rewards/VisualizationJSONCombinedORM/mean": 0.6027696132659912, "rewards/VisualizationJSONCombinedORM/std": 0.15501244366168976, "step": 5516, "train_speed(iter/s)": 0.075375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 327.5, "completions/min_length": 263.0, "epoch": 4.5632754342431765, "grad_norm": 0.1953420639038086, "kl": 0.1705322265625, "learning_rate": 2.3064406483010947e-07, "loss": 0.0017159990966320038, "memory(GiB)": 38.13, "reward": 0.3242267370223999, "reward_std": 0.03533301502466202, "rewards/VisualizationJSONCombinedORM/mean": 0.3242267370223999, "rewards/VisualizationJSONCombinedORM/std": 0.09769897907972336, "step": 5517, "train_speed(iter/s)": 0.075363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 316.5, "completions/min_length": 255.0, "epoch": 4.564102564102564, "grad_norm": 0.2185175120830536, "kl": 0.080078125, "learning_rate": 2.2977798796859796e-07, "loss": 0.0008014626801013947, "memory(GiB)": 38.13, "reward": 0.6780394911766052, "reward_std": 0.030845191329717636, "rewards/VisualizationJSONCombinedORM/mean": 0.6780394911766052, "rewards/VisualizationJSONCombinedORM/std": 0.12421288341283798, "step": 5518, "train_speed(iter/s)": 0.075345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 274.0, "completions/min_length": 235.0, "epoch": 4.564929693961952, "grad_norm": 0.18415194749832153, "kl": 0.044647216796875, "learning_rate": 2.2891350199845673e-07, "loss": 0.0004486367106437683, "memory(GiB)": 38.13, "reward": 0.5586232542991638, "reward_std": 0.02661086618900299, "rewards/VisualizationJSONCombinedORM/mean": 0.5586232542991638, "rewards/VisualizationJSONCombinedORM/std": 0.10183355212211609, "step": 5519, "train_speed(iter/s)": 0.075332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 291.625, "completions/min_length": 235.0, "epoch": 4.56575682382134, "grad_norm": 0.18816393613815308, "kl": 0.036865234375, "learning_rate": 2.280506072079963e-07, "loss": 0.00036845356225967407, "memory(GiB)": 38.13, "reward": 0.6081921458244324, "reward_std": 0.03643054515123367, "rewards/VisualizationJSONCombinedORM/mean": 0.6081921458244324, "rewards/VisualizationJSONCombinedORM/std": 0.15012945234775543, "step": 5520, "train_speed(iter/s)": 0.075322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 304.875, "completions/min_length": 233.0, "epoch": 4.566583953680728, "grad_norm": 0.21460609138011932, "kl": 0.10888671875, "learning_rate": 2.2718930388499537e-07, "loss": 0.0010899249464273453, "memory(GiB)": 38.13, "reward": 0.42077499628067017, "reward_std": 0.039719074964523315, "rewards/VisualizationJSONCombinedORM/mean": 0.42077499628067017, "rewards/VisualizationJSONCombinedORM/std": 0.1993720382452011, "step": 5521, "train_speed(iter/s)": 0.075314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 297.375, "completions/min_length": 250.0, "epoch": 4.567411083540116, "grad_norm": 0.1811688393354416, "kl": 0.06732177734375, "learning_rate": 2.263295923167025e-07, "loss": 0.0006730183959007263, "memory(GiB)": 38.13, "reward": 0.4675056040287018, "reward_std": 0.0456806905567646, "rewards/VisualizationJSONCombinedORM/mean": 0.4675056040287018, "rewards/VisualizationJSONCombinedORM/std": 0.1321503072977066, "step": 5522, "train_speed(iter/s)": 0.075301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 335.75, "completions/min_length": 233.0, "epoch": 4.568238213399503, "grad_norm": 0.22018800675868988, "kl": 0.0682373046875, "learning_rate": 2.254714727898366e-07, "loss": 0.0006829071789979935, "memory(GiB)": 38.13, "reward": 0.3846328854560852, "reward_std": 0.047411829233169556, "rewards/VisualizationJSONCombinedORM/mean": 0.3846328854560852, "rewards/VisualizationJSONCombinedORM/std": 0.09564622491598129, "step": 5523, "train_speed(iter/s)": 0.075289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 314.375, "completions/min_length": 245.0, "epoch": 4.569065343258892, "grad_norm": 0.19316405057907104, "kl": 0.0882568359375, "learning_rate": 2.246149455905844e-07, "loss": 0.0008823499083518982, "memory(GiB)": 38.13, "reward": 0.5617628693580627, "reward_std": 0.028826236724853516, "rewards/VisualizationJSONCombinedORM/mean": 0.5617628693580627, "rewards/VisualizationJSONCombinedORM/std": 0.20906832814216614, "step": 5524, "train_speed(iter/s)": 0.075278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 297.9375, "completions/min_length": 247.0, "epoch": 4.56989247311828, "grad_norm": 0.2617630958557129, "kl": 0.05328369140625, "learning_rate": 2.237600110046001e-07, "loss": 0.0005331821739673615, "memory(GiB)": 38.13, "reward": 0.4142226576805115, "reward_std": 0.029316313564777374, "rewards/VisualizationJSONCombinedORM/mean": 0.4142226576805115, "rewards/VisualizationJSONCombinedORM/std": 0.22335109114646912, "step": 5525, "train_speed(iter/s)": 0.075265 }, { "epoch": 4.56989247311828, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 371.5833333333333, "eval_completions/mean_length": 314.1510416666667, "eval_completions/min_length": 255.875, "eval_kl": 0.083770751953125, "eval_loss": 0.000843370973598212, "eval_reward": 0.4640924520790577, "eval_reward_std": 0.05052524451942494, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4640924520790577, "eval_rewards/VisualizationJSONCombinedORM/std": 0.050525246207447104, "eval_runtime": 316.0648, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 5525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 290.875, "completions/min_length": 217.0, "epoch": 4.570719602977667, "grad_norm": 0.1787072867155075, "kl": 0.08953857421875, "learning_rate": 2.2290666931701067e-07, "loss": 0.0008955039083957672, "memory(GiB)": 38.13, "reward": 0.7743144035339355, "reward_std": 0.05099107697606087, "rewards/VisualizationJSONCombinedORM/mean": 0.7743144035339355, "rewards/VisualizationJSONCombinedORM/std": 0.05209999158978462, "step": 5526, "train_speed(iter/s)": 0.074931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 322.3125, "completions/min_length": 271.0, "epoch": 4.571546732837056, "grad_norm": 0.20360395312309265, "kl": 0.05694580078125, "learning_rate": 2.2205492081240787e-07, "loss": 0.0005692318081855774, "memory(GiB)": 38.13, "reward": 0.8210313320159912, "reward_std": 0.050138022750616074, "rewards/VisualizationJSONCombinedORM/mean": 0.8210313320159912, "rewards/VisualizationJSONCombinedORM/std": 0.08084110915660858, "step": 5527, "train_speed(iter/s)": 0.074919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 300.4375, "completions/min_length": 252.0, "epoch": 4.572373862696443, "grad_norm": 0.28140199184417725, "kl": 0.0870361328125, "learning_rate": 2.212047657748545e-07, "loss": 0.0008712559938430786, "memory(GiB)": 38.13, "reward": 0.7222613096237183, "reward_std": 0.07932303845882416, "rewards/VisualizationJSONCombinedORM/mean": 0.7222613096237183, "rewards/VisualizationJSONCombinedORM/std": 0.16915997862815857, "step": 5528, "train_speed(iter/s)": 0.074904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 322.8125, "completions/min_length": 254.0, "epoch": 4.573200992555831, "grad_norm": 0.25612446665763855, "kl": 0.070068359375, "learning_rate": 2.20356204487881e-07, "loss": 0.0007005296647548676, "memory(GiB)": 38.13, "reward": 0.5638539791107178, "reward_std": 0.04519046097993851, "rewards/VisualizationJSONCombinedORM/mean": 0.5638539791107178, "rewards/VisualizationJSONCombinedORM/std": 0.19368815422058105, "step": 5529, "train_speed(iter/s)": 0.074887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 304.75, "completions/min_length": 259.0, "epoch": 4.5740281224152195, "grad_norm": 0.1782267987728119, "kl": 0.079345703125, "learning_rate": 2.1950923723448704e-07, "loss": 0.0007941462099552155, "memory(GiB)": 38.13, "reward": 0.6018812656402588, "reward_std": 0.055938586592674255, "rewards/VisualizationJSONCombinedORM/mean": 0.6018812656402588, "rewards/VisualizationJSONCombinedORM/std": 0.1710364818572998, "step": 5530, "train_speed(iter/s)": 0.074875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 311.125, "completions/min_length": 231.0, "epoch": 4.574855252274607, "grad_norm": 0.2060801386833191, "kl": 0.11279296875, "learning_rate": 2.1866386429713893e-07, "loss": 0.00112856924533844, "memory(GiB)": 38.13, "reward": 0.5164103507995605, "reward_std": 0.059376128017902374, "rewards/VisualizationJSONCombinedORM/mean": 0.5164103507995605, "rewards/VisualizationJSONCombinedORM/std": 0.057375580072402954, "step": 5531, "train_speed(iter/s)": 0.074863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 295.75, "completions/min_length": 215.0, "epoch": 4.575682382133995, "grad_norm": 0.2009446918964386, "kl": 0.06292724609375, "learning_rate": 2.1782008595777448e-07, "loss": 0.0006291158497333527, "memory(GiB)": 38.13, "reward": 0.5931711196899414, "reward_std": 0.02828514203429222, "rewards/VisualizationJSONCombinedORM/mean": 0.5931711196899414, "rewards/VisualizationJSONCombinedORM/std": 0.1857832819223404, "step": 5532, "train_speed(iter/s)": 0.07485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 302.875, "completions/min_length": 222.0, "epoch": 4.5765095119933825, "grad_norm": 0.18441839516162872, "kl": 0.03692626953125, "learning_rate": 2.1697790249779638e-07, "loss": 0.00036919862031936646, "memory(GiB)": 38.13, "reward": 0.5558151602745056, "reward_std": 0.05679410696029663, "rewards/VisualizationJSONCombinedORM/mean": 0.5558151602745056, "rewards/VisualizationJSONCombinedORM/std": 0.08486635982990265, "step": 5533, "train_speed(iter/s)": 0.074837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 360.6875, "completions/min_length": 308.0, "epoch": 4.577336641852771, "grad_norm": 0.17202876508235931, "kl": 0.0723876953125, "learning_rate": 2.161373141980766e-07, "loss": 0.0007230602204799652, "memory(GiB)": 38.13, "reward": 0.7914047837257385, "reward_std": 0.035778384655714035, "rewards/VisualizationJSONCombinedORM/mean": 0.7914047837257385, "rewards/VisualizationJSONCombinedORM/std": 0.09102725237607956, "step": 5534, "train_speed(iter/s)": 0.07482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 326.9375, "completions/min_length": 259.0, "epoch": 4.578163771712159, "grad_norm": 0.18173907697200775, "kl": 0.07220458984375, "learning_rate": 2.152983213389559e-07, "loss": 0.0007225139997899532, "memory(GiB)": 38.13, "reward": 0.6579567790031433, "reward_std": 0.06045781075954437, "rewards/VisualizationJSONCombinedORM/mean": 0.6579567790031433, "rewards/VisualizationJSONCombinedORM/std": 0.14804627001285553, "step": 5535, "train_speed(iter/s)": 0.074806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 294.5, "completions/min_length": 247.0, "epoch": 4.578990901571546, "grad_norm": 0.27170631289482117, "kl": 0.060546875, "learning_rate": 2.1446092420024157e-07, "loss": 0.0006051640957593918, "memory(GiB)": 38.13, "reward": 0.5656830072402954, "reward_std": 0.06302058696746826, "rewards/VisualizationJSONCombinedORM/mean": 0.5656830072402954, "rewards/VisualizationJSONCombinedORM/std": 0.23803472518920898, "step": 5536, "train_speed(iter/s)": 0.074794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/mean_length": 342.625, "completions/min_length": 249.0, "epoch": 4.579818031430935, "grad_norm": 0.16512443125247955, "kl": 0.032012939453125, "learning_rate": 2.136251230612113e-07, "loss": 0.0003202371299266815, "memory(GiB)": 38.13, "reward": 0.45378875732421875, "reward_std": 0.016184579581022263, "rewards/VisualizationJSONCombinedORM/mean": 0.45378875732421875, "rewards/VisualizationJSONCombinedORM/std": 0.08610415458679199, "step": 5537, "train_speed(iter/s)": 0.074775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 302.0, "completions/min_length": 235.0, "epoch": 4.580645161290323, "grad_norm": 0.20777767896652222, "kl": 0.0640869140625, "learning_rate": 2.127909182006055e-07, "loss": 0.000641385093331337, "memory(GiB)": 38.13, "reward": 0.4129321575164795, "reward_std": 0.06235317140817642, "rewards/VisualizationJSONCombinedORM/mean": 0.4129321575164795, "rewards/VisualizationJSONCombinedORM/std": 0.0759972482919693, "step": 5538, "train_speed(iter/s)": 0.074762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 337.9375, "completions/min_length": 254.0, "epoch": 4.58147229114971, "grad_norm": 0.21046055853366852, "kl": 0.1156005859375, "learning_rate": 2.119583098966388e-07, "loss": 0.0011580996215343475, "memory(GiB)": 38.13, "reward": 0.6948959231376648, "reward_std": 0.060955338180065155, "rewards/VisualizationJSONCombinedORM/mean": 0.6948959231376648, "rewards/VisualizationJSONCombinedORM/std": 0.07186008989810944, "step": 5539, "train_speed(iter/s)": 0.074749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 315.9375, "completions/min_length": 264.0, "epoch": 4.582299421009099, "grad_norm": 0.18869677186012268, "kl": 0.0887451171875, "learning_rate": 2.11127298426988e-07, "loss": 0.0008874014019966125, "memory(GiB)": 38.13, "reward": 0.30421844124794006, "reward_std": 0.02361014112830162, "rewards/VisualizationJSONCombinedORM/mean": 0.30421844124794006, "rewards/VisualizationJSONCombinedORM/std": 0.10305386036634445, "step": 5540, "train_speed(iter/s)": 0.07473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 323.5, "completions/min_length": 271.0, "epoch": 4.583126550868486, "grad_norm": 0.19187477231025696, "kl": 0.0736083984375, "learning_rate": 2.102978840687997e-07, "loss": 0.0007376708090305328, "memory(GiB)": 38.13, "reward": 0.39669761061668396, "reward_std": 0.04270864278078079, "rewards/VisualizationJSONCombinedORM/mean": 0.39669761061668396, "rewards/VisualizationJSONCombinedORM/std": 0.06217993423342705, "step": 5541, "train_speed(iter/s)": 0.074721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 318.625, "completions/min_length": 269.0, "epoch": 4.583953680727874, "grad_norm": 0.20488743484020233, "kl": 0.1282958984375, "learning_rate": 2.094700670986871e-07, "loss": 0.0012832656502723694, "memory(GiB)": 38.13, "reward": 0.6664031744003296, "reward_std": 0.09580262005329132, "rewards/VisualizationJSONCombinedORM/mean": 0.6664031744003296, "rewards/VisualizationJSONCombinedORM/std": 0.10997475683689117, "step": 5542, "train_speed(iter/s)": 0.074709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 305.75, "completions/min_length": 253.0, "epoch": 4.5847808105872625, "grad_norm": 0.13161125779151917, "kl": 0.04132080078125, "learning_rate": 2.0864384779273274e-07, "loss": 0.0004130652523599565, "memory(GiB)": 38.13, "reward": 0.5008388161659241, "reward_std": 0.019586889073252678, "rewards/VisualizationJSONCombinedORM/mean": 0.5008388161659241, "rewards/VisualizationJSONCombinedORM/std": 0.1283222734928131, "step": 5543, "train_speed(iter/s)": 0.074697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 321.4375, "completions/min_length": 228.0, "epoch": 4.58560794044665, "grad_norm": 0.1976798176765442, "kl": 0.06695556640625, "learning_rate": 2.0781922642648222e-07, "loss": 0.0006699934601783752, "memory(GiB)": 38.13, "reward": 0.513073205947876, "reward_std": 0.03381580486893654, "rewards/VisualizationJSONCombinedORM/mean": 0.513073205947876, "rewards/VisualizationJSONCombinedORM/std": 0.037192560732364655, "step": 5544, "train_speed(iter/s)": 0.074683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 295.9375, "completions/min_length": 242.0, "epoch": 4.586435070306038, "grad_norm": 0.22906674444675446, "kl": 0.05682373046875, "learning_rate": 2.0699620327495174e-07, "loss": 0.0005670934915542603, "memory(GiB)": 38.13, "reward": 0.4638345241546631, "reward_std": 0.05567973852157593, "rewards/VisualizationJSONCombinedORM/mean": 0.4638345241546631, "rewards/VisualizationJSONCombinedORM/std": 0.17248238623142242, "step": 5545, "train_speed(iter/s)": 0.074666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 309.625, "completions/min_length": 264.0, "epoch": 4.587262200165426, "grad_norm": 0.2697261571884155, "kl": 0.136962890625, "learning_rate": 2.0617477861262335e-07, "loss": 0.0013726092875003815, "memory(GiB)": 38.13, "reward": 0.5397962331771851, "reward_std": 0.08604568243026733, "rewards/VisualizationJSONCombinedORM/mean": 0.5397962331771851, "rewards/VisualizationJSONCombinedORM/std": 0.10430698096752167, "step": 5546, "train_speed(iter/s)": 0.074659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 338.5625, "completions/min_length": 282.0, "epoch": 4.588089330024814, "grad_norm": 0.18765011429786682, "kl": 0.073486328125, "learning_rate": 2.0535495271344686e-07, "loss": 0.0007362179458141327, "memory(GiB)": 38.13, "reward": 0.33049747347831726, "reward_std": 0.03085208311676979, "rewards/VisualizationJSONCombinedORM/mean": 0.33049747347831726, "rewards/VisualizationJSONCombinedORM/std": 0.04519595205783844, "step": 5547, "train_speed(iter/s)": 0.074646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 319.0625, "completions/min_length": 261.0, "epoch": 4.588916459884202, "grad_norm": 0.24129211902618408, "kl": 0.07427978515625, "learning_rate": 2.045367258508363e-07, "loss": 0.0007429085671901703, "memory(GiB)": 38.13, "reward": 0.41750800609588623, "reward_std": 0.03936821222305298, "rewards/VisualizationJSONCombinedORM/mean": 0.41750800609588623, "rewards/VisualizationJSONCombinedORM/std": 0.17146073281764984, "step": 5548, "train_speed(iter/s)": 0.074631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 314.8125, "completions/min_length": 214.0, "epoch": 4.589743589743589, "grad_norm": 0.19411087036132812, "kl": 0.05718994140625, "learning_rate": 2.0372009829767558e-07, "loss": 0.0005731023848056793, "memory(GiB)": 38.13, "reward": 0.5180762410163879, "reward_std": 0.04391952231526375, "rewards/VisualizationJSONCombinedORM/mean": 0.5180762410163879, "rewards/VisualizationJSONCombinedORM/std": 0.0881514772772789, "step": 5549, "train_speed(iter/s)": 0.074617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 335.0, "completions/min_length": 273.0, "epoch": 4.590570719602978, "grad_norm": 0.1836545467376709, "kl": 0.064208984375, "learning_rate": 2.0290507032631356e-07, "loss": 0.0006415396928787231, "memory(GiB)": 38.13, "reward": 0.4634840488433838, "reward_std": 0.05504065752029419, "rewards/VisualizationJSONCombinedORM/mean": 0.4634840488433838, "rewards/VisualizationJSONCombinedORM/std": 0.08420684933662415, "step": 5550, "train_speed(iter/s)": 0.074598 }, { "epoch": 4.590570719602978, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 376.6666666666667, "eval_completions/mean_length": 312.9114583333333, "eval_completions/min_length": 259.0833333333333, "eval_kl": 0.09032185872395833, "eval_loss": 0.0009101306204684079, "eval_reward": 0.46043671295046806, "eval_reward_std": 0.05140005884459242, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46043671295046806, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05140005895130647, "eval_runtime": 319.0722, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 341.4375, "completions/min_length": 243.0, "epoch": 4.591397849462366, "grad_norm": 0.2284896820783615, "kl": 0.0557861328125, "learning_rate": 2.0209164220856503e-07, "loss": 0.0005582850426435471, "memory(GiB)": 38.15, "reward": 0.6586703658103943, "reward_std": 0.0980093777179718, "rewards/VisualizationJSONCombinedORM/mean": 0.6586703658103943, "rewards/VisualizationJSONCombinedORM/std": 0.14322678744792938, "step": 5551, "train_speed(iter/s)": 0.074259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 295.0625, "completions/min_length": 244.0, "epoch": 4.592224979321753, "grad_norm": 0.2269144505262375, "kl": 0.0880126953125, "learning_rate": 2.0127981421571295e-07, "loss": 0.0008797869086265564, "memory(GiB)": 38.15, "reward": 0.5638047456741333, "reward_std": 0.05589589104056358, "rewards/VisualizationJSONCombinedORM/mean": 0.5638047456741333, "rewards/VisualizationJSONCombinedORM/std": 0.06478022038936615, "step": 5552, "train_speed(iter/s)": 0.074247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 312.8125, "completions/min_length": 244.0, "epoch": 4.593052109181142, "grad_norm": 0.20576177537441254, "kl": 0.06976318359375, "learning_rate": 2.004695866185058e-07, "loss": 0.0006977207958698273, "memory(GiB)": 38.15, "reward": 0.5393006801605225, "reward_std": 0.06722383946180344, "rewards/VisualizationJSONCombinedORM/mean": 0.5393006801605225, "rewards/VisualizationJSONCombinedORM/std": 0.24763686954975128, "step": 5553, "train_speed(iter/s)": 0.074229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 319.0, "completions/min_length": 240.0, "epoch": 4.593879239040529, "grad_norm": 0.15667270123958588, "kl": 0.0799560546875, "learning_rate": 1.9966095968715737e-07, "loss": 0.0007979888468980789, "memory(GiB)": 38.15, "reward": 0.5817739963531494, "reward_std": 0.046090222895145416, "rewards/VisualizationJSONCombinedORM/mean": 0.5817739963531494, "rewards/VisualizationJSONCombinedORM/std": 0.1440448760986328, "step": 5554, "train_speed(iter/s)": 0.074216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 309.0, "completions/min_length": 266.0, "epoch": 4.594706368899917, "grad_norm": 0.1792847067117691, "kl": 0.08612060546875, "learning_rate": 1.9885393369134976e-07, "loss": 0.0008599385619163513, "memory(GiB)": 38.15, "reward": 0.6854779720306396, "reward_std": 0.0602884441614151, "rewards/VisualizationJSONCombinedORM/mean": 0.6854779720306396, "rewards/VisualizationJSONCombinedORM/std": 0.09595073014497757, "step": 5555, "train_speed(iter/s)": 0.074209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 310.5, "completions/min_length": 256.0, "epoch": 4.5955334987593055, "grad_norm": 0.1959196925163269, "kl": 0.08648681640625, "learning_rate": 1.980485089002293e-07, "loss": 0.0008631870150566101, "memory(GiB)": 38.15, "reward": 0.6617757081985474, "reward_std": 0.06441749632358551, "rewards/VisualizationJSONCombinedORM/mean": 0.6617757081985474, "rewards/VisualizationJSONCombinedORM/std": 0.06696322560310364, "step": 5556, "train_speed(iter/s)": 0.074198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 308.5, "completions/min_length": 207.0, "epoch": 4.596360628618693, "grad_norm": 0.1885184645652771, "kl": 0.05682373046875, "learning_rate": 1.9724468558240838e-07, "loss": 0.0005671009421348572, "memory(GiB)": 38.15, "reward": 0.49935242533683777, "reward_std": 0.023161601275205612, "rewards/VisualizationJSONCombinedORM/mean": 0.49935242533683777, "rewards/VisualizationJSONCombinedORM/std": 0.1854211390018463, "step": 5557, "train_speed(iter/s)": 0.074187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 300.3125, "completions/min_length": 241.0, "epoch": 4.597187758478081, "grad_norm": 0.1995425671339035, "kl": 0.1068115234375, "learning_rate": 1.9644246400596644e-07, "loss": 0.0010674968361854553, "memory(GiB)": 38.15, "reward": 0.5722744464874268, "reward_std": 0.08944499492645264, "rewards/VisualizationJSONCombinedORM/mean": 0.5722744464874268, "rewards/VisualizationJSONCombinedORM/std": 0.13908520340919495, "step": 5558, "train_speed(iter/s)": 0.074169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 337.6875, "completions/min_length": 253.0, "epoch": 4.5980148883374685, "grad_norm": 0.266967236995697, "kl": 0.0777587890625, "learning_rate": 1.956418444384489e-07, "loss": 0.0007793549448251724, "memory(GiB)": 38.15, "reward": 0.5638933181762695, "reward_std": 0.0731426477432251, "rewards/VisualizationJSONCombinedORM/mean": 0.5638933181762695, "rewards/VisualizationJSONCombinedORM/std": 0.22995181381702423, "step": 5559, "train_speed(iter/s)": 0.074155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 304.5625, "completions/min_length": 259.0, "epoch": 4.598842018196857, "grad_norm": 0.24240869283676147, "kl": 0.095947265625, "learning_rate": 1.9484282714686442e-07, "loss": 0.0009569898247718811, "memory(GiB)": 38.15, "reward": 0.3721628785133362, "reward_std": 0.01255848165601492, "rewards/VisualizationJSONCombinedORM/mean": 0.3721628785133362, "rewards/VisualizationJSONCombinedORM/std": 0.11396687477827072, "step": 5560, "train_speed(iter/s)": 0.074139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 292.9375, "completions/min_length": 233.0, "epoch": 4.599669148056245, "grad_norm": 0.23187567293643951, "kl": 0.0626220703125, "learning_rate": 1.940454123976898e-07, "loss": 0.0006252750754356384, "memory(GiB)": 38.15, "reward": 0.6290256381034851, "reward_std": 0.039965253323316574, "rewards/VisualizationJSONCombinedORM/mean": 0.6290256381034851, "rewards/VisualizationJSONCombinedORM/std": 0.11402332782745361, "step": 5561, "train_speed(iter/s)": 0.074125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 299.5, "completions/min_length": 252.0, "epoch": 4.600496277915632, "grad_norm": 0.22922582924365997, "kl": 0.06011962890625, "learning_rate": 1.9324960045686736e-07, "loss": 0.0006010401993989944, "memory(GiB)": 38.15, "reward": 0.517857551574707, "reward_std": 0.040875144302845, "rewards/VisualizationJSONCombinedORM/mean": 0.517857551574707, "rewards/VisualizationJSONCombinedORM/std": 0.10488610714673996, "step": 5562, "train_speed(iter/s)": 0.074113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 320.0625, "completions/min_length": 242.0, "epoch": 4.601323407775021, "grad_norm": 0.25565406680107117, "kl": 0.0555419921875, "learning_rate": 1.9245539158980365e-07, "loss": 0.0005549080669879913, "memory(GiB)": 38.15, "reward": 0.6213110685348511, "reward_std": 0.06992821395397186, "rewards/VisualizationJSONCombinedORM/mean": 0.6213110685348511, "rewards/VisualizationJSONCombinedORM/std": 0.06861958652734756, "step": 5563, "train_speed(iter/s)": 0.074101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 312.6875, "completions/min_length": 240.0, "epoch": 4.602150537634409, "grad_norm": 0.1810978651046753, "kl": 0.0712890625, "learning_rate": 1.9166278606136955e-07, "loss": 0.0007110647857189178, "memory(GiB)": 38.15, "reward": 0.4602835178375244, "reward_std": 0.05272964388132095, "rewards/VisualizationJSONCombinedORM/mean": 0.4602835178375244, "rewards/VisualizationJSONCombinedORM/std": 0.14708122611045837, "step": 5564, "train_speed(iter/s)": 0.074088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 317.8125, "completions/min_length": 235.0, "epoch": 4.602977667493796, "grad_norm": 0.18276935815811157, "kl": 0.06439208984375, "learning_rate": 1.908717841359048e-07, "loss": 0.0006448626518249512, "memory(GiB)": 38.15, "reward": 0.5384825468063354, "reward_std": 0.05505025386810303, "rewards/VisualizationJSONCombinedORM/mean": 0.5384825468063354, "rewards/VisualizationJSONCombinedORM/std": 0.06397100538015366, "step": 5565, "train_speed(iter/s)": 0.074073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 303.4375, "completions/min_length": 226.0, "epoch": 4.603804797353185, "grad_norm": 0.2082376331090927, "kl": 0.065673828125, "learning_rate": 1.9008238607721162e-07, "loss": 0.000659547746181488, "memory(GiB)": 38.15, "reward": 0.5839411020278931, "reward_std": 0.0308521818369627, "rewards/VisualizationJSONCombinedORM/mean": 0.5839411020278931, "rewards/VisualizationJSONCombinedORM/std": 0.2447516918182373, "step": 5566, "train_speed(iter/s)": 0.074067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 324.6875, "completions/min_length": 258.0, "epoch": 4.604631927212573, "grad_norm": 0.2184762954711914, "kl": 0.06671142578125, "learning_rate": 1.8929459214855727e-07, "loss": 0.0006681233644485474, "memory(GiB)": 38.15, "reward": 0.7563657760620117, "reward_std": 0.05185100436210632, "rewards/VisualizationJSONCombinedORM/mean": 0.7563657760620117, "rewards/VisualizationJSONCombinedORM/std": 0.08027244359254837, "step": 5567, "train_speed(iter/s)": 0.074057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 306.0625, "completions/min_length": 244.0, "epoch": 4.60545905707196, "grad_norm": 0.17600266635417938, "kl": 0.04840087890625, "learning_rate": 1.8850840261267543e-07, "loss": 0.0004835650324821472, "memory(GiB)": 38.15, "reward": 0.5778539180755615, "reward_std": 0.026694655418395996, "rewards/VisualizationJSONCombinedORM/mean": 0.5778539180755615, "rewards/VisualizationJSONCombinedORM/std": 0.250632643699646, "step": 5568, "train_speed(iter/s)": 0.074044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 310.6875, "completions/min_length": 221.0, "epoch": 4.6062861869313485, "grad_norm": 0.17636829614639282, "kl": 0.033599853515625, "learning_rate": 1.8772381773176417e-07, "loss": 0.00033556297421455383, "memory(GiB)": 38.15, "reward": 0.5736870765686035, "reward_std": 0.030116334557533264, "rewards/VisualizationJSONCombinedORM/mean": 0.5736870765686035, "rewards/VisualizationJSONCombinedORM/std": 0.21447418630123138, "step": 5569, "train_speed(iter/s)": 0.074028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 300.9375, "completions/min_length": 249.0, "epoch": 4.607113316790736, "grad_norm": 0.20509091019630432, "kl": 0.0831298828125, "learning_rate": 1.8694083776748472e-07, "loss": 0.0008309017866849899, "memory(GiB)": 38.15, "reward": 0.7543714046478271, "reward_std": 0.08871205151081085, "rewards/VisualizationJSONCombinedORM/mean": 0.7543714046478271, "rewards/VisualizationJSONCombinedORM/std": 0.08652027696371078, "step": 5570, "train_speed(iter/s)": 0.074018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 297.0, "completions/min_length": 233.0, "epoch": 4.607940446650124, "grad_norm": 0.16269446909427643, "kl": 0.03387451171875, "learning_rate": 1.8615946298096654e-07, "loss": 0.0003374554216861725, "memory(GiB)": 38.15, "reward": 0.7314566373825073, "reward_std": 0.029604636132717133, "rewards/VisualizationJSONCombinedORM/mean": 0.7314566373825073, "rewards/VisualizationJSONCombinedORM/std": 0.15394754707813263, "step": 5571, "train_speed(iter/s)": 0.074006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 318.6875, "completions/min_length": 264.0, "epoch": 4.608767576509512, "grad_norm": 0.19339174032211304, "kl": 0.0718994140625, "learning_rate": 1.8537969363280061e-07, "loss": 0.0007190145552158356, "memory(GiB)": 38.15, "reward": 0.49430051445961, "reward_std": 0.04968772083520889, "rewards/VisualizationJSONCombinedORM/mean": 0.49430051445961, "rewards/VisualizationJSONCombinedORM/std": 0.23611080646514893, "step": 5572, "train_speed(iter/s)": 0.073993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 326.875, "completions/min_length": 267.0, "epoch": 4.6095947063689, "grad_norm": 0.19879384338855743, "kl": 0.0440673828125, "learning_rate": 1.8460152998304393e-07, "loss": 0.00044099241495132446, "memory(GiB)": 38.15, "reward": 0.6931794285774231, "reward_std": 0.07698953151702881, "rewards/VisualizationJSONCombinedORM/mean": 0.6931794285774231, "rewards/VisualizationJSONCombinedORM/std": 0.10355527698993683, "step": 5573, "train_speed(iter/s)": 0.073975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 353.875, "completions/min_length": 272.0, "epoch": 4.610421836228288, "grad_norm": 0.1773989200592041, "kl": 0.073974609375, "learning_rate": 1.8382497229121776e-07, "loss": 0.0007412396371364594, "memory(GiB)": 38.15, "reward": 0.6479287147521973, "reward_std": 0.0649196207523346, "rewards/VisualizationJSONCombinedORM/mean": 0.6479287147521973, "rewards/VisualizationJSONCombinedORM/std": 0.11545220017433167, "step": 5574, "train_speed(iter/s)": 0.073961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 309.6875, "completions/min_length": 217.0, "epoch": 4.611248966087675, "grad_norm": 0.21411727368831635, "kl": 0.07696533203125, "learning_rate": 1.8305002081630885e-07, "loss": 0.0007708743214607239, "memory(GiB)": 38.15, "reward": 0.4796451926231384, "reward_std": 0.08074911683797836, "rewards/VisualizationJSONCombinedORM/mean": 0.4796451926231384, "rewards/VisualizationJSONCombinedORM/std": 0.2915348708629608, "step": 5575, "train_speed(iter/s)": 0.073946 }, { "epoch": 4.611248966087675, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 369.875, "eval_completions/mean_length": 310.96875, "eval_completions/min_length": 257.0416666666667, "eval_kl": 0.098602294921875, "eval_loss": 0.0010123798856511712, "eval_reward": 0.452771441390117, "eval_reward_std": 0.05115384371796002, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.452771441390117, "eval_rewards/VisualizationJSONCombinedORM/std": 0.051153845308969416, "eval_runtime": 314.6658, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 5575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 320.6875, "completions/min_length": 250.0, "epoch": 4.612076095947064, "grad_norm": 0.21504411101341248, "kl": 0.0526123046875, "learning_rate": 1.8227667581676488e-07, "loss": 0.0005274266004562378, "memory(GiB)": 38.15, "reward": 0.6492576599121094, "reward_std": 0.049140844494104385, "rewards/VisualizationJSONCombinedORM/mean": 0.6492576599121094, "rewards/VisualizationJSONCombinedORM/std": 0.04854366555809975, "step": 5576, "train_speed(iter/s)": 0.073627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 312.125, "completions/min_length": 267.0, "epoch": 4.612903225806452, "grad_norm": 0.20577214658260345, "kl": 0.06988525390625, "learning_rate": 1.8150493755050068e-07, "loss": 0.000698871910572052, "memory(GiB)": 38.15, "reward": 0.7668859958648682, "reward_std": 0.0336996391415596, "rewards/VisualizationJSONCombinedORM/mean": 0.7668859958648682, "rewards/VisualizationJSONCombinedORM/std": 0.04478687793016434, "step": 5577, "train_speed(iter/s)": 0.073614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 283.3125, "completions/min_length": 220.0, "epoch": 4.613730355665839, "grad_norm": 0.22397717833518982, "kl": 0.05426025390625, "learning_rate": 1.8073480627489593e-07, "loss": 0.0005427822470664978, "memory(GiB)": 38.15, "reward": 0.7650255560874939, "reward_std": 0.0704977810382843, "rewards/VisualizationJSONCombinedORM/mean": 0.7650255560874939, "rewards/VisualizationJSONCombinedORM/std": 0.12442279607057571, "step": 5578, "train_speed(iter/s)": 0.073599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 315.25, "completions/min_length": 238.0, "epoch": 4.614557485525228, "grad_norm": 0.17745231091976166, "kl": 0.06561279296875, "learning_rate": 1.7996628224679237e-07, "loss": 0.0006563663482666016, "memory(GiB)": 38.15, "reward": 0.6361091136932373, "reward_std": 0.07458475232124329, "rewards/VisualizationJSONCombinedORM/mean": 0.6361091136932373, "rewards/VisualizationJSONCombinedORM/std": 0.07612857967615128, "step": 5579, "train_speed(iter/s)": 0.073582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 317.625, "completions/min_length": 226.0, "epoch": 4.615384615384615, "grad_norm": 0.17969246208667755, "kl": 0.070556640625, "learning_rate": 1.7919936572249442e-07, "loss": 0.0007068738341331482, "memory(GiB)": 38.15, "reward": 0.39996057748794556, "reward_std": 0.04524702578783035, "rewards/VisualizationJSONCombinedORM/mean": 0.39996057748794556, "rewards/VisualizationJSONCombinedORM/std": 0.1060575470328331, "step": 5580, "train_speed(iter/s)": 0.073568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 327.5625, "completions/min_length": 238.0, "epoch": 4.616211745244003, "grad_norm": 0.24979346990585327, "kl": 0.0704345703125, "learning_rate": 1.7843405695777582e-07, "loss": 0.0007047262042760849, "memory(GiB)": 38.15, "reward": 0.5106542110443115, "reward_std": 0.07148073613643646, "rewards/VisualizationJSONCombinedORM/mean": 0.5106542110443115, "rewards/VisualizationJSONCombinedORM/std": 0.08090841770172119, "step": 5581, "train_speed(iter/s)": 0.073561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 330.625, "completions/min_length": 277.0, "epoch": 4.6170388751033915, "grad_norm": 0.22269508242607117, "kl": 0.061279296875, "learning_rate": 1.77670356207868e-07, "loss": 0.0006144195795059204, "memory(GiB)": 38.15, "reward": 0.670708417892456, "reward_std": 0.0788641944527626, "rewards/VisualizationJSONCombinedORM/mean": 0.670708417892456, "rewards/VisualizationJSONCombinedORM/std": 0.1378486007452011, "step": 5582, "train_speed(iter/s)": 0.073549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 317.125, "completions/min_length": 242.0, "epoch": 4.617866004962779, "grad_norm": 0.1857176125049591, "kl": 0.03485107421875, "learning_rate": 1.7690826372746994e-07, "loss": 0.00034989044070243835, "memory(GiB)": 38.15, "reward": 0.509796142578125, "reward_std": 0.02272937446832657, "rewards/VisualizationJSONCombinedORM/mean": 0.509796142578125, "rewards/VisualizationJSONCombinedORM/std": 0.13812819123268127, "step": 5583, "train_speed(iter/s)": 0.073533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 310.875, "completions/min_length": 240.0, "epoch": 4.618693134822167, "grad_norm": 0.22505593299865723, "kl": 0.133544921875, "learning_rate": 1.7614777977074227e-07, "loss": 0.0013323724269866943, "memory(GiB)": 38.15, "reward": 0.3736823797225952, "reward_std": 0.05761351063847542, "rewards/VisualizationJSONCombinedORM/mean": 0.3736823797225952, "rewards/VisualizationJSONCombinedORM/std": 0.11081547290086746, "step": 5584, "train_speed(iter/s)": 0.073521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 323.25, "completions/min_length": 255.0, "epoch": 4.6195202646815545, "grad_norm": 0.3225257396697998, "kl": 0.06005859375, "learning_rate": 1.7538890459131098e-07, "loss": 0.0006000548601150513, "memory(GiB)": 38.15, "reward": 0.6717309951782227, "reward_std": 0.09190866351127625, "rewards/VisualizationJSONCombinedORM/mean": 0.6717309951782227, "rewards/VisualizationJSONCombinedORM/std": 0.09218280017375946, "step": 5585, "train_speed(iter/s)": 0.073505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 299.9375, "completions/min_length": 243.0, "epoch": 4.620347394540943, "grad_norm": 0.15975765883922577, "kl": 0.041259765625, "learning_rate": 1.7463163844226304e-07, "loss": 0.0004120245575904846, "memory(GiB)": 38.15, "reward": 0.6130496263504028, "reward_std": 0.05579536035656929, "rewards/VisualizationJSONCombinedORM/mean": 0.6130496263504028, "rewards/VisualizationJSONCombinedORM/std": 0.16075028479099274, "step": 5586, "train_speed(iter/s)": 0.073491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 317.5625, "completions/min_length": 248.0, "epoch": 4.621174524400331, "grad_norm": 0.21876247227191925, "kl": 0.04034423828125, "learning_rate": 1.7387598157615205e-07, "loss": 0.000403478741645813, "memory(GiB)": 38.15, "reward": 0.8750700354576111, "reward_std": 0.08283977210521698, "rewards/VisualizationJSONCombinedORM/mean": 0.8750700354576111, "rewards/VisualizationJSONCombinedORM/std": 0.09983182698488235, "step": 5587, "train_speed(iter/s)": 0.073476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 268.9375, "completions/min_length": 231.0, "epoch": 4.622001654259718, "grad_norm": 0.2873629927635193, "kl": 0.083984375, "learning_rate": 1.7312193424499134e-07, "loss": 0.0008403435349464417, "memory(GiB)": 38.15, "reward": 0.4823440909385681, "reward_std": 0.03713272139430046, "rewards/VisualizationJSONCombinedORM/mean": 0.4823440909385681, "rewards/VisualizationJSONCombinedORM/std": 0.15341223776340485, "step": 5588, "train_speed(iter/s)": 0.073466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 289.75, "completions/min_length": 258.0, "epoch": 4.622828784119107, "grad_norm": 0.21417400240898132, "kl": 0.0789794921875, "learning_rate": 1.7236949670026037e-07, "loss": 0.0007880330085754395, "memory(GiB)": 38.15, "reward": 0.7379131317138672, "reward_std": 0.09040091186761856, "rewards/VisualizationJSONCombinedORM/mean": 0.7379131317138672, "rewards/VisualizationJSONCombinedORM/std": 0.09756535291671753, "step": 5589, "train_speed(iter/s)": 0.073455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 307.6875, "completions/min_length": 231.0, "epoch": 4.623655913978495, "grad_norm": 0.18686850368976593, "kl": 0.08184814453125, "learning_rate": 1.7161866919290004e-07, "loss": 0.000819087028503418, "memory(GiB)": 38.15, "reward": 0.7099250555038452, "reward_std": 0.05305821821093559, "rewards/VisualizationJSONCombinedORM/mean": 0.7099250555038452, "rewards/VisualizationJSONCombinedORM/std": 0.1590801179409027, "step": 5590, "train_speed(iter/s)": 0.073443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 337.4375, "completions/min_length": 283.0, "epoch": 4.624483043837882, "grad_norm": 0.17095312476158142, "kl": 0.085693359375, "learning_rate": 1.7086945197331562e-07, "loss": 0.0008574109524488449, "memory(GiB)": 38.15, "reward": 0.6186319589614868, "reward_std": 0.03847993165254593, "rewards/VisualizationJSONCombinedORM/mean": 0.6186319589614868, "rewards/VisualizationJSONCombinedORM/std": 0.08614037185907364, "step": 5591, "train_speed(iter/s)": 0.073432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 314.125, "completions/min_length": 271.0, "epoch": 4.625310173697271, "grad_norm": 0.18618221580982208, "kl": 0.0789794921875, "learning_rate": 1.7012184529137387e-07, "loss": 0.0007892921566963196, "memory(GiB)": 38.15, "reward": 0.27805793285369873, "reward_std": 0.023734863847494125, "rewards/VisualizationJSONCombinedORM/mean": 0.27805793285369873, "rewards/VisualizationJSONCombinedORM/std": 0.05710691958665848, "step": 5592, "train_speed(iter/s)": 0.073419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 319.1875, "completions/min_length": 266.0, "epoch": 4.626137303556659, "grad_norm": 0.17938324809074402, "kl": 0.11865234375, "learning_rate": 1.6937584939640483e-07, "loss": 0.001187693327665329, "memory(GiB)": 38.15, "reward": 0.5469012260437012, "reward_std": 0.05517898499965668, "rewards/VisualizationJSONCombinedORM/mean": 0.5469012260437012, "rewards/VisualizationJSONCombinedORM/std": 0.20209424197673798, "step": 5593, "train_speed(iter/s)": 0.073407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 284.6875, "completions/min_length": 241.0, "epoch": 4.626964433416046, "grad_norm": 0.19975730776786804, "kl": 0.125244140625, "learning_rate": 1.686314645372017e-07, "loss": 0.0012526288628578186, "memory(GiB)": 38.15, "reward": 0.3289741277694702, "reward_std": 0.037956684827804565, "rewards/VisualizationJSONCombinedORM/mean": 0.3289741277694702, "rewards/VisualizationJSONCombinedORM/std": 0.10108619928359985, "step": 5594, "train_speed(iter/s)": 0.073395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 323.3125, "completions/min_length": 279.0, "epoch": 4.6277915632754345, "grad_norm": 0.16650308668613434, "kl": 0.08221435546875, "learning_rate": 1.6788869096202197e-07, "loss": 0.0008210241794586182, "memory(GiB)": 38.15, "reward": 0.6749445199966431, "reward_std": 0.02966531552374363, "rewards/VisualizationJSONCombinedORM/mean": 0.6749445199966431, "rewards/VisualizationJSONCombinedORM/std": 0.16820430755615234, "step": 5595, "train_speed(iter/s)": 0.073382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 301.875, "completions/min_length": 224.0, "epoch": 4.628618693134822, "grad_norm": 0.18083007633686066, "kl": 0.027130126953125, "learning_rate": 1.6714752891858088e-07, "loss": 0.00027115270495414734, "memory(GiB)": 38.15, "reward": 0.7318387627601624, "reward_std": 0.07058502733707428, "rewards/VisualizationJSONCombinedORM/mean": 0.7318387627601624, "rewards/VisualizationJSONCombinedORM/std": 0.08172889798879623, "step": 5596, "train_speed(iter/s)": 0.073365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 330.125, "completions/min_length": 250.0, "epoch": 4.62944582299421, "grad_norm": 0.3213180899620056, "kl": 0.094970703125, "learning_rate": 1.664079786540629e-07, "loss": 0.0009525679051876068, "memory(GiB)": 38.15, "reward": 0.5107743740081787, "reward_std": 0.07736731320619583, "rewards/VisualizationJSONCombinedORM/mean": 0.5107743740081787, "rewards/VisualizationJSONCombinedORM/std": 0.16951029002666473, "step": 5597, "train_speed(iter/s)": 0.07335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 291.4375, "completions/min_length": 226.0, "epoch": 4.630272952853598, "grad_norm": 0.18839271366596222, "kl": 0.0635986328125, "learning_rate": 1.6567004041510958e-07, "loss": 0.0006335265934467316, "memory(GiB)": 38.15, "reward": 0.3604424297809601, "reward_std": 0.020564377307891846, "rewards/VisualizationJSONCombinedORM/mean": 0.3604424297809601, "rewards/VisualizationJSONCombinedORM/std": 0.05507107824087143, "step": 5598, "train_speed(iter/s)": 0.073342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 314.375, "completions/min_length": 246.0, "epoch": 4.631100082712986, "grad_norm": 0.15617090463638306, "kl": 0.0791015625, "learning_rate": 1.649337144478269e-07, "loss": 0.0007908940315246582, "memory(GiB)": 38.15, "reward": 0.6042342185974121, "reward_std": 0.04357041046023369, "rewards/VisualizationJSONCombinedORM/mean": 0.6042342185974121, "rewards/VisualizationJSONCombinedORM/std": 0.07589443773031235, "step": 5599, "train_speed(iter/s)": 0.073331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 309.5, "completions/min_length": 256.0, "epoch": 4.631927212572374, "grad_norm": 0.26460331678390503, "kl": 0.20166015625, "learning_rate": 1.641990009977834e-07, "loss": 0.0020195022225379944, "memory(GiB)": 38.15, "reward": 0.5133585929870605, "reward_std": 0.024634620174765587, "rewards/VisualizationJSONCombinedORM/mean": 0.5133585929870605, "rewards/VisualizationJSONCombinedORM/std": 0.15187795460224152, "step": 5600, "train_speed(iter/s)": 0.073318 }, { "epoch": 4.631927212572374, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 375.9166666666667, "eval_completions/mean_length": 309.3802083333333, "eval_completions/min_length": 252.75, "eval_kl": 0.09918212890625, "eval_loss": 0.0010050572454929352, "eval_reward": 0.45465902611613274, "eval_reward_std": 0.05014678826167559, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45465902611613274, "eval_rewards/VisualizationJSONCombinedORM/std": 0.0501467875437811, "eval_runtime": 318.3544, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 297.0625, "completions/min_length": 230.0, "epoch": 4.632754342431761, "grad_norm": 0.20798662304878235, "kl": 0.07611083984375, "learning_rate": 1.6346590031000974e-07, "loss": 0.0007606968283653259, "memory(GiB)": 38.15, "reward": 0.7748165130615234, "reward_std": 0.10658220946788788, "rewards/VisualizationJSONCombinedORM/mean": 0.7748165130615234, "rewards/VisualizationJSONCombinedORM/std": 0.10735325515270233, "step": 5601, "train_speed(iter/s)": 0.073003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 316.6875, "completions/min_length": 231.0, "epoch": 4.63358147229115, "grad_norm": 0.2121259570121765, "kl": 0.07373046875, "learning_rate": 1.6273441262899647e-07, "loss": 0.0007383301854133606, "memory(GiB)": 38.15, "reward": 0.2862153649330139, "reward_std": 0.022941280156373978, "rewards/VisualizationJSONCombinedORM/mean": 0.2862153649330139, "rewards/VisualizationJSONCombinedORM/std": 0.09049653261899948, "step": 5602, "train_speed(iter/s)": 0.072993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 306.9375, "completions/min_length": 253.0, "epoch": 4.634408602150538, "grad_norm": 0.32584288716316223, "kl": 0.2860107421875, "learning_rate": 1.6200453819870122e-07, "loss": 0.0028610490262508392, "memory(GiB)": 38.15, "reward": 0.4388112425804138, "reward_std": 0.05058792605996132, "rewards/VisualizationJSONCombinedORM/mean": 0.4388112425804138, "rewards/VisualizationJSONCombinedORM/std": 0.159542515873909, "step": 5603, "train_speed(iter/s)": 0.072984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 313.625, "completions/min_length": 251.0, "epoch": 4.635235732009925, "grad_norm": 0.21222996711730957, "kl": 0.21441650390625, "learning_rate": 1.612762772625387e-07, "loss": 0.0021393001079559326, "memory(GiB)": 38.15, "reward": 0.6700850129127502, "reward_std": 0.09030942618846893, "rewards/VisualizationJSONCombinedORM/mean": 0.6700850129127502, "rewards/VisualizationJSONCombinedORM/std": 0.13804703950881958, "step": 5604, "train_speed(iter/s)": 0.072972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 323.5625, "completions/min_length": 244.0, "epoch": 4.636062861869314, "grad_norm": 0.20888464152812958, "kl": 0.035369873046875, "learning_rate": 1.6054963006338742e-07, "loss": 0.0003542788326740265, "memory(GiB)": 38.15, "reward": 0.35834312438964844, "reward_std": 0.028884198516607285, "rewards/VisualizationJSONCombinedORM/mean": 0.35834312438964844, "rewards/VisualizationJSONCombinedORM/std": 0.12936706840991974, "step": 5605, "train_speed(iter/s)": 0.072959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 325.5625, "completions/min_length": 267.0, "epoch": 4.636889991728701, "grad_norm": 0.18048560619354248, "kl": 0.0595703125, "learning_rate": 1.598245968435874e-07, "loss": 0.0005969777703285217, "memory(GiB)": 38.15, "reward": 0.5015363097190857, "reward_std": 0.05218881741166115, "rewards/VisualizationJSONCombinedORM/mean": 0.5015363097190857, "rewards/VisualizationJSONCombinedORM/std": 0.05182257294654846, "step": 5606, "train_speed(iter/s)": 0.072943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 287.375, "completions/min_length": 229.0, "epoch": 4.637717121588089, "grad_norm": 0.1761120855808258, "kl": 0.075927734375, "learning_rate": 1.5910117784494194e-07, "loss": 0.0007588174194097519, "memory(GiB)": 38.15, "reward": 0.5293630361557007, "reward_std": 0.05312186852097511, "rewards/VisualizationJSONCombinedORM/mean": 0.5293630361557007, "rewards/VisualizationJSONCombinedORM/std": 0.12498899549245834, "step": 5607, "train_speed(iter/s)": 0.072932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 302.0, "completions/min_length": 244.0, "epoch": 4.6385442514474775, "grad_norm": 0.24751628935337067, "kl": 0.05853271484375, "learning_rate": 1.5837937330871357e-07, "loss": 0.0005856715142726898, "memory(GiB)": 38.15, "reward": 0.5389241576194763, "reward_std": 0.0672072097659111, "rewards/VisualizationJSONCombinedORM/mean": 0.5389241576194763, "rewards/VisualizationJSONCombinedORM/std": 0.20706744492053986, "step": 5608, "train_speed(iter/s)": 0.072922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 331.5625, "completions/min_length": 248.0, "epoch": 4.639371381306865, "grad_norm": 0.2238590270280838, "kl": 0.05841064453125, "learning_rate": 1.5765918347562747e-07, "loss": 0.0005835257470607758, "memory(GiB)": 38.15, "reward": 0.42937034368515015, "reward_std": 0.06256522983312607, "rewards/VisualizationJSONCombinedORM/mean": 0.42937034368515015, "rewards/VisualizationJSONCombinedORM/std": 0.15261128544807434, "step": 5609, "train_speed(iter/s)": 0.072907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 287.9375, "completions/min_length": 237.0, "epoch": 4.640198511166253, "grad_norm": 0.18868882954120636, "kl": 0.1058349609375, "learning_rate": 1.5694060858587046e-07, "loss": 0.0010615400969982147, "memory(GiB)": 38.15, "reward": 0.5687182545661926, "reward_std": 0.06941156834363937, "rewards/VisualizationJSONCombinedORM/mean": 0.5687182545661926, "rewards/VisualizationJSONCombinedORM/std": 0.1758004128932953, "step": 5610, "train_speed(iter/s)": 0.0729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 311.75, "completions/min_length": 262.0, "epoch": 4.641025641025641, "grad_norm": 0.17531053721904755, "kl": 0.160888671875, "learning_rate": 1.5622364887909135e-07, "loss": 0.0016112178564071655, "memory(GiB)": 38.15, "reward": 0.5291944742202759, "reward_std": 0.07737956941127777, "rewards/VisualizationJSONCombinedORM/mean": 0.5291944742202759, "rewards/VisualizationJSONCombinedORM/std": 0.13693803548812866, "step": 5611, "train_speed(iter/s)": 0.072891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 349.4375, "completions/min_length": 256.0, "epoch": 4.641852770885029, "grad_norm": 0.16355982422828674, "kl": 0.034423828125, "learning_rate": 1.555083045943978e-07, "loss": 0.00034395232796669006, "memory(GiB)": 38.15, "reward": 0.42077314853668213, "reward_std": 0.024154137820005417, "rewards/VisualizationJSONCombinedORM/mean": 0.42077314853668213, "rewards/VisualizationJSONCombinedORM/std": 0.18405427038669586, "step": 5612, "train_speed(iter/s)": 0.072879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 306.375, "completions/min_length": 246.0, "epoch": 4.642679900744417, "grad_norm": 0.23921281099319458, "kl": 0.09521484375, "learning_rate": 1.547945759703623e-07, "loss": 0.0009536221623420715, "memory(GiB)": 38.15, "reward": 0.7870358228683472, "reward_std": 0.05436372756958008, "rewards/VisualizationJSONCombinedORM/mean": 0.7870358228683472, "rewards/VisualizationJSONCombinedORM/std": 0.05659148469567299, "step": 5613, "train_speed(iter/s)": 0.072863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 332.625, "completions/min_length": 267.0, "epoch": 4.643507030603804, "grad_norm": 0.22660823166370392, "kl": 0.091796875, "learning_rate": 1.540824632450161e-07, "loss": 0.0009162947535514832, "memory(GiB)": 38.15, "reward": 0.5646420121192932, "reward_std": 0.059926748275756836, "rewards/VisualizationJSONCombinedORM/mean": 0.5646420121192932, "rewards/VisualizationJSONCombinedORM/std": 0.08730194717645645, "step": 5614, "train_speed(iter/s)": 0.072851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 314.8125, "completions/min_length": 232.0, "epoch": 4.644334160463193, "grad_norm": 0.2315453290939331, "kl": 0.1392822265625, "learning_rate": 1.533719666558514e-07, "loss": 0.0013941898941993713, "memory(GiB)": 38.15, "reward": 0.6063679456710815, "reward_std": 0.03952988237142563, "rewards/VisualizationJSONCombinedORM/mean": 0.6063679456710815, "rewards/VisualizationJSONCombinedORM/std": 0.13486911356449127, "step": 5615, "train_speed(iter/s)": 0.072839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 328.5625, "completions/min_length": 267.0, "epoch": 4.645161290322581, "grad_norm": 0.19288994371891022, "kl": 0.067138671875, "learning_rate": 1.526630864398232e-07, "loss": 0.0006718710064888, "memory(GiB)": 38.15, "reward": 0.5138983726501465, "reward_std": 0.03450901061296463, "rewards/VisualizationJSONCombinedORM/mean": 0.5138983726501465, "rewards/VisualizationJSONCombinedORM/std": 0.14347794651985168, "step": 5616, "train_speed(iter/s)": 0.072827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 324.0, "completions/min_length": 257.0, "epoch": 4.645988420181968, "grad_norm": 0.18988776206970215, "kl": 0.10272216796875, "learning_rate": 1.519558228333462e-07, "loss": 0.001026645302772522, "memory(GiB)": 38.15, "reward": 0.42396193742752075, "reward_std": 0.038844261318445206, "rewards/VisualizationJSONCombinedORM/mean": 0.42396193742752075, "rewards/VisualizationJSONCombinedORM/std": 0.040138136595487595, "step": 5617, "train_speed(iter/s)": 0.07281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 305.875, "completions/min_length": 245.0, "epoch": 4.646815550041357, "grad_norm": 0.15457838773727417, "kl": 0.0804443359375, "learning_rate": 1.5125017607229454e-07, "loss": 0.0008063167333602905, "memory(GiB)": 38.15, "reward": 0.6762185096740723, "reward_std": 0.0440785214304924, "rewards/VisualizationJSONCombinedORM/mean": 0.6762185096740723, "rewards/VisualizationJSONCombinedORM/std": 0.12101906538009644, "step": 5618, "train_speed(iter/s)": 0.072802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 290.25, "completions/min_length": 190.0, "epoch": 4.647642679900745, "grad_norm": 0.20031960308551788, "kl": 0.033721923828125, "learning_rate": 1.505461463920077e-07, "loss": 0.0003374740481376648, "memory(GiB)": 38.15, "reward": 0.3083547353744507, "reward_std": 0.03303326293826103, "rewards/VisualizationJSONCombinedORM/mean": 0.3083547353744507, "rewards/VisualizationJSONCombinedORM/std": 0.04816945642232895, "step": 5619, "train_speed(iter/s)": 0.072786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 324.875, "completions/min_length": 252.0, "epoch": 4.648469809760132, "grad_norm": 0.2048642784357071, "kl": 0.04736328125, "learning_rate": 1.4984373402728014e-07, "loss": 0.00047281570732593536, "memory(GiB)": 38.15, "reward": 0.26643991470336914, "reward_std": 0.017981721088290215, "rewards/VisualizationJSONCombinedORM/mean": 0.26643991470336914, "rewards/VisualizationJSONCombinedORM/std": 0.0665670707821846, "step": 5620, "train_speed(iter/s)": 0.072773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/mean_length": 330.0, "completions/min_length": 275.0, "epoch": 4.6492969396195205, "grad_norm": 0.1796334981918335, "kl": 0.0692138671875, "learning_rate": 1.491429392123711e-07, "loss": 0.0006953924894332886, "memory(GiB)": 38.15, "reward": 0.5818138122558594, "reward_std": 0.03624952957034111, "rewards/VisualizationJSONCombinedORM/mean": 0.5818138122558594, "rewards/VisualizationJSONCombinedORM/std": 0.13887815177440643, "step": 5621, "train_speed(iter/s)": 0.072761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 347.3125, "completions/min_length": 295.0, "epoch": 4.650124069478908, "grad_norm": 0.17913012206554413, "kl": 0.04510498046875, "learning_rate": 1.484437621809981e-07, "loss": 0.0004506818950176239, "memory(GiB)": 38.15, "reward": 0.529691755771637, "reward_std": 0.06399376690387726, "rewards/VisualizationJSONCombinedORM/mean": 0.529691755771637, "rewards/VisualizationJSONCombinedORM/std": 0.11305241286754608, "step": 5622, "train_speed(iter/s)": 0.072748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 337.5, "completions/min_length": 258.0, "epoch": 4.650951199338296, "grad_norm": 0.17180900275707245, "kl": 0.0660400390625, "learning_rate": 1.477462031663407e-07, "loss": 0.0006614923477172852, "memory(GiB)": 38.15, "reward": 0.6681971549987793, "reward_std": 0.058941446244716644, "rewards/VisualizationJSONCombinedORM/mean": 0.6681971549987793, "rewards/VisualizationJSONCombinedORM/std": 0.15704315900802612, "step": 5623, "train_speed(iter/s)": 0.072734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 294.25, "completions/min_length": 239.0, "epoch": 4.651778329197684, "grad_norm": 0.24862049520015717, "kl": 0.101318359375, "learning_rate": 1.4705026240103837e-07, "loss": 0.0010120198130607605, "memory(GiB)": 38.15, "reward": 0.40681642293930054, "reward_std": 0.07129375636577606, "rewards/VisualizationJSONCombinedORM/mean": 0.40681642293930054, "rewards/VisualizationJSONCombinedORM/std": 0.12957170605659485, "step": 5624, "train_speed(iter/s)": 0.072719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 303.8125, "completions/min_length": 246.0, "epoch": 4.652605459057072, "grad_norm": 0.2093379944562912, "kl": 0.0936279296875, "learning_rate": 1.4635594011718935e-07, "loss": 0.0009339898824691772, "memory(GiB)": 38.15, "reward": 0.6930655837059021, "reward_std": 0.041239336133003235, "rewards/VisualizationJSONCombinedORM/mean": 0.6930655837059021, "rewards/VisualizationJSONCombinedORM/std": 0.15098875761032104, "step": 5625, "train_speed(iter/s)": 0.072708 }, { "epoch": 4.652605459057072, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 377.8333333333333, "eval_completions/mean_length": 316.3645833333333, "eval_completions/min_length": 260.6666666666667, "eval_kl": 0.08149210611979167, "eval_loss": 0.0008302057976834476, "eval_reward": 0.4676439433048169, "eval_reward_std": 0.052573889416332044, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4676439433048169, "eval_rewards/VisualizationJSONCombinedORM/std": 0.052573890308849514, "eval_runtime": 319.8501, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 301.5625, "completions/min_length": 228.0, "epoch": 4.65343258891646, "grad_norm": 0.16594475507736206, "kl": 0.08087158203125, "learning_rate": 1.456632365463545e-07, "loss": 0.0008111037313938141, "memory(GiB)": 38.15, "reward": 0.5706025958061218, "reward_std": 0.06522233784198761, "rewards/VisualizationJSONCombinedORM/mean": 0.5706025958061218, "rewards/VisualizationJSONCombinedORM/std": 0.1981322318315506, "step": 5626, "train_speed(iter/s)": 0.072399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 290.125, "completions/min_length": 248.0, "epoch": 4.654259718775847, "grad_norm": 0.2816111445426941, "kl": 0.183837890625, "learning_rate": 1.4497215191955294e-07, "loss": 0.0018407255411148071, "memory(GiB)": 38.15, "reward": 0.5002357363700867, "reward_std": 0.031987182796001434, "rewards/VisualizationJSONCombinedORM/mean": 0.5002357363700867, "rewards/VisualizationJSONCombinedORM/std": 0.1357664167881012, "step": 5627, "train_speed(iter/s)": 0.072389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 331.0625, "completions/min_length": 256.0, "epoch": 4.655086848635236, "grad_norm": 0.17761024832725525, "kl": 0.03912353515625, "learning_rate": 1.442826864672653e-07, "loss": 0.00039143674075603485, "memory(GiB)": 38.15, "reward": 0.528448224067688, "reward_std": 0.05824735760688782, "rewards/VisualizationJSONCombinedORM/mean": 0.528448224067688, "rewards/VisualizationJSONCombinedORM/std": 0.06423554569482803, "step": 5628, "train_speed(iter/s)": 0.072375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 282.4375, "completions/min_length": 214.0, "epoch": 4.655913978494624, "grad_norm": 0.1916658580303192, "kl": 0.06683349609375, "learning_rate": 1.435948404194304e-07, "loss": 0.0006673373281955719, "memory(GiB)": 38.15, "reward": 0.6370905041694641, "reward_std": 0.048438891768455505, "rewards/VisualizationJSONCombinedORM/mean": 0.6370905041694641, "rewards/VisualizationJSONCombinedORM/std": 0.12453260272741318, "step": 5629, "train_speed(iter/s)": 0.072364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 326.5625, "completions/min_length": 249.0, "epoch": 4.656741108354011, "grad_norm": 0.176218181848526, "kl": 0.205322265625, "learning_rate": 1.4290861400545031e-07, "loss": 0.002059042453765869, "memory(GiB)": 38.15, "reward": 0.4921845495700836, "reward_std": 0.06190387159585953, "rewards/VisualizationJSONCombinedORM/mean": 0.4921845495700836, "rewards/VisualizationJSONCombinedORM/std": 0.12477569282054901, "step": 5630, "train_speed(iter/s)": 0.072351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 322.75, "completions/min_length": 262.0, "epoch": 4.6575682382134, "grad_norm": 0.1736474484205246, "kl": 0.064453125, "learning_rate": 1.4222400745418252e-07, "loss": 0.0006449278444051743, "memory(GiB)": 38.15, "reward": 0.5607891082763672, "reward_std": 0.02580394223332405, "rewards/VisualizationJSONCombinedORM/mean": 0.5607891082763672, "rewards/VisualizationJSONCombinedORM/std": 0.1266038715839386, "step": 5631, "train_speed(iter/s)": 0.072341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 300.125, "completions/min_length": 223.0, "epoch": 4.658395368072787, "grad_norm": 0.19307836890220642, "kl": 0.1453857421875, "learning_rate": 1.415410209939472e-07, "loss": 0.0014532878994941711, "memory(GiB)": 38.15, "reward": 0.46480095386505127, "reward_std": 0.04420313239097595, "rewards/VisualizationJSONCombinedORM/mean": 0.46480095386505127, "rewards/VisualizationJSONCombinedORM/std": 0.06839016824960709, "step": 5632, "train_speed(iter/s)": 0.072324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 333.5625, "completions/min_length": 272.0, "epoch": 4.659222497932175, "grad_norm": 0.26700034737586975, "kl": 0.119140625, "learning_rate": 1.4085965485252373e-07, "loss": 0.0011960417032241821, "memory(GiB)": 38.15, "reward": 0.654409646987915, "reward_std": 0.07054906338453293, "rewards/VisualizationJSONCombinedORM/mean": 0.654409646987915, "rewards/VisualizationJSONCombinedORM/std": 0.1003318801522255, "step": 5633, "train_speed(iter/s)": 0.072312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 311.4375, "completions/min_length": 253.0, "epoch": 4.6600496277915635, "grad_norm": 0.20016217231750488, "kl": 0.06146240234375, "learning_rate": 1.4017990925715152e-07, "loss": 0.0006139278411865234, "memory(GiB)": 38.15, "reward": 0.6833510398864746, "reward_std": 0.07081661373376846, "rewards/VisualizationJSONCombinedORM/mean": 0.6833510398864746, "rewards/VisualizationJSONCombinedORM/std": 0.07857409864664078, "step": 5634, "train_speed(iter/s)": 0.072298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 326.0, "completions/min_length": 247.0, "epoch": 4.660876757650951, "grad_norm": 0.2941451072692871, "kl": 0.1033935546875, "learning_rate": 1.39501784434527e-07, "loss": 0.0010333545506000519, "memory(GiB)": 38.15, "reward": 0.38702115416526794, "reward_std": 0.04609996825456619, "rewards/VisualizationJSONCombinedORM/mean": 0.38702115416526794, "rewards/VisualizationJSONCombinedORM/std": 0.1765604466199875, "step": 5635, "train_speed(iter/s)": 0.072285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 297.375, "completions/min_length": 260.0, "epoch": 4.661703887510339, "grad_norm": 0.176380455493927, "kl": 0.14898681640625, "learning_rate": 1.3882528061081034e-07, "loss": 0.00148676335811615, "memory(GiB)": 38.15, "reward": 0.37975406646728516, "reward_std": 0.03820318356156349, "rewards/VisualizationJSONCombinedORM/mean": 0.37975406646728516, "rewards/VisualizationJSONCombinedORM/std": 0.16044898331165314, "step": 5636, "train_speed(iter/s)": 0.072272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 318.9375, "completions/min_length": 241.0, "epoch": 4.662531017369727, "grad_norm": 0.2180740386247635, "kl": 0.07806396484375, "learning_rate": 1.3815039801161723e-07, "loss": 0.0007801875472068787, "memory(GiB)": 38.15, "reward": 0.42541539669036865, "reward_std": 0.03783554211258888, "rewards/VisualizationJSONCombinedORM/mean": 0.42541539669036865, "rewards/VisualizationJSONCombinedORM/std": 0.12497515976428986, "step": 5637, "train_speed(iter/s)": 0.07226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 298.5, "completions/min_length": 236.0, "epoch": 4.663358147229115, "grad_norm": 0.2221921682357788, "kl": 0.1153564453125, "learning_rate": 1.3747713686202424e-07, "loss": 0.0011538304388523102, "memory(GiB)": 38.15, "reward": 0.6075658798217773, "reward_std": 0.05743040889501572, "rewards/VisualizationJSONCombinedORM/mean": 0.6075658798217773, "rewards/VisualizationJSONCombinedORM/std": 0.08630674332380295, "step": 5638, "train_speed(iter/s)": 0.072252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 281.0625, "completions/min_length": 215.0, "epoch": 4.664185277088503, "grad_norm": 0.18867143988609314, "kl": 0.06951904296875, "learning_rate": 1.368054973865679e-07, "loss": 0.0006959214806556702, "memory(GiB)": 38.15, "reward": 0.3965393602848053, "reward_std": 0.05058378726243973, "rewards/VisualizationJSONCombinedORM/mean": 0.3965393602848053, "rewards/VisualizationJSONCombinedORM/std": 0.21744580566883087, "step": 5639, "train_speed(iter/s)": 0.072239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 310.875, "completions/min_length": 263.0, "epoch": 4.66501240694789, "grad_norm": 0.22902169823646545, "kl": 0.0849609375, "learning_rate": 1.361354798092429e-07, "loss": 0.0008498877286911011, "memory(GiB)": 38.15, "reward": 0.506710946559906, "reward_std": 0.03176604583859444, "rewards/VisualizationJSONCombinedORM/mean": 0.506710946559906, "rewards/VisualizationJSONCombinedORM/std": 0.07466112822294235, "step": 5640, "train_speed(iter/s)": 0.07223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 327.4375, "completions/min_length": 250.0, "epoch": 4.665839536807279, "grad_norm": 0.15540070831775665, "kl": 0.04443359375, "learning_rate": 1.354670843535022e-07, "loss": 0.00044523924589157104, "memory(GiB)": 38.15, "reward": 0.5287830829620361, "reward_std": 0.02408464625477791, "rewards/VisualizationJSONCombinedORM/mean": 0.5287830829620361, "rewards/VisualizationJSONCombinedORM/std": 0.060235511511564255, "step": 5641, "train_speed(iter/s)": 0.072218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 307.375, "completions/min_length": 243.0, "epoch": 4.666666666666667, "grad_norm": 0.21385100483894348, "kl": 0.05682373046875, "learning_rate": 1.3480031124226022e-07, "loss": 0.0005678925663232803, "memory(GiB)": 38.15, "reward": 0.38823890686035156, "reward_std": 0.03808296471834183, "rewards/VisualizationJSONCombinedORM/mean": 0.38823890686035156, "rewards/VisualizationJSONCombinedORM/std": 0.038933396339416504, "step": 5642, "train_speed(iter/s)": 0.072203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 307.5625, "completions/min_length": 234.0, "epoch": 4.667493796526054, "grad_norm": 0.2150043398141861, "kl": 0.1358642578125, "learning_rate": 1.3413516069788802e-07, "loss": 0.0013527590781450272, "memory(GiB)": 38.15, "reward": 0.3766319155693054, "reward_std": 0.06731579452753067, "rewards/VisualizationJSONCombinedORM/mean": 0.3766319155693054, "rewards/VisualizationJSONCombinedORM/std": 0.14431338012218475, "step": 5643, "train_speed(iter/s)": 0.072193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 349.375, "completions/min_length": 232.0, "epoch": 4.668320926385443, "grad_norm": 0.22428885102272034, "kl": 0.107666015625, "learning_rate": 1.3347163294221643e-07, "loss": 0.0010773614048957825, "memory(GiB)": 38.15, "reward": 0.8847200870513916, "reward_std": 0.03215184807777405, "rewards/VisualizationJSONCombinedORM/mean": 0.8847200870513916, "rewards/VisualizationJSONCombinedORM/std": 0.0325847752392292, "step": 5644, "train_speed(iter/s)": 0.072181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/mean_length": 272.9375, "completions/min_length": 240.0, "epoch": 4.669148056244831, "grad_norm": 0.17756767570972443, "kl": 0.095458984375, "learning_rate": 1.328097281965357e-07, "loss": 0.0009535513818264008, "memory(GiB)": 38.15, "reward": 0.476456880569458, "reward_std": 0.0416608601808548, "rewards/VisualizationJSONCombinedORM/mean": 0.476456880569458, "rewards/VisualizationJSONCombinedORM/std": 0.2170737236738205, "step": 5645, "train_speed(iter/s)": 0.072171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/mean_length": 310.0625, "completions/min_length": 261.0, "epoch": 4.669975186104218, "grad_norm": 0.49961256980895996, "kl": 0.203125, "learning_rate": 1.3214944668159423e-07, "loss": 0.00203598290681839, "memory(GiB)": 38.15, "reward": 0.30119940638542175, "reward_std": 0.03657597303390503, "rewards/VisualizationJSONCombinedORM/mean": 0.30119940638542175, "rewards/VisualizationJSONCombinedORM/std": 0.03564925491809845, "step": 5646, "train_speed(iter/s)": 0.072158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 341.25, "completions/min_length": 280.0, "epoch": 4.6708023159636065, "grad_norm": 0.2238781601190567, "kl": 0.06707763671875, "learning_rate": 1.314907886175981e-07, "loss": 0.0006715916097164154, "memory(GiB)": 38.15, "reward": 0.4033668637275696, "reward_std": 0.04434552788734436, "rewards/VisualizationJSONCombinedORM/mean": 0.4033668637275696, "rewards/VisualizationJSONCombinedORM/std": 0.11884542554616928, "step": 5647, "train_speed(iter/s)": 0.072148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 327.5, "completions/min_length": 239.0, "epoch": 4.671629445822994, "grad_norm": 0.19219541549682617, "kl": 0.04693603515625, "learning_rate": 1.308337542242133e-07, "loss": 0.0004693269729614258, "memory(GiB)": 38.15, "reward": 0.7218437194824219, "reward_std": 0.06905186176300049, "rewards/VisualizationJSONCombinedORM/mean": 0.7218437194824219, "rewards/VisualizationJSONCombinedORM/std": 0.06833925098180771, "step": 5648, "train_speed(iter/s)": 0.072137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 261.5, "completions/min_length": 215.0, "epoch": 4.672456575682382, "grad_norm": 0.15095989406108856, "kl": 0.0262451171875, "learning_rate": 1.3017834372056393e-07, "loss": 0.0002626478672027588, "memory(GiB)": 38.15, "reward": 0.6461020708084106, "reward_std": 0.05327525734901428, "rewards/VisualizationJSONCombinedORM/mean": 0.6461020708084106, "rewards/VisualizationJSONCombinedORM/std": 0.12206254154443741, "step": 5649, "train_speed(iter/s)": 0.072124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 266.25, "completions/min_length": 220.0, "epoch": 4.67328370554177, "grad_norm": 0.16297318041324615, "kl": 0.11083984375, "learning_rate": 1.2952455732523238e-07, "loss": 0.0010991059243679047, "memory(GiB)": 38.15, "reward": 0.47376739978790283, "reward_std": 0.05967985838651657, "rewards/VisualizationJSONCombinedORM/mean": 0.47376739978790283, "rewards/VisualizationJSONCombinedORM/std": 0.0712541863322258, "step": 5650, "train_speed(iter/s)": 0.07211 }, { "epoch": 4.67328370554177, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 371.5, "eval_completions/mean_length": 311.8125, "eval_completions/min_length": 264.1666666666667, "eval_kl": 0.10120646158854167, "eval_loss": 0.001030524610541761, "eval_reward": 0.4633673212180535, "eval_reward_std": 0.05363531061448157, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4633673212180535, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05363531123536328, "eval_runtime": 315.9903, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 5650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 298.125, "completions/min_length": 242.0, "epoch": 4.674110835401158, "grad_norm": 0.24957260489463806, "kl": 0.0877685546875, "learning_rate": 1.2887239525625928e-07, "loss": 0.0008769631385803223, "memory(GiB)": 38.15, "reward": 0.5661052465438843, "reward_std": 0.039598822593688965, "rewards/VisualizationJSONCombinedORM/mean": 0.5661052465438843, "rewards/VisualizationJSONCombinedORM/std": 0.1407645046710968, "step": 5651, "train_speed(iter/s)": 0.07181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 313.25, "completions/min_length": 260.0, "epoch": 4.674937965260546, "grad_norm": 0.23021206259727478, "kl": 0.091064453125, "learning_rate": 1.2822185773114447e-07, "loss": 0.0009099841117858887, "memory(GiB)": 38.15, "reward": 0.399986207485199, "reward_std": 0.05677884817123413, "rewards/VisualizationJSONCombinedORM/mean": 0.399986207485199, "rewards/VisualizationJSONCombinedORM/std": 0.17149384319782257, "step": 5652, "train_speed(iter/s)": 0.0718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 313.6875, "completions/min_length": 240.0, "epoch": 4.675765095119933, "grad_norm": 0.15351779758930206, "kl": 0.037353515625, "learning_rate": 1.2757294496684447e-07, "loss": 0.0003725886344909668, "memory(GiB)": 38.15, "reward": 0.4402811825275421, "reward_std": 0.04678349196910858, "rewards/VisualizationJSONCombinedORM/mean": 0.4402811825275421, "rewards/VisualizationJSONCombinedORM/std": 0.11161903291940689, "step": 5653, "train_speed(iter/s)": 0.071788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 312.875, "completions/min_length": 236.0, "epoch": 4.676592224979322, "grad_norm": 0.17635172605514526, "kl": 0.058837890625, "learning_rate": 1.26925657179775e-07, "loss": 0.0005885735154151917, "memory(GiB)": 38.15, "reward": 0.6122442483901978, "reward_std": 0.044240303337574005, "rewards/VisualizationJSONCombinedORM/mean": 0.6122442483901978, "rewards/VisualizationJSONCombinedORM/std": 0.19029708206653595, "step": 5654, "train_speed(iter/s)": 0.071782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 289.0, "completions/min_length": 239.0, "epoch": 4.67741935483871, "grad_norm": 0.18241719901561737, "kl": 0.0224609375, "learning_rate": 1.2627999458580952e-07, "loss": 0.00022380799055099487, "memory(GiB)": 38.15, "reward": 0.7297401428222656, "reward_std": 0.03219516575336456, "rewards/VisualizationJSONCombinedORM/mean": 0.7297401428222656, "rewards/VisualizationJSONCombinedORM/std": 0.11378002166748047, "step": 5655, "train_speed(iter/s)": 0.071771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 316.75, "completions/min_length": 249.0, "epoch": 4.678246484698097, "grad_norm": 0.18711155652999878, "kl": 0.20703125, "learning_rate": 1.2563595740028022e-07, "loss": 0.0020675696432590485, "memory(GiB)": 38.15, "reward": 0.47293031215667725, "reward_std": 0.03904173523187637, "rewards/VisualizationJSONCombinedORM/mean": 0.47293031215667725, "rewards/VisualizationJSONCombinedORM/std": 0.2065446525812149, "step": 5656, "train_speed(iter/s)": 0.071754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 301.8125, "completions/min_length": 243.0, "epoch": 4.679073614557486, "grad_norm": 0.16854141652584076, "kl": 0.0948486328125, "learning_rate": 1.2499354583797473e-07, "loss": 0.0009481050074100494, "memory(GiB)": 38.15, "reward": 0.30549463629722595, "reward_std": 0.018481114879250526, "rewards/VisualizationJSONCombinedORM/mean": 0.30549463629722595, "rewards/VisualizationJSONCombinedORM/std": 0.10450208187103271, "step": 5657, "train_speed(iter/s)": 0.071744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 321.375, "completions/min_length": 285.0, "epoch": 4.679900744416873, "grad_norm": 0.22567515075206757, "kl": 0.1842041015625, "learning_rate": 1.243527601131428e-07, "loss": 0.001847490668296814, "memory(GiB)": 38.15, "reward": 0.7293171882629395, "reward_std": 0.025830483064055443, "rewards/VisualizationJSONCombinedORM/mean": 0.7293171882629395, "rewards/VisualizationJSONCombinedORM/std": 0.07545176148414612, "step": 5658, "train_speed(iter/s)": 0.071734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 276.125, "completions/min_length": 237.0, "epoch": 4.680727874276261, "grad_norm": 0.2736741304397583, "kl": 0.0587158203125, "learning_rate": 1.2371360043948733e-07, "loss": 0.0005873925983905792, "memory(GiB)": 38.15, "reward": 0.4191107749938965, "reward_std": 0.04919329285621643, "rewards/VisualizationJSONCombinedORM/mean": 0.4191107749938965, "rewards/VisualizationJSONCombinedORM/std": 0.05829188600182533, "step": 5659, "train_speed(iter/s)": 0.071721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 326.5625, "completions/min_length": 279.0, "epoch": 4.6815550041356495, "grad_norm": 0.20169585943222046, "kl": 0.060546875, "learning_rate": 1.2307606703017173e-07, "loss": 0.0006060749292373657, "memory(GiB)": 38.15, "reward": 0.4739131033420563, "reward_std": 0.06484789401292801, "rewards/VisualizationJSONCombinedORM/mean": 0.4739131033420563, "rewards/VisualizationJSONCombinedORM/std": 0.20433291792869568, "step": 5660, "train_speed(iter/s)": 0.07171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 288.1875, "completions/min_length": 232.0, "epoch": 4.682382133995037, "grad_norm": 0.19093258678913116, "kl": 0.049560546875, "learning_rate": 1.22440160097817e-07, "loss": 0.00049610435962677, "memory(GiB)": 38.15, "reward": 0.7310314178466797, "reward_std": 0.06588533520698547, "rewards/VisualizationJSONCombinedORM/mean": 0.7310314178466797, "rewards/VisualizationJSONCombinedORM/std": 0.11097943782806396, "step": 5661, "train_speed(iter/s)": 0.0717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 296.6875, "completions/min_length": 243.0, "epoch": 4.683209263854425, "grad_norm": 0.22345811128616333, "kl": 0.18060302734375, "learning_rate": 1.2180587985450077e-07, "loss": 0.0018161088228225708, "memory(GiB)": 38.15, "reward": 0.2928522825241089, "reward_std": 0.04335900396108627, "rewards/VisualizationJSONCombinedORM/mean": 0.2928522825241089, "rewards/VisualizationJSONCombinedORM/std": 0.05929306522011757, "step": 5662, "train_speed(iter/s)": 0.071684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 307.0625, "completions/min_length": 233.0, "epoch": 4.684036393713813, "grad_norm": 0.2064083069562912, "kl": 0.098388671875, "learning_rate": 1.2117322651175766e-07, "loss": 0.0009842319414019585, "memory(GiB)": 38.15, "reward": 0.4505564570426941, "reward_std": 0.0276719368994236, "rewards/VisualizationJSONCombinedORM/mean": 0.4505564570426941, "rewards/VisualizationJSONCombinedORM/std": 0.09846477210521698, "step": 5663, "train_speed(iter/s)": 0.071669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 323.9375, "completions/min_length": 237.0, "epoch": 4.684863523573201, "grad_norm": 0.2805713415145874, "kl": 0.0904541015625, "learning_rate": 1.2054220028058118e-07, "loss": 0.0009055212140083313, "memory(GiB)": 38.15, "reward": 0.6755915880203247, "reward_std": 0.10062801092863083, "rewards/VisualizationJSONCombinedORM/mean": 0.6755915880203247, "rewards/VisualizationJSONCombinedORM/std": 0.09946314245462418, "step": 5664, "train_speed(iter/s)": 0.071656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 311.25, "completions/min_length": 225.0, "epoch": 4.685690653432589, "grad_norm": 0.17192569375038147, "kl": 0.085205078125, "learning_rate": 1.199128013714218e-07, "loss": 0.0008537881076335907, "memory(GiB)": 38.15, "reward": 0.4351031184196472, "reward_std": 0.04961079731583595, "rewards/VisualizationJSONCombinedORM/mean": 0.4351031184196472, "rewards/VisualizationJSONCombinedORM/std": 0.08434755355119705, "step": 5665, "train_speed(iter/s)": 0.071644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 282.375, "completions/min_length": 224.0, "epoch": 4.686517783291977, "grad_norm": 0.1862441897392273, "kl": 0.08624267578125, "learning_rate": 1.1928502999418724e-07, "loss": 0.0008599422872066498, "memory(GiB)": 38.15, "reward": 0.5836373567581177, "reward_std": 0.08614317327737808, "rewards/VisualizationJSONCombinedORM/mean": 0.5836373567581177, "rewards/VisualizationJSONCombinedORM/std": 0.1780329793691635, "step": 5666, "train_speed(iter/s)": 0.071638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 288.25, "completions/min_length": 212.0, "epoch": 4.687344913151365, "grad_norm": 0.20721758902072906, "kl": 0.066650390625, "learning_rate": 1.1865888635823997e-07, "loss": 0.0006646998226642609, "memory(GiB)": 38.15, "reward": 0.5813928842544556, "reward_std": 0.041488006711006165, "rewards/VisualizationJSONCombinedORM/mean": 0.5813928842544556, "rewards/VisualizationJSONCombinedORM/std": 0.050303395837545395, "step": 5667, "train_speed(iter/s)": 0.071628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 322.625, "completions/min_length": 276.0, "epoch": 4.688172043010753, "grad_norm": 0.1819797158241272, "kl": 0.06549072265625, "learning_rate": 1.180343706724052e-07, "loss": 0.000654079020023346, "memory(GiB)": 38.15, "reward": 0.5993272066116333, "reward_std": 0.02498702146112919, "rewards/VisualizationJSONCombinedORM/mean": 0.5993272066116333, "rewards/VisualizationJSONCombinedORM/std": 0.06467032432556152, "step": 5668, "train_speed(iter/s)": 0.071615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 332.625, "completions/min_length": 232.0, "epoch": 4.68899917287014, "grad_norm": 0.19841839373111725, "kl": 0.05377197265625, "learning_rate": 1.1741148314495965e-07, "loss": 0.0005394965410232544, "memory(GiB)": 38.15, "reward": 0.41824865341186523, "reward_std": 0.02916942536830902, "rewards/VisualizationJSONCombinedORM/mean": 0.41824865341186523, "rewards/VisualizationJSONCombinedORM/std": 0.09031809866428375, "step": 5669, "train_speed(iter/s)": 0.071605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 328.75, "completions/min_length": 267.0, "epoch": 4.689826302729529, "grad_norm": 0.20180052518844604, "kl": 0.04681396484375, "learning_rate": 1.1679022398363937e-07, "loss": 0.0004679635167121887, "memory(GiB)": 38.15, "reward": 0.3554545044898987, "reward_std": 0.019837701693177223, "rewards/VisualizationJSONCombinedORM/mean": 0.3554545044898987, "rewards/VisualizationJSONCombinedORM/std": 0.027868816629052162, "step": 5670, "train_speed(iter/s)": 0.071593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 282.5625, "completions/min_length": 232.0, "epoch": 4.690653432588917, "grad_norm": 0.18921180069446564, "kl": 0.0965576171875, "learning_rate": 1.1617059339563807e-07, "loss": 0.0009677521884441376, "memory(GiB)": 38.15, "reward": 0.4399428367614746, "reward_std": 0.13838164508342743, "rewards/VisualizationJSONCombinedORM/mean": 0.4399428367614746, "rewards/VisualizationJSONCombinedORM/std": 0.21336515247821808, "step": 5671, "train_speed(iter/s)": 0.071582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 324.5, "completions/min_length": 261.0, "epoch": 4.691480562448304, "grad_norm": 0.19498960673809052, "kl": 0.08807373046875, "learning_rate": 1.155525915876049e-07, "loss": 0.0008837059140205383, "memory(GiB)": 38.15, "reward": 0.5340478420257568, "reward_std": 0.02359248325228691, "rewards/VisualizationJSONCombinedORM/mean": 0.5340478420257568, "rewards/VisualizationJSONCombinedORM/std": 0.03571360930800438, "step": 5672, "train_speed(iter/s)": 0.071573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 277.6875, "completions/min_length": 225.0, "epoch": 4.6923076923076925, "grad_norm": 0.19118732213974, "kl": 0.06488037109375, "learning_rate": 1.1493621876564554e-07, "loss": 0.0006492882966995239, "memory(GiB)": 38.15, "reward": 0.5376365780830383, "reward_std": 0.020350810140371323, "rewards/VisualizationJSONCombinedORM/mean": 0.5376365780830383, "rewards/VisualizationJSONCombinedORM/std": 0.26242849230766296, "step": 5673, "train_speed(iter/s)": 0.071564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 329.0, "completions/min_length": 246.0, "epoch": 4.69313482216708, "grad_norm": 0.17951273918151855, "kl": 0.11865234375, "learning_rate": 1.1432147513532499e-07, "loss": 0.0011905375868082047, "memory(GiB)": 38.15, "reward": 0.5607970356941223, "reward_std": 0.03152426332235336, "rewards/VisualizationJSONCombinedORM/mean": 0.5607970356941223, "rewards/VisualizationJSONCombinedORM/std": 0.2456095814704895, "step": 5674, "train_speed(iter/s)": 0.071552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 294.5625, "completions/min_length": 251.0, "epoch": 4.693961952026468, "grad_norm": 0.2291938066482544, "kl": 0.05572509765625, "learning_rate": 1.1370836090166204e-07, "loss": 0.0005573667585849762, "memory(GiB)": 38.15, "reward": 0.550514817237854, "reward_std": 0.05048733204603195, "rewards/VisualizationJSONCombinedORM/mean": 0.550514817237854, "rewards/VisualizationJSONCombinedORM/std": 0.11815127730369568, "step": 5675, "train_speed(iter/s)": 0.071543 }, { "epoch": 4.693961952026468, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 371.4166666666667, "eval_completions/mean_length": 309.0833333333333, "eval_completions/min_length": 261.75, "eval_kl": 0.0909423828125, "eval_loss": 0.0009215809404850006, "eval_reward": 0.4537342997888724, "eval_reward_std": 0.05266148866697525, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4537342997888724, "eval_rewards/VisualizationJSONCombinedORM/std": 0.052661492043019585, "eval_runtime": 316.1063, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 5675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 279.8125, "completions/min_length": 241.0, "epoch": 4.694789081885856, "grad_norm": 0.24260132014751434, "kl": 0.05047607421875, "learning_rate": 1.1309687626913312e-07, "loss": 0.0005052760243415833, "memory(GiB)": 38.15, "reward": 0.5200050473213196, "reward_std": 0.07554098963737488, "rewards/VisualizationJSONCombinedORM/mean": 0.5200050473213196, "rewards/VisualizationJSONCombinedORM/std": 0.22035667300224304, "step": 5676, "train_speed(iter/s)": 0.071249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 314.0, "completions/min_length": 259.0, "epoch": 4.695616211745244, "grad_norm": 0.14361080527305603, "kl": 0.047271728515625, "learning_rate": 1.1248702144167123e-07, "loss": 0.00047479942440986633, "memory(GiB)": 38.15, "reward": 0.6860417723655701, "reward_std": 0.04569288715720177, "rewards/VisualizationJSONCombinedORM/mean": 0.6860417723655701, "rewards/VisualizationJSONCombinedORM/std": 0.08640150725841522, "step": 5677, "train_speed(iter/s)": 0.071239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 309.75, "completions/min_length": 233.0, "epoch": 4.696443341604632, "grad_norm": 0.1793128401041031, "kl": 0.07513427734375, "learning_rate": 1.1187879662266699e-07, "loss": 0.0007496923208236694, "memory(GiB)": 38.15, "reward": 0.405826210975647, "reward_std": 0.021134858950972557, "rewards/VisualizationJSONCombinedORM/mean": 0.405826210975647, "rewards/VisualizationJSONCombinedORM/std": 0.029451757669448853, "step": 5678, "train_speed(iter/s)": 0.071228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 310.625, "completions/min_length": 232.0, "epoch": 4.697270471464019, "grad_norm": 0.18915098905563354, "kl": 0.1220703125, "learning_rate": 1.1127220201496425e-07, "loss": 0.0012215226888656616, "memory(GiB)": 38.15, "reward": 0.5811630487442017, "reward_std": 0.0632111132144928, "rewards/VisualizationJSONCombinedORM/mean": 0.5811630487442017, "rewards/VisualizationJSONCombinedORM/std": 0.10325521230697632, "step": 5679, "train_speed(iter/s)": 0.071216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 307.25, "completions/min_length": 256.0, "epoch": 4.698097601323408, "grad_norm": 0.18200501799583435, "kl": 0.04437255859375, "learning_rate": 1.1066723782086619e-07, "loss": 0.00044348835945129395, "memory(GiB)": 38.15, "reward": 0.49595969915390015, "reward_std": 0.0397050715982914, "rewards/VisualizationJSONCombinedORM/mean": 0.49595969915390015, "rewards/VisualizationJSONCombinedORM/std": 0.1542471945285797, "step": 5680, "train_speed(iter/s)": 0.071203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 334.5, "completions/min_length": 284.0, "epoch": 4.698924731182796, "grad_norm": 0.22696064412593842, "kl": 0.062255859375, "learning_rate": 1.1006390424213143e-07, "loss": 0.0006230855360627174, "memory(GiB)": 38.15, "reward": 0.47144556045532227, "reward_std": 0.05221770331263542, "rewards/VisualizationJSONCombinedORM/mean": 0.47144556045532227, "rewards/VisualizationJSONCombinedORM/std": 0.1156165599822998, "step": 5681, "train_speed(iter/s)": 0.071188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 324.625, "completions/min_length": 242.0, "epoch": 4.699751861042183, "grad_norm": 0.2058897167444229, "kl": 0.0936279296875, "learning_rate": 1.0946220147997455e-07, "loss": 0.0009354930371046066, "memory(GiB)": 38.15, "reward": 0.5021405220031738, "reward_std": 0.04899358004331589, "rewards/VisualizationJSONCombinedORM/mean": 0.5021405220031738, "rewards/VisualizationJSONCombinedORM/std": 0.13244690001010895, "step": 5682, "train_speed(iter/s)": 0.071176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 296.8125, "completions/min_length": 238.0, "epoch": 4.700578990901572, "grad_norm": 0.25883474946022034, "kl": 0.2122802734375, "learning_rate": 1.0886212973506504e-07, "loss": 0.002116832882165909, "memory(GiB)": 38.15, "reward": 0.5354632139205933, "reward_std": 0.026335187256336212, "rewards/VisualizationJSONCombinedORM/mean": 0.5354632139205933, "rewards/VisualizationJSONCombinedORM/std": 0.2990102469921112, "step": 5683, "train_speed(iter/s)": 0.071166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 313.75, "completions/min_length": 250.0, "epoch": 4.701406120760959, "grad_norm": 0.18233728408813477, "kl": 0.0867919921875, "learning_rate": 1.0826368920753172e-07, "loss": 0.0008669383823871613, "memory(GiB)": 38.15, "reward": 0.7156957387924194, "reward_std": 0.04283160716295242, "rewards/VisualizationJSONCombinedORM/mean": 0.7156957387924194, "rewards/VisualizationJSONCombinedORM/std": 0.15017050504684448, "step": 5684, "train_speed(iter/s)": 0.07116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 306.3125, "completions/min_length": 240.0, "epoch": 4.702233250620347, "grad_norm": 0.1933455467224121, "kl": 0.1378173828125, "learning_rate": 1.0766688009695548e-07, "loss": 0.0013775229454040527, "memory(GiB)": 38.15, "reward": 0.6968642473220825, "reward_std": 0.07769216597080231, "rewards/VisualizationJSONCombinedORM/mean": 0.6968642473220825, "rewards/VisualizationJSONCombinedORM/std": 0.09920405596494675, "step": 5685, "train_speed(iter/s)": 0.071148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 278.1875, "completions/min_length": 223.0, "epoch": 4.7030603804797355, "grad_norm": 0.20484225451946259, "kl": 0.05889892578125, "learning_rate": 1.0707170260237543e-07, "loss": 0.000589124858379364, "memory(GiB)": 38.15, "reward": 0.5244313478469849, "reward_std": 0.059512149542570114, "rewards/VisualizationJSONCombinedORM/mean": 0.5244313478469849, "rewards/VisualizationJSONCombinedORM/std": 0.13762708008289337, "step": 5686, "train_speed(iter/s)": 0.071134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 309.75, "completions/min_length": 234.0, "epoch": 4.703887510339123, "grad_norm": 0.21297718584537506, "kl": 0.071533203125, "learning_rate": 1.0647815692228614e-07, "loss": 0.000715106725692749, "memory(GiB)": 38.15, "reward": 0.5056732296943665, "reward_std": 0.06688271462917328, "rewards/VisualizationJSONCombinedORM/mean": 0.5056732296943665, "rewards/VisualizationJSONCombinedORM/std": 0.2741331458091736, "step": 5687, "train_speed(iter/s)": 0.071122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 316.125, "completions/min_length": 244.0, "epoch": 4.704714640198511, "grad_norm": 0.1782342493534088, "kl": 0.125, "learning_rate": 1.058862432546387e-07, "loss": 0.001248735934495926, "memory(GiB)": 38.15, "reward": 0.3429214060306549, "reward_std": 0.020856890827417374, "rewards/VisualizationJSONCombinedORM/mean": 0.3429214060306549, "rewards/VisualizationJSONCombinedORM/std": 0.13025864958763123, "step": 5688, "train_speed(iter/s)": 0.071109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 276.9375, "completions/min_length": 236.0, "epoch": 4.705541770057899, "grad_norm": 0.19593271613121033, "kl": 0.104248046875, "learning_rate": 1.0529596179683743e-07, "loss": 0.0010402724146842957, "memory(GiB)": 38.15, "reward": 0.42774564027786255, "reward_std": 0.053501538932323456, "rewards/VisualizationJSONCombinedORM/mean": 0.42774564027786255, "rewards/VisualizationJSONCombinedORM/std": 0.09065086394548416, "step": 5689, "train_speed(iter/s)": 0.071097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 297.4375, "completions/min_length": 244.0, "epoch": 4.706368899917287, "grad_norm": 0.1621520221233368, "kl": 0.036285400390625, "learning_rate": 1.0470731274574542e-07, "loss": 0.0003636553883552551, "memory(GiB)": 38.15, "reward": 0.6297742128372192, "reward_std": 0.05425863713026047, "rewards/VisualizationJSONCombinedORM/mean": 0.6297742128372192, "rewards/VisualizationJSONCombinedORM/std": 0.16918331384658813, "step": 5690, "train_speed(iter/s)": 0.071085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 319.3125, "completions/min_length": 266.0, "epoch": 4.707196029776675, "grad_norm": 0.19869546592235565, "kl": 0.076904296875, "learning_rate": 1.0412029629767895e-07, "loss": 0.0007673949003219604, "memory(GiB)": 38.15, "reward": 0.6020397543907166, "reward_std": 0.06323456019163132, "rewards/VisualizationJSONCombinedORM/mean": 0.6020397543907166, "rewards/VisualizationJSONCombinedORM/std": 0.07079694420099258, "step": 5691, "train_speed(iter/s)": 0.071075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 322.3125, "completions/min_length": 258.0, "epoch": 4.708023159636063, "grad_norm": 0.16897441446781158, "kl": 0.04754638671875, "learning_rate": 1.0353491264841143e-07, "loss": 0.0004745312035083771, "memory(GiB)": 38.15, "reward": 0.6354731321334839, "reward_std": 0.05724582076072693, "rewards/VisualizationJSONCombinedORM/mean": 0.6354731321334839, "rewards/VisualizationJSONCombinedORM/std": 0.10902637988328934, "step": 5692, "train_speed(iter/s)": 0.071066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/mean_length": 319.1875, "completions/min_length": 227.0, "epoch": 4.708850289495451, "grad_norm": 0.202677920460701, "kl": 0.079833984375, "learning_rate": 1.0295116199317057e-07, "loss": 0.0007989741861820221, "memory(GiB)": 38.15, "reward": 0.6303942799568176, "reward_std": 0.06804421544075012, "rewards/VisualizationJSONCombinedORM/mean": 0.6303942799568176, "rewards/VisualizationJSONCombinedORM/std": 0.09123514592647552, "step": 5693, "train_speed(iter/s)": 0.071054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 304.4375, "completions/min_length": 215.0, "epoch": 4.709677419354839, "grad_norm": 0.2024940848350525, "kl": 0.1285400390625, "learning_rate": 1.0236904452664065e-07, "loss": 0.0012866761535406113, "memory(GiB)": 38.15, "reward": 0.3805742561817169, "reward_std": 0.03631160408258438, "rewards/VisualizationJSONCombinedORM/mean": 0.3805742561817169, "rewards/VisualizationJSONCombinedORM/std": 0.11464335769414902, "step": 5694, "train_speed(iter/s)": 0.071048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 324.9375, "completions/min_length": 262.0, "epoch": 4.710504549214226, "grad_norm": 0.17197442054748535, "kl": 0.05877685546875, "learning_rate": 1.0178856044295971e-07, "loss": 0.00058794766664505, "memory(GiB)": 38.15, "reward": 0.435156911611557, "reward_std": 0.02782581001520157, "rewards/VisualizationJSONCombinedORM/mean": 0.435156911611557, "rewards/VisualizationJSONCombinedORM/std": 0.19417588412761688, "step": 5695, "train_speed(iter/s)": 0.071036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 307.8125, "completions/min_length": 237.0, "epoch": 4.711331679073615, "grad_norm": 0.22615273296833038, "kl": 0.1153564453125, "learning_rate": 1.0120970993572232e-07, "loss": 0.0011531859636306763, "memory(GiB)": 38.15, "reward": 0.47511470317840576, "reward_std": 0.057113420218229294, "rewards/VisualizationJSONCombinedORM/mean": 0.47511470317840576, "rewards/VisualizationJSONCombinedORM/std": 0.13812676072120667, "step": 5696, "train_speed(iter/s)": 0.071028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 341.5625, "completions/min_length": 253.0, "epoch": 4.712158808933003, "grad_norm": 0.17479157447814941, "kl": 0.0625, "learning_rate": 1.0063249319797741e-07, "loss": 0.0006244629621505737, "memory(GiB)": 38.15, "reward": 0.47765040397644043, "reward_std": 0.05578617751598358, "rewards/VisualizationJSONCombinedORM/mean": 0.47765040397644043, "rewards/VisualizationJSONCombinedORM/std": 0.11289538443088531, "step": 5697, "train_speed(iter/s)": 0.071014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 312.25, "completions/min_length": 247.0, "epoch": 4.71298593879239, "grad_norm": 0.18228930234909058, "kl": 0.162841796875, "learning_rate": 1.0005691042223042e-07, "loss": 0.001630915328860283, "memory(GiB)": 38.15, "reward": 0.500304102897644, "reward_std": 0.02150457724928856, "rewards/VisualizationJSONCombinedORM/mean": 0.500304102897644, "rewards/VisualizationJSONCombinedORM/std": 0.03559169918298721, "step": 5698, "train_speed(iter/s)": 0.071003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 348.625, "completions/min_length": 262.0, "epoch": 4.7138130686517785, "grad_norm": 0.19545607268810272, "kl": 0.0374755859375, "learning_rate": 9.948296180043949e-08, "loss": 0.0003744177520275116, "memory(GiB)": 38.15, "reward": 0.5836831331253052, "reward_std": 0.06783310323953629, "rewards/VisualizationJSONCombinedORM/mean": 0.5836831331253052, "rewards/VisualizationJSONCombinedORM/std": 0.08575338870286942, "step": 5699, "train_speed(iter/s)": 0.070989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 298.9375, "completions/min_length": 234.0, "epoch": 4.714640198511166, "grad_norm": 0.17033490538597107, "kl": 0.072021484375, "learning_rate": 9.891064752402091e-08, "loss": 0.0007196962833404541, "memory(GiB)": 38.15, "reward": 0.6155960559844971, "reward_std": 0.040709156543016434, "rewards/VisualizationJSONCombinedORM/mean": 0.6155960559844971, "rewards/VisualizationJSONCombinedORM/std": 0.17051182687282562, "step": 5700, "train_speed(iter/s)": 0.070976 }, { "epoch": 4.714640198511166, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 367.25, "eval_completions/mean_length": 312.2135416666667, "eval_completions/min_length": 260.25, "eval_kl": 0.082183837890625, "eval_loss": 0.0008290236000902951, "eval_reward": 0.45809767084817093, "eval_reward_std": 0.05246983099884043, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45809767084817093, "eval_rewards/VisualizationJSONCombinedORM/std": 0.052469831076450646, "eval_runtime": 313.9904, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 5700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 287.0625, "completions/min_length": 233.0, "epoch": 4.715467328370554, "grad_norm": 0.1655576229095459, "kl": 0.04681396484375, "learning_rate": 9.833996778384259e-08, "loss": 0.0004684962332248688, "memory(GiB)": 38.15, "reward": 0.7639601826667786, "reward_std": 0.0600237175822258, "rewards/VisualizationJSONCombinedORM/mean": 0.7639601826667786, "rewards/VisualizationJSONCombinedORM/std": 0.05804039537906647, "step": 5701, "train_speed(iter/s)": 0.070687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 276.625, "completions/min_length": 214.0, "epoch": 4.716294458229942, "grad_norm": 0.19676144421100616, "kl": 0.033782958984375, "learning_rate": 9.777092277022948e-08, "loss": 0.0003370586782693863, "memory(GiB)": 38.15, "reward": 0.5150700807571411, "reward_std": 0.06035744026303291, "rewards/VisualizationJSONCombinedORM/mean": 0.5150700807571411, "rewards/VisualizationJSONCombinedORM/std": 0.05864459276199341, "step": 5702, "train_speed(iter/s)": 0.070671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 291.9375, "completions/min_length": 225.0, "epoch": 4.71712158808933, "grad_norm": 0.2053157389163971, "kl": 0.07171630859375, "learning_rate": 9.72035126729609e-08, "loss": 0.0007159747183322906, "memory(GiB)": 38.15, "reward": 0.45442497730255127, "reward_std": 0.04834767058491707, "rewards/VisualizationJSONCombinedORM/mean": 0.45442497730255127, "rewards/VisualizationJSONCombinedORM/std": 0.04718228802084923, "step": 5703, "train_speed(iter/s)": 0.070658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 300.0, "completions/min_length": 239.0, "epoch": 4.717948717948718, "grad_norm": 0.17809878289699554, "kl": 0.0687255859375, "learning_rate": 9.663773768127105e-08, "loss": 0.0006868895143270493, "memory(GiB)": 38.15, "reward": 0.47855639457702637, "reward_std": 0.06856115162372589, "rewards/VisualizationJSONCombinedORM/mean": 0.47855639457702637, "rewards/VisualizationJSONCombinedORM/std": 0.1770787537097931, "step": 5704, "train_speed(iter/s)": 0.070645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 299.5, "completions/min_length": 225.0, "epoch": 4.718775847808105, "grad_norm": 0.18694305419921875, "kl": 0.08984375, "learning_rate": 9.607359798384785e-08, "loss": 0.0008993819355964661, "memory(GiB)": 38.15, "reward": 0.578531801700592, "reward_std": 0.03492514416575432, "rewards/VisualizationJSONCombinedORM/mean": 0.578531801700592, "rewards/VisualizationJSONCombinedORM/std": 0.19191163778305054, "step": 5705, "train_speed(iter/s)": 0.070638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 302.0625, "completions/min_length": 241.0, "epoch": 4.719602977667494, "grad_norm": 0.1850908100605011, "kl": 0.13079833984375, "learning_rate": 9.551109376883471e-08, "loss": 0.001307288184762001, "memory(GiB)": 38.15, "reward": 0.6646163463592529, "reward_std": 0.04679550975561142, "rewards/VisualizationJSONCombinedORM/mean": 0.6646163463592529, "rewards/VisualizationJSONCombinedORM/std": 0.0989503413438797, "step": 5706, "train_speed(iter/s)": 0.070627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 298.75, "completions/min_length": 252.0, "epoch": 4.720430107526882, "grad_norm": 0.1669101119041443, "kl": 0.0743408203125, "learning_rate": 9.495022522383046e-08, "loss": 0.0007424652576446533, "memory(GiB)": 38.15, "reward": 0.34086865186691284, "reward_std": 0.019795304164290428, "rewards/VisualizationJSONCombinedORM/mean": 0.34086865186691284, "rewards/VisualizationJSONCombinedORM/std": 0.028731022030115128, "step": 5707, "train_speed(iter/s)": 0.070619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 304.8125, "completions/min_length": 250.0, "epoch": 4.721257237386269, "grad_norm": 0.19935216009616852, "kl": 0.078125, "learning_rate": 9.439099253588546e-08, "loss": 0.0007817819714546204, "memory(GiB)": 38.15, "reward": 0.45189425349235535, "reward_std": 0.04259571060538292, "rewards/VisualizationJSONCombinedORM/mean": 0.45189425349235535, "rewards/VisualizationJSONCombinedORM/std": 0.17686370015144348, "step": 5708, "train_speed(iter/s)": 0.07061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 295.25, "completions/min_length": 230.0, "epoch": 4.722084367245658, "grad_norm": 0.24298475682735443, "kl": 0.103240966796875, "learning_rate": 9.383339589150776e-08, "loss": 0.0010367333889007568, "memory(GiB)": 38.15, "reward": 0.766706109046936, "reward_std": 0.06966787576675415, "rewards/VisualizationJSONCombinedORM/mean": 0.766706109046936, "rewards/VisualizationJSONCombinedORM/std": 0.1239752471446991, "step": 5709, "train_speed(iter/s)": 0.070604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 311.25, "completions/min_length": 257.0, "epoch": 4.722911497105046, "grad_norm": 0.232517808675766, "kl": 0.1566162109375, "learning_rate": 9.327743547665858e-08, "loss": 0.0015664920210838318, "memory(GiB)": 38.15, "reward": 0.40533316135406494, "reward_std": 0.035333096981048584, "rewards/VisualizationJSONCombinedORM/mean": 0.40533316135406494, "rewards/VisualizationJSONCombinedORM/std": 0.10543134063482285, "step": 5710, "train_speed(iter/s)": 0.070588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 314.8125, "completions/min_length": 257.0, "epoch": 4.723738626964433, "grad_norm": 0.20663531124591827, "kl": 0.08624267578125, "learning_rate": 9.272311147675295e-08, "loss": 0.0008620526641607285, "memory(GiB)": 38.15, "reward": 0.42787477374076843, "reward_std": 0.06082333251833916, "rewards/VisualizationJSONCombinedORM/mean": 0.42787477374076843, "rewards/VisualizationJSONCombinedORM/std": 0.21159252524375916, "step": 5711, "train_speed(iter/s)": 0.070578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 325.375, "completions/min_length": 245.0, "epoch": 4.7245657568238215, "grad_norm": 0.17847172915935516, "kl": 0.04327392578125, "learning_rate": 9.21704240766591e-08, "loss": 0.00043259933590888977, "memory(GiB)": 38.15, "reward": 0.6995376348495483, "reward_std": 0.08013425022363663, "rewards/VisualizationJSONCombinedORM/mean": 0.6995376348495483, "rewards/VisualizationJSONCombinedORM/std": 0.11224240064620972, "step": 5712, "train_speed(iter/s)": 0.070568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 307.0625, "completions/min_length": 254.0, "epoch": 4.725392886683209, "grad_norm": 0.1772376298904419, "kl": 0.07379150390625, "learning_rate": 9.161937346070404e-08, "loss": 0.0007381662726402283, "memory(GiB)": 38.15, "reward": 0.7001855969429016, "reward_std": 0.06780403107404709, "rewards/VisualizationJSONCombinedORM/mean": 0.7001855969429016, "rewards/VisualizationJSONCombinedORM/std": 0.06730683147907257, "step": 5713, "train_speed(iter/s)": 0.070561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 307.625, "completions/min_length": 204.0, "epoch": 4.726220016542597, "grad_norm": 0.271034836769104, "kl": 0.21258544921875, "learning_rate": 9.106995981266298e-08, "loss": 0.0021224981173872948, "memory(GiB)": 38.15, "reward": 0.39319083094596863, "reward_std": 0.05376196652650833, "rewards/VisualizationJSONCombinedORM/mean": 0.39319083094596863, "rewards/VisualizationJSONCombinedORM/std": 0.1491764783859253, "step": 5714, "train_speed(iter/s)": 0.070547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 307.75, "completions/min_length": 234.0, "epoch": 4.727047146401985, "grad_norm": 0.15897603332996368, "kl": 0.0498046875, "learning_rate": 9.052218331576878e-08, "loss": 0.0004989653825759888, "memory(GiB)": 38.15, "reward": 0.5130364894866943, "reward_std": 0.03565920144319534, "rewards/VisualizationJSONCombinedORM/mean": 0.5130364894866943, "rewards/VisualizationJSONCombinedORM/std": 0.2059629112482071, "step": 5715, "train_speed(iter/s)": 0.070536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 320.5, "completions/min_length": 269.0, "epoch": 4.727874276261373, "grad_norm": 0.1867835372686386, "kl": 0.1099853515625, "learning_rate": 8.997604415270756e-08, "loss": 0.0010983701795339584, "memory(GiB)": 38.15, "reward": 0.37296563386917114, "reward_std": 0.035719454288482666, "rewards/VisualizationJSONCombinedORM/mean": 0.37296563386917114, "rewards/VisualizationJSONCombinedORM/std": 0.16098414361476898, "step": 5716, "train_speed(iter/s)": 0.070525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 312.1875, "completions/min_length": 253.0, "epoch": 4.728701406120761, "grad_norm": 0.22821222245693207, "kl": 0.04986572265625, "learning_rate": 8.943154250562025e-08, "loss": 0.0004978552460670471, "memory(GiB)": 38.15, "reward": 0.5387733578681946, "reward_std": 0.02956363372504711, "rewards/VisualizationJSONCombinedORM/mean": 0.5387733578681946, "rewards/VisualizationJSONCombinedORM/std": 0.17926768958568573, "step": 5717, "train_speed(iter/s)": 0.070514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 311.75, "completions/min_length": 245.0, "epoch": 4.729528535980149, "grad_norm": 0.16031494736671448, "kl": 0.1141357421875, "learning_rate": 8.888867855609884e-08, "loss": 0.0011438950896263123, "memory(GiB)": 38.15, "reward": 0.48884081840515137, "reward_std": 0.04769744351506233, "rewards/VisualizationJSONCombinedORM/mean": 0.48884081840515137, "rewards/VisualizationJSONCombinedORM/std": 0.22294901311397552, "step": 5718, "train_speed(iter/s)": 0.070504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 326.5625, "completions/min_length": 249.0, "epoch": 4.730355665839537, "grad_norm": 0.271697461605072, "kl": 0.0762939453125, "learning_rate": 8.834745248519183e-08, "loss": 0.0007624775171279907, "memory(GiB)": 38.15, "reward": 0.4721909165382385, "reward_std": 0.06412012875080109, "rewards/VisualizationJSONCombinedORM/mean": 0.4721909165382385, "rewards/VisualizationJSONCombinedORM/std": 0.22070075571537018, "step": 5719, "train_speed(iter/s)": 0.070491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 319.1875, "completions/min_length": 270.0, "epoch": 4.731182795698925, "grad_norm": 0.17933551967144012, "kl": 0.06317138671875, "learning_rate": 8.780786447340095e-08, "loss": 0.0006315149366855621, "memory(GiB)": 38.15, "reward": 0.434318870306015, "reward_std": 0.0396578349173069, "rewards/VisualizationJSONCombinedORM/mean": 0.434318870306015, "rewards/VisualizationJSONCombinedORM/std": 0.04928988590836525, "step": 5720, "train_speed(iter/s)": 0.070484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 317.4375, "completions/min_length": 233.0, "epoch": 4.732009925558312, "grad_norm": 0.16947805881500244, "kl": 0.101318359375, "learning_rate": 8.726991470068169e-08, "loss": 0.001016169786453247, "memory(GiB)": 38.15, "reward": 0.7152379751205444, "reward_std": 0.08528399467468262, "rewards/VisualizationJSONCombinedORM/mean": 0.7152379751205444, "rewards/VisualizationJSONCombinedORM/std": 0.11714193969964981, "step": 5721, "train_speed(iter/s)": 0.070469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 326.8125, "completions/min_length": 257.0, "epoch": 4.732837055417701, "grad_norm": 0.20237348973751068, "kl": 0.063232421875, "learning_rate": 8.67336033464411e-08, "loss": 0.0006333291530609131, "memory(GiB)": 38.15, "reward": 0.49867963790893555, "reward_std": 0.05061463266611099, "rewards/VisualizationJSONCombinedORM/mean": 0.49867963790893555, "rewards/VisualizationJSONCombinedORM/std": 0.050730835646390915, "step": 5722, "train_speed(iter/s)": 0.070457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 298.5625, "completions/min_length": 249.0, "epoch": 4.733664185277089, "grad_norm": 0.2202017903327942, "kl": 0.0771484375, "learning_rate": 8.619893058954388e-08, "loss": 0.0007700212299823761, "memory(GiB)": 38.15, "reward": 0.5666409134864807, "reward_std": 0.04430375248193741, "rewards/VisualizationJSONCombinedORM/mean": 0.5666409134864807, "rewards/VisualizationJSONCombinedORM/std": 0.07523935288190842, "step": 5723, "train_speed(iter/s)": 0.070447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 289.5, "completions/min_length": 208.0, "epoch": 4.734491315136476, "grad_norm": 0.252593457698822, "kl": 0.089599609375, "learning_rate": 8.566589660830349e-08, "loss": 0.0008943779394030571, "memory(GiB)": 38.15, "reward": 0.5950499773025513, "reward_std": 0.06794343143701553, "rewards/VisualizationJSONCombinedORM/mean": 0.5950499773025513, "rewards/VisualizationJSONCombinedORM/std": 0.1469244509935379, "step": 5724, "train_speed(iter/s)": 0.070436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 313.0625, "completions/min_length": 260.0, "epoch": 4.7353184449958645, "grad_norm": 0.21872201561927795, "kl": 0.0908203125, "learning_rate": 8.513450158049109e-08, "loss": 0.0009076297283172607, "memory(GiB)": 38.15, "reward": 0.6356005668640137, "reward_std": 0.09777912497520447, "rewards/VisualizationJSONCombinedORM/mean": 0.6356005668640137, "rewards/VisualizationJSONCombinedORM/std": 0.15764793753623962, "step": 5725, "train_speed(iter/s)": 0.070425 }, { "epoch": 4.7353184449958645, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 368.625, "eval_completions/mean_length": 308.9375, "eval_completions/min_length": 256.5833333333333, "eval_kl": 0.08983357747395833, "eval_loss": 0.0009025285835377872, "eval_reward": 0.4510270847628514, "eval_reward_std": 0.053181713446974754, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4510270847628514, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05318171224401643, "eval_runtime": 314.9121, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 5725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 317.0, "completions/min_length": 267.0, "epoch": 4.736145574855252, "grad_norm": 0.20250427722930908, "kl": 0.0863037109375, "learning_rate": 8.460474568332877e-08, "loss": 0.0008615292608737946, "memory(GiB)": 38.15, "reward": 0.615973174571991, "reward_std": 0.06111548840999603, "rewards/VisualizationJSONCombinedORM/mean": 0.615973174571991, "rewards/VisualizationJSONCombinedORM/std": 0.24232271313667297, "step": 5726, "train_speed(iter/s)": 0.070142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/mean_length": 318.375, "completions/min_length": 253.0, "epoch": 4.73697270471464, "grad_norm": 0.23039589822292328, "kl": 0.065185546875, "learning_rate": 8.407662909349246e-08, "loss": 0.0006522573530673981, "memory(GiB)": 38.15, "reward": 0.36912816762924194, "reward_std": 0.0565231516957283, "rewards/VisualizationJSONCombinedORM/mean": 0.36912816762924194, "rewards/VisualizationJSONCombinedORM/std": 0.13349702954292297, "step": 5727, "train_speed(iter/s)": 0.070126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 333.125, "completions/min_length": 243.0, "epoch": 4.737799834574028, "grad_norm": 0.1773066371679306, "kl": 0.036102294921875, "learning_rate": 8.355015198711125e-08, "loss": 0.0003606155514717102, "memory(GiB)": 38.15, "reward": 0.40927770733833313, "reward_std": 0.050628650933504105, "rewards/VisualizationJSONCombinedORM/mean": 0.40927770733833313, "rewards/VisualizationJSONCombinedORM/std": 0.19645339250564575, "step": 5728, "train_speed(iter/s)": 0.070113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 311.75, "completions/min_length": 248.0, "epoch": 4.738626964433416, "grad_norm": 0.24919240176677704, "kl": 0.1083984375, "learning_rate": 8.302531453976914e-08, "loss": 0.0010853782296180725, "memory(GiB)": 38.15, "reward": 0.40096691250801086, "reward_std": 0.046839114278554916, "rewards/VisualizationJSONCombinedORM/mean": 0.40096691250801086, "rewards/VisualizationJSONCombinedORM/std": 0.10850134491920471, "step": 5729, "train_speed(iter/s)": 0.070104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 335.6875, "completions/min_length": 245.0, "epoch": 4.739454094292804, "grad_norm": 0.1917967051267624, "kl": 0.047119140625, "learning_rate": 8.250211692650001e-08, "loss": 0.0004712417721748352, "memory(GiB)": 38.15, "reward": 0.4630899131298065, "reward_std": 0.04669308289885521, "rewards/VisualizationJSONCombinedORM/mean": 0.4630899131298065, "rewards/VisualizationJSONCombinedORM/std": 0.07569440454244614, "step": 5730, "train_speed(iter/s)": 0.070093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 313.3125, "completions/min_length": 228.0, "epoch": 4.740281224152191, "grad_norm": 0.17630743980407715, "kl": 0.054931640625, "learning_rate": 8.198055932179372e-08, "loss": 0.0005500763654708862, "memory(GiB)": 38.15, "reward": 0.6927484273910522, "reward_std": 0.06162739545106888, "rewards/VisualizationJSONCombinedORM/mean": 0.6927484273910522, "rewards/VisualizationJSONCombinedORM/std": 0.11832942813634872, "step": 5731, "train_speed(iter/s)": 0.070085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 330.3125, "completions/min_length": 264.0, "epoch": 4.74110835401158, "grad_norm": 0.17598681151866913, "kl": 0.1241455078125, "learning_rate": 8.146064189959168e-08, "loss": 0.0012434571981430054, "memory(GiB)": 38.15, "reward": 0.4540483355522156, "reward_std": 0.03153381496667862, "rewards/VisualizationJSONCombinedORM/mean": 0.4540483355522156, "rewards/VisualizationJSONCombinedORM/std": 0.0487394742667675, "step": 5732, "train_speed(iter/s)": 0.070072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 309.6875, "completions/min_length": 240.0, "epoch": 4.741935483870968, "grad_norm": 0.19214916229248047, "kl": 0.08349609375, "learning_rate": 8.094236483329022e-08, "loss": 0.0008352845907211304, "memory(GiB)": 38.15, "reward": 0.21292567253112793, "reward_std": 0.02456953004002571, "rewards/VisualizationJSONCombinedORM/mean": 0.21292567253112793, "rewards/VisualizationJSONCombinedORM/std": 0.024006551131606102, "step": 5733, "train_speed(iter/s)": 0.070063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 307.75, "completions/min_length": 242.0, "epoch": 4.742762613730355, "grad_norm": 0.18320715427398682, "kl": 0.0948486328125, "learning_rate": 8.042572829573492e-08, "loss": 0.0009486675262451172, "memory(GiB)": 38.15, "reward": 0.4655998647212982, "reward_std": 0.03450736403465271, "rewards/VisualizationJSONCombinedORM/mean": 0.4655998647212982, "rewards/VisualizationJSONCombinedORM/std": 0.22165663540363312, "step": 5734, "train_speed(iter/s)": 0.070054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 288.3125, "completions/min_length": 245.0, "epoch": 4.743589743589744, "grad_norm": 0.28142961859703064, "kl": 0.0501708984375, "learning_rate": 7.991073245922798e-08, "loss": 0.0005016019567847252, "memory(GiB)": 38.15, "reward": 0.5435259938240051, "reward_std": 0.03457561507821083, "rewards/VisualizationJSONCombinedORM/mean": 0.5435259938240051, "rewards/VisualizationJSONCombinedORM/std": 0.23846964538097382, "step": 5735, "train_speed(iter/s)": 0.070043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 338.5, "completions/min_length": 248.0, "epoch": 4.744416873449132, "grad_norm": 0.1638564169406891, "kl": 0.0523681640625, "learning_rate": 7.939737749552257e-08, "loss": 0.0005252622067928314, "memory(GiB)": 38.15, "reward": 0.5093191862106323, "reward_std": 0.022587507963180542, "rewards/VisualizationJSONCombinedORM/mean": 0.5093191862106323, "rewards/VisualizationJSONCombinedORM/std": 0.29842251539230347, "step": 5736, "train_speed(iter/s)": 0.070033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 326.4375, "completions/min_length": 248.0, "epoch": 4.745244003308519, "grad_norm": 0.18062910437583923, "kl": 0.1002197265625, "learning_rate": 7.888566357582506e-08, "loss": 0.0010018087923526764, "memory(GiB)": 38.15, "reward": 0.6825089454650879, "reward_std": 0.0392133891582489, "rewards/VisualizationJSONCombinedORM/mean": 0.6825089454650879, "rewards/VisualizationJSONCombinedORM/std": 0.05699506774544716, "step": 5737, "train_speed(iter/s)": 0.070021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 353.5, "completions/min_length": 290.0, "epoch": 4.7460711331679075, "grad_norm": 0.21136847138404846, "kl": 0.10693359375, "learning_rate": 7.837559087079394e-08, "loss": 0.0010677799582481384, "memory(GiB)": 38.15, "reward": 0.32699257135391235, "reward_std": 0.03081350401043892, "rewards/VisualizationJSONCombinedORM/mean": 0.32699257135391235, "rewards/VisualizationJSONCombinedORM/std": 0.12836213409900665, "step": 5738, "train_speed(iter/s)": 0.07001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 310.0625, "completions/min_length": 251.0, "epoch": 4.746898263027296, "grad_norm": 0.24613116681575775, "kl": 0.074462890625, "learning_rate": 7.786715955054202e-08, "loss": 0.000744163990020752, "memory(GiB)": 38.15, "reward": 0.6766736507415771, "reward_std": 0.08452547341585159, "rewards/VisualizationJSONCombinedORM/mean": 0.6766736507415771, "rewards/VisualizationJSONCombinedORM/std": 0.1602923423051834, "step": 5739, "train_speed(iter/s)": 0.070001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 302.25, "completions/min_length": 244.0, "epoch": 4.747725392886683, "grad_norm": 0.1832432895898819, "kl": 0.07208251953125, "learning_rate": 7.736036978463202e-08, "loss": 0.0007213763892650604, "memory(GiB)": 38.15, "reward": 0.2929016351699829, "reward_std": 0.01871977001428604, "rewards/VisualizationJSONCombinedORM/mean": 0.2929016351699829, "rewards/VisualizationJSONCombinedORM/std": 0.02184092253446579, "step": 5740, "train_speed(iter/s)": 0.069991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 304.875, "completions/min_length": 248.0, "epoch": 4.748552522746071, "grad_norm": 0.24056290090084076, "kl": 0.1314697265625, "learning_rate": 7.685522174208205e-08, "loss": 0.001315760426223278, "memory(GiB)": 38.15, "reward": 0.43624213337898254, "reward_std": 0.06118256598711014, "rewards/VisualizationJSONCombinedORM/mean": 0.43624213337898254, "rewards/VisualizationJSONCombinedORM/std": 0.21568498015403748, "step": 5741, "train_speed(iter/s)": 0.069981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 312.0625, "completions/min_length": 251.0, "epoch": 4.749379652605459, "grad_norm": 0.22755467891693115, "kl": 0.07672119140625, "learning_rate": 7.635171559136012e-08, "loss": 0.0007665101438760757, "memory(GiB)": 38.15, "reward": 0.6683511734008789, "reward_std": 0.12889009714126587, "rewards/VisualizationJSONCombinedORM/mean": 0.6683511734008789, "rewards/VisualizationJSONCombinedORM/std": 0.12706191837787628, "step": 5742, "train_speed(iter/s)": 0.06997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 313.4375, "completions/min_length": 239.0, "epoch": 4.750206782464847, "grad_norm": 0.1677177995443344, "kl": 0.0390625, "learning_rate": 7.584985150038914e-08, "loss": 0.0003912299871444702, "memory(GiB)": 38.15, "reward": 0.5191535949707031, "reward_std": 0.04783671349287033, "rewards/VisualizationJSONCombinedORM/mean": 0.5191535949707031, "rewards/VisualizationJSONCombinedORM/std": 0.11773636192083359, "step": 5743, "train_speed(iter/s)": 0.06996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 301.5, "completions/min_length": 237.0, "epoch": 4.751033912324235, "grad_norm": 0.1784784495830536, "kl": 0.07843017578125, "learning_rate": 7.534962963654136e-08, "loss": 0.0007846727967262268, "memory(GiB)": 38.15, "reward": 0.4954657554626465, "reward_std": 0.06502975523471832, "rewards/VisualizationJSONCombinedORM/mean": 0.4954657554626465, "rewards/VisualizationJSONCombinedORM/std": 0.11875952035188675, "step": 5744, "train_speed(iter/s)": 0.069952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 304.25, "completions/min_length": 222.0, "epoch": 4.751861042183623, "grad_norm": 0.1629309505224228, "kl": 0.04302978515625, "learning_rate": 7.485105016664551e-08, "loss": 0.0004288656637072563, "memory(GiB)": 38.15, "reward": 0.7102991938591003, "reward_std": 0.042632050812244415, "rewards/VisualizationJSONCombinedORM/mean": 0.7102991938591003, "rewards/VisualizationJSONCombinedORM/std": 0.09007126837968826, "step": 5745, "train_speed(iter/s)": 0.06994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 283.4375, "completions/min_length": 219.0, "epoch": 4.752688172043011, "grad_norm": 0.25083282589912415, "kl": 0.1474609375, "learning_rate": 7.43541132569786e-08, "loss": 0.0014749318361282349, "memory(GiB)": 38.15, "reward": 0.3857164978981018, "reward_std": 0.045820385217666626, "rewards/VisualizationJSONCombinedORM/mean": 0.3857164978981018, "rewards/VisualizationJSONCombinedORM/std": 0.050915174186229706, "step": 5746, "train_speed(iter/s)": 0.069929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 298.5, "completions/min_length": 234.0, "epoch": 4.753515301902398, "grad_norm": 0.25069209933280945, "kl": 0.05389404296875, "learning_rate": 7.385881907327141e-08, "loss": 0.000538654625415802, "memory(GiB)": 38.15, "reward": 0.42080527544021606, "reward_std": 0.05773359537124634, "rewards/VisualizationJSONCombinedORM/mean": 0.42080527544021606, "rewards/VisualizationJSONCombinedORM/std": 0.060653965920209885, "step": 5747, "train_speed(iter/s)": 0.069919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 333.25, "completions/min_length": 254.0, "epoch": 4.754342431761787, "grad_norm": 0.18970221281051636, "kl": 0.056396484375, "learning_rate": 7.336516778070735e-08, "loss": 0.0005639456212520599, "memory(GiB)": 38.15, "reward": 0.6129395365715027, "reward_std": 0.05128731578588486, "rewards/VisualizationJSONCombinedORM/mean": 0.6129395365715027, "rewards/VisualizationJSONCombinedORM/std": 0.18993625044822693, "step": 5748, "train_speed(iter/s)": 0.069908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 313.375, "completions/min_length": 242.0, "epoch": 4.755169561621175, "grad_norm": 0.20108434557914734, "kl": 0.05767822265625, "learning_rate": 7.287315954392137e-08, "loss": 0.0005770251154899597, "memory(GiB)": 38.15, "reward": 0.4125654995441437, "reward_std": 0.049079518765211105, "rewards/VisualizationJSONCombinedORM/mean": 0.4125654995441437, "rewards/VisualizationJSONCombinedORM/std": 0.20754994451999664, "step": 5749, "train_speed(iter/s)": 0.069898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 307.5625, "completions/min_length": 265.0, "epoch": 4.755996691480562, "grad_norm": 0.2397773414850235, "kl": 0.06640625, "learning_rate": 7.238279452700004e-08, "loss": 0.0006645694375038147, "memory(GiB)": 38.15, "reward": 0.38436877727508545, "reward_std": 0.040183134377002716, "rewards/VisualizationJSONCombinedORM/mean": 0.38436877727508545, "rewards/VisualizationJSONCombinedORM/std": 0.03891481086611748, "step": 5750, "train_speed(iter/s)": 0.069891 }, { "epoch": 4.755996691480562, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 375.4583333333333, "eval_completions/mean_length": 317.171875, "eval_completions/min_length": 269.0833333333333, "eval_kl": 0.08063252766927083, "eval_loss": 0.0008073399658314884, "eval_reward": 0.45121583218375844, "eval_reward_std": 0.05921395522697518, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45121583218375844, "eval_rewards/VisualizationJSONCombinedORM/std": 0.059213956158297755, "eval_runtime": 319.2687, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 280.875, "completions/min_length": 213.0, "epoch": 4.7568238213399505, "grad_norm": 0.2791464328765869, "kl": 0.12237548828125, "learning_rate": 7.189407289348305e-08, "loss": 0.0012215282768011093, "memory(GiB)": 38.15, "reward": 0.48911023139953613, "reward_std": 0.05934082716703415, "rewards/VisualizationJSONCombinedORM/mean": 0.48911023139953613, "rewards/VisualizationJSONCombinedORM/std": 0.11527041345834732, "step": 5751, "train_speed(iter/s)": 0.069613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 332.5625, "completions/min_length": 264.0, "epoch": 4.757650951199338, "grad_norm": 0.18139563500881195, "kl": 0.1048583984375, "learning_rate": 7.140699480636115e-08, "loss": 0.0010455064475536346, "memory(GiB)": 38.15, "reward": 0.4659435749053955, "reward_std": 0.04880308359861374, "rewards/VisualizationJSONCombinedORM/mean": 0.4659435749053955, "rewards/VisualizationJSONCombinedORM/std": 0.1679621934890747, "step": 5752, "train_speed(iter/s)": 0.0696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 321.1875, "completions/min_length": 238.0, "epoch": 4.758478081058726, "grad_norm": 0.22048625349998474, "kl": 0.1937255859375, "learning_rate": 7.092156042807719e-08, "loss": 0.0019438043236732483, "memory(GiB)": 38.15, "reward": 0.3595197796821594, "reward_std": 0.03392237424850464, "rewards/VisualizationJSONCombinedORM/mean": 0.3595197796821594, "rewards/VisualizationJSONCombinedORM/std": 0.13517417013645172, "step": 5753, "train_speed(iter/s)": 0.069588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 324.625, "completions/min_length": 266.0, "epoch": 4.759305210918114, "grad_norm": 0.24038411676883698, "kl": 0.130859375, "learning_rate": 7.043776992052497e-08, "loss": 0.0013102777302265167, "memory(GiB)": 38.15, "reward": 0.46534043550491333, "reward_std": 0.0684208944439888, "rewards/VisualizationJSONCombinedORM/mean": 0.46534043550491333, "rewards/VisualizationJSONCombinedORM/std": 0.15057599544525146, "step": 5754, "train_speed(iter/s)": 0.069579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 341.9375, "completions/min_length": 259.0, "epoch": 4.760132340777502, "grad_norm": 0.17392811179161072, "kl": 0.111328125, "learning_rate": 6.995562344505213e-08, "loss": 0.0011106133460998535, "memory(GiB)": 38.15, "reward": 0.382354736328125, "reward_std": 0.041138891130685806, "rewards/VisualizationJSONCombinedORM/mean": 0.382354736328125, "rewards/VisualizationJSONCombinedORM/std": 0.1829407662153244, "step": 5755, "train_speed(iter/s)": 0.069568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 281.75, "completions/min_length": 224.0, "epoch": 4.76095947063689, "grad_norm": 0.2305588573217392, "kl": 0.0927734375, "learning_rate": 6.947512116245669e-08, "loss": 0.0009299125522375107, "memory(GiB)": 38.15, "reward": 0.39995259046554565, "reward_std": 0.046374812722206116, "rewards/VisualizationJSONCombinedORM/mean": 0.39995259046554565, "rewards/VisualizationJSONCombinedORM/std": 0.04651709645986557, "step": 5756, "train_speed(iter/s)": 0.06956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 305.625, "completions/min_length": 251.0, "epoch": 4.761786600496277, "grad_norm": 0.21226045489311218, "kl": 0.064208984375, "learning_rate": 6.899626323298714e-08, "loss": 0.0006420239806175232, "memory(GiB)": 38.15, "reward": 0.4962482154369354, "reward_std": 0.05634787306189537, "rewards/VisualizationJSONCombinedORM/mean": 0.4962482154369354, "rewards/VisualizationJSONCombinedORM/std": 0.17091840505599976, "step": 5757, "train_speed(iter/s)": 0.069551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 285.25, "completions/min_length": 228.0, "epoch": 4.762613730355666, "grad_norm": 0.21185904741287231, "kl": 0.05255126953125, "learning_rate": 6.851904981634628e-08, "loss": 0.0005258470773696899, "memory(GiB)": 38.15, "reward": 0.6657847166061401, "reward_std": 0.10269876569509506, "rewards/VisualizationJSONCombinedORM/mean": 0.6657847166061401, "rewards/VisualizationJSONCombinedORM/std": 0.12755222618579865, "step": 5758, "train_speed(iter/s)": 0.069542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 300.9375, "completions/min_length": 248.0, "epoch": 4.763440860215054, "grad_norm": 0.17661374807357788, "kl": 0.033233642578125, "learning_rate": 6.804348107168623e-08, "loss": 0.0003325715661048889, "memory(GiB)": 38.15, "reward": 0.5129553079605103, "reward_std": 0.028560686856508255, "rewards/VisualizationJSONCombinedORM/mean": 0.5129553079605103, "rewards/VisualizationJSONCombinedORM/std": 0.1453171819448471, "step": 5759, "train_speed(iter/s)": 0.06953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 271.25, "completions/min_length": 233.0, "epoch": 4.764267990074441, "grad_norm": 0.1570417582988739, "kl": 0.058319091796875, "learning_rate": 6.756955715761127e-08, "loss": 0.0005831867456436157, "memory(GiB)": 38.15, "reward": 0.7201313972473145, "reward_std": 0.08582408726215363, "rewards/VisualizationJSONCombinedORM/mean": 0.7201313972473145, "rewards/VisualizationJSONCombinedORM/std": 0.1668321043252945, "step": 5760, "train_speed(iter/s)": 0.069522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 291.125, "completions/min_length": 245.0, "epoch": 4.76509511993383, "grad_norm": 0.19662773609161377, "kl": 0.052001953125, "learning_rate": 6.709727823217827e-08, "loss": 0.0005208961665630341, "memory(GiB)": 38.15, "reward": 0.7456406354904175, "reward_std": 0.07433082163333893, "rewards/VisualizationJSONCombinedORM/mean": 0.7456406354904175, "rewards/VisualizationJSONCombinedORM/std": 0.07865738868713379, "step": 5761, "train_speed(iter/s)": 0.069513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 305.8125, "completions/min_length": 251.0, "epoch": 4.765922249793218, "grad_norm": 0.19152913987636566, "kl": 0.1336669921875, "learning_rate": 6.662664445289347e-08, "loss": 0.0013376139104366302, "memory(GiB)": 38.15, "reward": 0.4006693661212921, "reward_std": 0.04266801476478577, "rewards/VisualizationJSONCombinedORM/mean": 0.4006693661212921, "rewards/VisualizationJSONCombinedORM/std": 0.04216163232922554, "step": 5762, "train_speed(iter/s)": 0.069504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 317.5, "completions/min_length": 238.0, "epoch": 4.766749379652605, "grad_norm": 0.2621534466743469, "kl": 0.060302734375, "learning_rate": 6.615765597671575e-08, "loss": 0.0006029903888702393, "memory(GiB)": 38.15, "reward": 0.5381760597229004, "reward_std": 0.060849472880363464, "rewards/VisualizationJSONCombinedORM/mean": 0.5381760597229004, "rewards/VisualizationJSONCombinedORM/std": 0.06320410221815109, "step": 5763, "train_speed(iter/s)": 0.069488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 316.1875, "completions/min_length": 243.0, "epoch": 4.7675765095119935, "grad_norm": 0.18758150935173035, "kl": 0.15478515625, "learning_rate": 6.569031296005445e-08, "loss": 0.0015471354126930237, "memory(GiB)": 38.15, "reward": 0.3538689613342285, "reward_std": 0.06777192652225494, "rewards/VisualizationJSONCombinedORM/mean": 0.3538689613342285, "rewards/VisualizationJSONCombinedORM/std": 0.18796859681606293, "step": 5764, "train_speed(iter/s)": 0.06948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 332.9375, "completions/min_length": 287.0, "epoch": 4.768403639371382, "grad_norm": 0.24923405051231384, "kl": 0.04827880859375, "learning_rate": 6.522461555877213e-08, "loss": 0.0004829391837120056, "memory(GiB)": 38.15, "reward": 0.36410582065582275, "reward_std": 0.04676519334316254, "rewards/VisualizationJSONCombinedORM/mean": 0.36410582065582275, "rewards/VisualizationJSONCombinedORM/std": 0.24550239741802216, "step": 5765, "train_speed(iter/s)": 0.069466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 284.3125, "completions/min_length": 248.0, "epoch": 4.769230769230769, "grad_norm": 0.14955051243305206, "kl": 0.03466796875, "learning_rate": 6.476056392817898e-08, "loss": 0.0003455355763435364, "memory(GiB)": 38.15, "reward": 0.6261537671089172, "reward_std": 0.021548917517066002, "rewards/VisualizationJSONCombinedORM/mean": 0.6261537671089172, "rewards/VisualizationJSONCombinedORM/std": 0.08154796808958054, "step": 5766, "train_speed(iter/s)": 0.069457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 305.1875, "completions/min_length": 258.0, "epoch": 4.770057899090157, "grad_norm": 0.18790841102600098, "kl": 0.0538330078125, "learning_rate": 6.429815822304008e-08, "loss": 0.0005388762801885605, "memory(GiB)": 38.15, "reward": 0.7668041586875916, "reward_std": 0.07556980103254318, "rewards/VisualizationJSONCombinedORM/mean": 0.7668041586875916, "rewards/VisualizationJSONCombinedORM/std": 0.08175661414861679, "step": 5767, "train_speed(iter/s)": 0.06945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 292.375, "completions/min_length": 229.0, "epoch": 4.770885028949545, "grad_norm": 0.18200711905956268, "kl": 0.0684814453125, "learning_rate": 6.383739859756932e-08, "loss": 0.0006856024265289307, "memory(GiB)": 38.15, "reward": 0.33431047201156616, "reward_std": 0.02504022605717182, "rewards/VisualizationJSONCombinedORM/mean": 0.33431047201156616, "rewards/VisualizationJSONCombinedORM/std": 0.09521740674972534, "step": 5768, "train_speed(iter/s)": 0.06944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 273.25, "completions/min_length": 229.0, "epoch": 4.771712158808933, "grad_norm": 0.17177075147628784, "kl": 0.078125, "learning_rate": 6.33782852054321e-08, "loss": 0.0007802750915288925, "memory(GiB)": 38.15, "reward": 0.4140580892562866, "reward_std": 0.010272073559463024, "rewards/VisualizationJSONCombinedORM/mean": 0.4140580892562866, "rewards/VisualizationJSONCombinedORM/std": 0.19865559041500092, "step": 5769, "train_speed(iter/s)": 0.069432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 284.625, "completions/min_length": 231.0, "epoch": 4.772539288668321, "grad_norm": 0.17555013298988342, "kl": 0.0599365234375, "learning_rate": 6.292081819974427e-08, "loss": 0.0005993172526359558, "memory(GiB)": 38.15, "reward": 0.43408775329589844, "reward_std": 0.03866557404398918, "rewards/VisualizationJSONCombinedORM/mean": 0.43408775329589844, "rewards/VisualizationJSONCombinedORM/std": 0.17996786534786224, "step": 5770, "train_speed(iter/s)": 0.069424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 310.9375, "completions/min_length": 240.0, "epoch": 4.773366418527709, "grad_norm": 0.2699434757232666, "kl": 0.04132080078125, "learning_rate": 6.246499773307491e-08, "loss": 0.0004137568175792694, "memory(GiB)": 38.15, "reward": 0.6384083032608032, "reward_std": 0.07905300706624985, "rewards/VisualizationJSONCombinedORM/mean": 0.6384083032608032, "rewards/VisualizationJSONCombinedORM/std": 0.13660110533237457, "step": 5771, "train_speed(iter/s)": 0.069413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 316.3125, "completions/min_length": 252.0, "epoch": 4.774193548387097, "grad_norm": 0.25375694036483765, "kl": 0.07171630859375, "learning_rate": 6.201082395744073e-08, "loss": 0.0007186383008956909, "memory(GiB)": 38.15, "reward": 0.6960647106170654, "reward_std": 0.0650201290845871, "rewards/VisualizationJSONCombinedORM/mean": 0.6960647106170654, "rewards/VisualizationJSONCombinedORM/std": 0.0990683063864708, "step": 5772, "train_speed(iter/s)": 0.069398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 295.0625, "completions/min_length": 251.0, "epoch": 4.775020678246484, "grad_norm": 0.1739102452993393, "kl": 0.060302734375, "learning_rate": 6.15582970243117e-08, "loss": 0.0006028860807418823, "memory(GiB)": 38.15, "reward": 0.4301304519176483, "reward_std": 0.028048936277627945, "rewards/VisualizationJSONCombinedORM/mean": 0.4301304519176483, "rewards/VisualizationJSONCombinedORM/std": 0.0840759351849556, "step": 5773, "train_speed(iter/s)": 0.06939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 325.125, "completions/min_length": 241.0, "epoch": 4.775847808105873, "grad_norm": 0.21315965056419373, "kl": 0.04876708984375, "learning_rate": 6.110741708460654e-08, "loss": 0.00048773735761642456, "memory(GiB)": 38.15, "reward": 0.5439435839653015, "reward_std": 0.05495616793632507, "rewards/VisualizationJSONCombinedORM/mean": 0.5439435839653015, "rewards/VisualizationJSONCombinedORM/std": 0.23167003691196442, "step": 5774, "train_speed(iter/s)": 0.069381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 300.1875, "completions/min_length": 224.0, "epoch": 4.776674937965261, "grad_norm": 0.23020882904529572, "kl": 0.1007080078125, "learning_rate": 6.065818428869774e-08, "loss": 0.0010055303573608398, "memory(GiB)": 38.15, "reward": 0.5221276879310608, "reward_std": 0.14459113776683807, "rewards/VisualizationJSONCombinedORM/mean": 0.5221276879310608, "rewards/VisualizationJSONCombinedORM/std": 0.21244870126247406, "step": 5775, "train_speed(iter/s)": 0.069375 }, { "epoch": 4.776674937965261, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 377.3333333333333, "eval_completions/mean_length": 315.203125, "eval_completions/min_length": 259.8333333333333, "eval_kl": 0.08701578776041667, "eval_loss": 0.0008863682742230594, "eval_reward": 0.4557727345575889, "eval_reward_std": 0.054168547639468066, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4557727345575889, "eval_rewards/VisualizationJSONCombinedORM/std": 0.054168549443905555, "eval_runtime": 319.4877, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 297.875, "completions/min_length": 223.0, "epoch": 4.777502067824648, "grad_norm": 0.19645537436008453, "kl": 0.1463623046875, "learning_rate": 6.021059878640433e-08, "loss": 0.0014601945877075195, "memory(GiB)": 38.15, "reward": 0.42547714710235596, "reward_std": 0.023766590282320976, "rewards/VisualizationJSONCombinedORM/mean": 0.42547714710235596, "rewards/VisualizationJSONCombinedORM/std": 0.13481861352920532, "step": 5776, "train_speed(iter/s)": 0.069102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 331.8125, "completions/min_length": 236.0, "epoch": 4.7783291976840365, "grad_norm": 0.19556055963039398, "kl": 0.03668212890625, "learning_rate": 5.976466072699971e-08, "loss": 0.0003676116466522217, "memory(GiB)": 38.15, "reward": 0.6218323707580566, "reward_std": 0.05861334502696991, "rewards/VisualizationJSONCombinedORM/mean": 0.6218323707580566, "rewards/VisualizationJSONCombinedORM/std": 0.2842927575111389, "step": 5777, "train_speed(iter/s)": 0.069093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 302.4375, "completions/min_length": 255.0, "epoch": 4.779156327543424, "grad_norm": 0.20591847598552704, "kl": 0.0570068359375, "learning_rate": 5.932037025920601e-08, "loss": 0.0005692318081855774, "memory(GiB)": 38.15, "reward": 0.3698440492153168, "reward_std": 0.03415510058403015, "rewards/VisualizationJSONCombinedORM/mean": 0.3698440492153168, "rewards/VisualizationJSONCombinedORM/std": 0.20958101749420166, "step": 5778, "train_speed(iter/s)": 0.069087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 333.625, "completions/min_length": 269.0, "epoch": 4.779983457402812, "grad_norm": 0.18018120527267456, "kl": 0.0850830078125, "learning_rate": 5.8877727531195804e-08, "loss": 0.0008518695831298828, "memory(GiB)": 38.15, "reward": 0.45660707354545593, "reward_std": 0.01531967706978321, "rewards/VisualizationJSONCombinedORM/mean": 0.45660707354545593, "rewards/VisualizationJSONCombinedORM/std": 0.2918721139431, "step": 5779, "train_speed(iter/s)": 0.069077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 300.5625, "completions/min_length": 244.0, "epoch": 4.7808105872622, "grad_norm": 0.19473959505558014, "kl": 0.178955078125, "learning_rate": 5.843673269059269e-08, "loss": 0.0017896369099617004, "memory(GiB)": 38.15, "reward": 0.40730535984039307, "reward_std": 0.05900796502828598, "rewards/VisualizationJSONCombinedORM/mean": 0.40730535984039307, "rewards/VisualizationJSONCombinedORM/std": 0.12246276438236237, "step": 5780, "train_speed(iter/s)": 0.069063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 341.875, "completions/min_length": 274.0, "epoch": 4.781637717121588, "grad_norm": 0.19340837001800537, "kl": 0.0704345703125, "learning_rate": 5.799738588447068e-08, "loss": 0.0007053427398204803, "memory(GiB)": 38.15, "reward": 0.4850243926048279, "reward_std": 0.06266467273235321, "rewards/VisualizationJSONCombinedORM/mean": 0.4850243926048279, "rewards/VisualizationJSONCombinedORM/std": 0.2943302094936371, "step": 5781, "train_speed(iter/s)": 0.069048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 339.625, "completions/min_length": 254.0, "epoch": 4.782464846980976, "grad_norm": 0.18670423328876495, "kl": 0.060546875, "learning_rate": 5.755968725935368e-08, "loss": 0.000603310763835907, "memory(GiB)": 38.15, "reward": 0.3779457211494446, "reward_std": 0.036901623010635376, "rewards/VisualizationJSONCombinedORM/mean": 0.3779457211494446, "rewards/VisualizationJSONCombinedORM/std": 0.05397943779826164, "step": 5782, "train_speed(iter/s)": 0.069038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 314.25, "completions/min_length": 240.0, "epoch": 4.7832919768403634, "grad_norm": 0.17834237217903137, "kl": 0.0894775390625, "learning_rate": 5.712363696121659e-08, "loss": 0.0008971728384494781, "memory(GiB)": 38.15, "reward": 0.48183223605155945, "reward_std": 0.015305960550904274, "rewards/VisualizationJSONCombinedORM/mean": 0.48183223605155945, "rewards/VisualizationJSONCombinedORM/std": 0.33086562156677246, "step": 5783, "train_speed(iter/s)": 0.069029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/mean_length": 337.25, "completions/min_length": 256.0, "epoch": 4.784119106699752, "grad_norm": 0.16177156567573547, "kl": 0.05670166015625, "learning_rate": 5.668923513548363e-08, "loss": 0.0005655288696289062, "memory(GiB)": 38.15, "reward": 0.24469062685966492, "reward_std": 0.010557293891906738, "rewards/VisualizationJSONCombinedORM/mean": 0.24469062685966492, "rewards/VisualizationJSONCombinedORM/std": 0.03263424336910248, "step": 5784, "train_speed(iter/s)": 0.069017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 327.75, "completions/min_length": 266.0, "epoch": 4.78494623655914, "grad_norm": 0.218565434217453, "kl": 0.07568359375, "learning_rate": 5.625648192703115e-08, "loss": 0.0007567033171653748, "memory(GiB)": 38.15, "reward": 0.48814287781715393, "reward_std": 0.06301294267177582, "rewards/VisualizationJSONCombinedORM/mean": 0.48814287781715393, "rewards/VisualizationJSONCombinedORM/std": 0.07414727658033371, "step": 5785, "train_speed(iter/s)": 0.069007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 316.375, "completions/min_length": 235.0, "epoch": 4.785773366418527, "grad_norm": 0.2276124209165573, "kl": 0.06451416015625, "learning_rate": 5.582537748018258e-08, "loss": 0.0006460826843976974, "memory(GiB)": 38.15, "reward": 0.46282845735549927, "reward_std": 0.058361224830150604, "rewards/VisualizationJSONCombinedORM/mean": 0.46282845735549927, "rewards/VisualizationJSONCombinedORM/std": 0.07513902336359024, "step": 5786, "train_speed(iter/s)": 0.069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 318.875, "completions/min_length": 220.0, "epoch": 4.786600496277916, "grad_norm": 0.18393416702747345, "kl": 0.0546875, "learning_rate": 5.539592193871457e-08, "loss": 0.0005464516580104828, "memory(GiB)": 38.15, "reward": 0.6177268028259277, "reward_std": 0.05605056881904602, "rewards/VisualizationJSONCombinedORM/mean": 0.6177268028259277, "rewards/VisualizationJSONCombinedORM/std": 0.17785325646400452, "step": 5787, "train_speed(iter/s)": 0.068987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/mean_length": 324.375, "completions/min_length": 235.0, "epoch": 4.787427626137304, "grad_norm": 0.23420821130275726, "kl": 0.0347900390625, "learning_rate": 5.496811544585201e-08, "loss": 0.0003483407199382782, "memory(GiB)": 38.15, "reward": 0.6716694831848145, "reward_std": 0.08591391146183014, "rewards/VisualizationJSONCombinedORM/mean": 0.6716694831848145, "rewards/VisualizationJSONCombinedORM/std": 0.08933205902576447, "step": 5788, "train_speed(iter/s)": 0.068976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 310.5625, "completions/min_length": 263.0, "epoch": 4.788254755996691, "grad_norm": 0.19308193027973175, "kl": 0.05072021484375, "learning_rate": 5.454195814427021e-08, "loss": 0.0005074553191661835, "memory(GiB)": 38.15, "reward": 0.4696812331676483, "reward_std": 0.03041922301054001, "rewards/VisualizationJSONCombinedORM/mean": 0.4696812331676483, "rewards/VisualizationJSONCombinedORM/std": 0.08662412315607071, "step": 5789, "train_speed(iter/s)": 0.068966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 300.5625, "completions/min_length": 248.0, "epoch": 4.7890818858560795, "grad_norm": 0.21662968397140503, "kl": 0.080078125, "learning_rate": 5.411745017609493e-08, "loss": 0.00080128014087677, "memory(GiB)": 38.15, "reward": 0.5066916942596436, "reward_std": 0.03309144079685211, "rewards/VisualizationJSONCombinedORM/mean": 0.5066916942596436, "rewards/VisualizationJSONCombinedORM/std": 0.06325892359018326, "step": 5790, "train_speed(iter/s)": 0.068954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 314.1875, "completions/min_length": 243.0, "epoch": 4.789909015715468, "grad_norm": 0.17287315428256989, "kl": 0.13592529296875, "learning_rate": 5.369459168290181e-08, "loss": 0.0013568997383117676, "memory(GiB)": 38.15, "reward": 0.48145827651023865, "reward_std": 0.06441249698400497, "rewards/VisualizationJSONCombinedORM/mean": 0.48145827651023865, "rewards/VisualizationJSONCombinedORM/std": 0.12175361067056656, "step": 5791, "train_speed(iter/s)": 0.068943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 294.1875, "completions/min_length": 232.0, "epoch": 4.790736145574855, "grad_norm": 0.2629927396774292, "kl": 0.0760498046875, "learning_rate": 5.327338280571637e-08, "loss": 0.0007596537470817566, "memory(GiB)": 38.15, "reward": 0.5209413766860962, "reward_std": 0.055234409868717194, "rewards/VisualizationJSONCombinedORM/mean": 0.5209413766860962, "rewards/VisualizationJSONCombinedORM/std": 0.055959008634090424, "step": 5792, "train_speed(iter/s)": 0.068932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 314.125, "completions/min_length": 230.0, "epoch": 4.791563275434243, "grad_norm": 0.16604995727539062, "kl": 0.06976318359375, "learning_rate": 5.285382368501235e-08, "loss": 0.0006978698074817657, "memory(GiB)": 38.15, "reward": 0.6128097772598267, "reward_std": 0.050324127078056335, "rewards/VisualizationJSONCombinedORM/mean": 0.6128097772598267, "rewards/VisualizationJSONCombinedORM/std": 0.16641341149806976, "step": 5793, "train_speed(iter/s)": 0.06892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 329.3125, "completions/min_length": 267.0, "epoch": 4.792390405293631, "grad_norm": 0.18833954632282257, "kl": 0.0462646484375, "learning_rate": 5.243591446071616e-08, "loss": 0.0004632696509361267, "memory(GiB)": 38.15, "reward": 0.5475282669067383, "reward_std": 0.06326015293598175, "rewards/VisualizationJSONCombinedORM/mean": 0.5475282669067383, "rewards/VisualizationJSONCombinedORM/std": 0.09600697457790375, "step": 5794, "train_speed(iter/s)": 0.068907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 334.125, "completions/min_length": 247.0, "epoch": 4.793217535153019, "grad_norm": 0.2284369021654129, "kl": 0.05816650390625, "learning_rate": 5.201965527220188e-08, "loss": 0.0005814209580421448, "memory(GiB)": 38.15, "reward": 0.6522886753082275, "reward_std": 0.07045356184244156, "rewards/VisualizationJSONCombinedORM/mean": 0.6522886753082275, "rewards/VisualizationJSONCombinedORM/std": 0.12513643503189087, "step": 5795, "train_speed(iter/s)": 0.068897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 299.25, "completions/min_length": 241.0, "epoch": 4.794044665012407, "grad_norm": 0.23278167843818665, "kl": 0.0989990234375, "learning_rate": 5.160504625829343e-08, "loss": 0.0009896419942378998, "memory(GiB)": 38.15, "reward": 0.5852530002593994, "reward_std": 0.020166119560599327, "rewards/VisualizationJSONCombinedORM/mean": 0.5852530002593994, "rewards/VisualizationJSONCombinedORM/std": 0.3148981034755707, "step": 5796, "train_speed(iter/s)": 0.068889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 338.8125, "completions/min_length": 279.0, "epoch": 4.794871794871795, "grad_norm": 0.18098460137844086, "kl": 0.128173828125, "learning_rate": 5.119208755726579e-08, "loss": 0.0012806244194507599, "memory(GiB)": 38.15, "reward": 0.29261642694473267, "reward_std": 0.0439082570374012, "rewards/VisualizationJSONCombinedORM/mean": 0.29261642694473267, "rewards/VisualizationJSONCombinedORM/std": 0.11187704652547836, "step": 5797, "train_speed(iter/s)": 0.068874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 309.75, "completions/min_length": 245.0, "epoch": 4.795698924731183, "grad_norm": 0.18494153022766113, "kl": 0.054443359375, "learning_rate": 5.0780779306842664e-08, "loss": 0.0005441233515739441, "memory(GiB)": 38.15, "reward": 0.533960223197937, "reward_std": 0.04822307452559471, "rewards/VisualizationJSONCombinedORM/mean": 0.533960223197937, "rewards/VisualizationJSONCombinedORM/std": 0.14856888353824615, "step": 5798, "train_speed(iter/s)": 0.068865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 317.6875, "completions/min_length": 244.0, "epoch": 4.79652605459057, "grad_norm": 0.20073142647743225, "kl": 0.0694580078125, "learning_rate": 5.037112164419544e-08, "loss": 0.0006942246109247208, "memory(GiB)": 38.15, "reward": 0.669117271900177, "reward_std": 0.05049581825733185, "rewards/VisualizationJSONCombinedORM/mean": 0.669117271900177, "rewards/VisualizationJSONCombinedORM/std": 0.10993269830942154, "step": 5799, "train_speed(iter/s)": 0.068849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 314.4375, "completions/min_length": 234.0, "epoch": 4.797353184449959, "grad_norm": 0.19948266446590424, "kl": 0.06451416015625, "learning_rate": 4.996311470594928e-08, "loss": 0.0006451364606618881, "memory(GiB)": 38.15, "reward": 0.3614048361778259, "reward_std": 0.045002881437540054, "rewards/VisualizationJSONCombinedORM/mean": 0.3614048361778259, "rewards/VisualizationJSONCombinedORM/std": 0.06906230747699738, "step": 5800, "train_speed(iter/s)": 0.06884 }, { "epoch": 4.797353184449959, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 387.875, "eval_completions/mean_length": 313.0208333333333, "eval_completions/min_length": 260.9166666666667, "eval_kl": 0.08830769856770833, "eval_loss": 0.0008769619162194431, "eval_reward": 0.4511612926920255, "eval_reward_std": 0.04961395046363274, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4511612926920255, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04961395050243785, "eval_runtime": 326.1346, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.009, "step": 5800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 310.5625, "completions/min_length": 256.0, "epoch": 4.798180314309347, "grad_norm": 0.17594777047634125, "kl": 0.08447265625, "learning_rate": 4.955675862817533e-08, "loss": 0.0008453540503978729, "memory(GiB)": 38.15, "reward": 0.5082359313964844, "reward_std": 0.061576079577207565, "rewards/VisualizationJSONCombinedORM/mean": 0.5082359313964844, "rewards/VisualizationJSONCombinedORM/std": 0.2625754475593567, "step": 5801, "train_speed(iter/s)": 0.068566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 295.875, "completions/min_length": 234.0, "epoch": 4.799007444168734, "grad_norm": 0.16168959438800812, "kl": 0.1275634765625, "learning_rate": 4.9152053546394626e-08, "loss": 0.0012745410203933716, "memory(GiB)": 38.15, "reward": 0.47666680812835693, "reward_std": 0.03116478957235813, "rewards/VisualizationJSONCombinedORM/mean": 0.47666680812835693, "rewards/VisualizationJSONCombinedORM/std": 0.1090012937784195, "step": 5802, "train_speed(iter/s)": 0.068559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 342.25, "completions/min_length": 247.0, "epoch": 4.7998345740281225, "grad_norm": 0.18386362493038177, "kl": 0.070068359375, "learning_rate": 4.874899959557922e-08, "loss": 0.0007004141807556152, "memory(GiB)": 38.15, "reward": 0.6226793527603149, "reward_std": 0.06246176362037659, "rewards/VisualizationJSONCombinedORM/mean": 0.6226793527603149, "rewards/VisualizationJSONCombinedORM/std": 0.12835560739040375, "step": 5803, "train_speed(iter/s)": 0.068544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 325.3125, "completions/min_length": 246.0, "epoch": 4.80066170388751, "grad_norm": 0.32902663946151733, "kl": 0.100830078125, "learning_rate": 4.8347596910149343e-08, "loss": 0.0010074824094772339, "memory(GiB)": 38.15, "reward": 0.5581187009811401, "reward_std": 0.07877764105796814, "rewards/VisualizationJSONCombinedORM/mean": 0.5581187009811401, "rewards/VisualizationJSONCombinedORM/std": 0.07618745416402817, "step": 5804, "train_speed(iter/s)": 0.06853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 328.5, "completions/min_length": 272.0, "epoch": 4.801488833746898, "grad_norm": 0.1633739024400711, "kl": 0.06121826171875, "learning_rate": 4.794784562397459e-08, "loss": 0.0006124675273895264, "memory(GiB)": 38.15, "reward": 0.8196194171905518, "reward_std": 0.04544039070606232, "rewards/VisualizationJSONCombinedORM/mean": 0.8196194171905518, "rewards/VisualizationJSONCombinedORM/std": 0.04404407739639282, "step": 5805, "train_speed(iter/s)": 0.068519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 315.1875, "completions/min_length": 237.0, "epoch": 4.802315963606286, "grad_norm": 0.20809534192085266, "kl": 0.1138916015625, "learning_rate": 4.754974587037331e-08, "loss": 0.0011426359415054321, "memory(GiB)": 38.15, "reward": 0.5439130663871765, "reward_std": 0.08860232681035995, "rewards/VisualizationJSONCombinedORM/mean": 0.5439130663871765, "rewards/VisualizationJSONCombinedORM/std": 0.08840765058994293, "step": 5806, "train_speed(iter/s)": 0.06851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 313.0, "completions/min_length": 260.0, "epoch": 4.803143093465674, "grad_norm": 0.24442683160305023, "kl": 0.05267333984375, "learning_rate": 4.715329778211375e-08, "loss": 0.0005271062254905701, "memory(GiB)": 38.15, "reward": 0.5376132726669312, "reward_std": 0.06875325739383698, "rewards/VisualizationJSONCombinedORM/mean": 0.5376132726669312, "rewards/VisualizationJSONCombinedORM/std": 0.16281281411647797, "step": 5807, "train_speed(iter/s)": 0.0685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 312.8125, "completions/min_length": 261.0, "epoch": 4.803970223325062, "grad_norm": 0.24662254750728607, "kl": 0.0498046875, "learning_rate": 4.675850149141403e-08, "loss": 0.0004960633814334869, "memory(GiB)": 38.15, "reward": 0.37034574151039124, "reward_std": 0.051839813590049744, "rewards/VisualizationJSONCombinedORM/mean": 0.37034574151039124, "rewards/VisualizationJSONCombinedORM/std": 0.09179794043302536, "step": 5808, "train_speed(iter/s)": 0.068491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 323.6875, "completions/min_length": 267.0, "epoch": 4.80479735318445, "grad_norm": 0.21301200985908508, "kl": 0.2867431640625, "learning_rate": 4.636535712993939e-08, "loss": 0.0028752684593200684, "memory(GiB)": 38.15, "reward": 0.4672626256942749, "reward_std": 0.09046036005020142, "rewards/VisualizationJSONCombinedORM/mean": 0.4672626256942749, "rewards/VisualizationJSONCombinedORM/std": 0.11946094781160355, "step": 5809, "train_speed(iter/s)": 0.06848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 300.875, "completions/min_length": 234.0, "epoch": 4.805624483043838, "grad_norm": 0.36000779271125793, "kl": 0.1851806640625, "learning_rate": 4.597386482880717e-08, "loss": 0.0018533244729042053, "memory(GiB)": 38.15, "reward": 0.7141621112823486, "reward_std": 0.06775980442762375, "rewards/VisualizationJSONCombinedORM/mean": 0.7141621112823486, "rewards/VisualizationJSONCombinedORM/std": 0.10114576667547226, "step": 5810, "train_speed(iter/s)": 0.068468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 311.9375, "completions/min_length": 269.0, "epoch": 4.806451612903226, "grad_norm": 0.2208307981491089, "kl": 0.05242919921875, "learning_rate": 4.558402471857959e-08, "loss": 0.0005248449742794037, "memory(GiB)": 38.15, "reward": 0.6297930479049683, "reward_std": 0.07151404768228531, "rewards/VisualizationJSONCombinedORM/mean": 0.6297930479049683, "rewards/VisualizationJSONCombinedORM/std": 0.1371600478887558, "step": 5811, "train_speed(iter/s)": 0.068459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 335.75, "completions/min_length": 258.0, "epoch": 4.807278742762613, "grad_norm": 0.19242681562900543, "kl": 0.06610107421875, "learning_rate": 4.519583692927154e-08, "loss": 0.0006602797657251358, "memory(GiB)": 38.15, "reward": 0.6075974702835083, "reward_std": 0.04596494138240814, "rewards/VisualizationJSONCombinedORM/mean": 0.6075974702835083, "rewards/VisualizationJSONCombinedORM/std": 0.13682381808757782, "step": 5812, "train_speed(iter/s)": 0.068449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 324.1875, "completions/min_length": 291.0, "epoch": 4.808105872622002, "grad_norm": 0.19714117050170898, "kl": 0.065673828125, "learning_rate": 4.4809301590345576e-08, "loss": 0.0006571337580680847, "memory(GiB)": 38.15, "reward": 0.3852691054344177, "reward_std": 0.02968231588602066, "rewards/VisualizationJSONCombinedORM/mean": 0.3852691054344177, "rewards/VisualizationJSONCombinedORM/std": 0.08795084059238434, "step": 5813, "train_speed(iter/s)": 0.068439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 337.0, "completions/min_length": 269.0, "epoch": 4.80893300248139, "grad_norm": 0.1958654224872589, "kl": 0.05438232421875, "learning_rate": 4.442441883071247e-08, "loss": 0.000543348491191864, "memory(GiB)": 38.15, "reward": 0.49077075719833374, "reward_std": 0.07139299809932709, "rewards/VisualizationJSONCombinedORM/mean": 0.49077075719833374, "rewards/VisualizationJSONCombinedORM/std": 0.0993061363697052, "step": 5814, "train_speed(iter/s)": 0.068428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 339.3125, "completions/min_length": 284.0, "epoch": 4.809760132340777, "grad_norm": 0.18008166551589966, "kl": 0.068115234375, "learning_rate": 4.404118877873176e-08, "loss": 0.0006808433681726456, "memory(GiB)": 38.15, "reward": 0.5602316856384277, "reward_std": 0.0450601689517498, "rewards/VisualizationJSONCombinedORM/mean": 0.5602316856384277, "rewards/VisualizationJSONCombinedORM/std": 0.15560011565685272, "step": 5815, "train_speed(iter/s)": 0.068417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 310.25, "completions/min_length": 252.0, "epoch": 4.8105872622001655, "grad_norm": 0.19019059836864471, "kl": 0.087646484375, "learning_rate": 4.3659611562214546e-08, "loss": 0.0008777584880590439, "memory(GiB)": 38.15, "reward": 0.600009560585022, "reward_std": 0.051674310117959976, "rewards/VisualizationJSONCombinedORM/mean": 0.600009560585022, "rewards/VisualizationJSONCombinedORM/std": 0.1583177000284195, "step": 5816, "train_speed(iter/s)": 0.068411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 313.3125, "completions/min_length": 249.0, "epoch": 4.811414392059554, "grad_norm": 0.22056113183498383, "kl": 0.07403564453125, "learning_rate": 4.327968730841681e-08, "loss": 0.0007389746606349945, "memory(GiB)": 38.15, "reward": 0.30341222882270813, "reward_std": 0.037957899272441864, "rewards/VisualizationJSONCombinedORM/mean": 0.30341222882270813, "rewards/VisualizationJSONCombinedORM/std": 0.06344877183437347, "step": 5817, "train_speed(iter/s)": 0.068399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 307.75, "completions/min_length": 248.0, "epoch": 4.812241521918941, "grad_norm": 0.1594526171684265, "kl": 0.19287109375, "learning_rate": 4.2901416144045525e-08, "loss": 0.0019262023270130157, "memory(GiB)": 38.15, "reward": 0.5266351103782654, "reward_std": 0.025015773251652718, "rewards/VisualizationJSONCombinedORM/mean": 0.5266351103782654, "rewards/VisualizationJSONCombinedORM/std": 0.25777867436408997, "step": 5818, "train_speed(iter/s)": 0.06839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 358.4375, "completions/min_length": 285.0, "epoch": 4.813068651778329, "grad_norm": 0.19598282873630524, "kl": 0.064697265625, "learning_rate": 4.25247981952559e-08, "loss": 0.0006483495235443115, "memory(GiB)": 38.15, "reward": 0.5275280475616455, "reward_std": 0.05711507424712181, "rewards/VisualizationJSONCombinedORM/mean": 0.5275280475616455, "rewards/VisualizationJSONCombinedORM/std": 0.05619591474533081, "step": 5819, "train_speed(iter/s)": 0.068384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 301.9375, "completions/min_length": 230.0, "epoch": 4.813895781637717, "grad_norm": 0.17016202211380005, "kl": 0.05157470703125, "learning_rate": 4.21498335876519e-08, "loss": 0.0005165860056877136, "memory(GiB)": 38.15, "reward": 0.5877166986465454, "reward_std": 0.018589142709970474, "rewards/VisualizationJSONCombinedORM/mean": 0.5877166986465454, "rewards/VisualizationJSONCombinedORM/std": 0.23587505519390106, "step": 5820, "train_speed(iter/s)": 0.068377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 311.8125, "completions/min_length": 240.0, "epoch": 4.814722911497105, "grad_norm": 0.1829415261745453, "kl": 0.0953369140625, "learning_rate": 4.177652244628627e-08, "loss": 0.0009547248482704163, "memory(GiB)": 38.15, "reward": 0.5172878503799438, "reward_std": 0.07224521040916443, "rewards/VisualizationJSONCombinedORM/mean": 0.5172878503799438, "rewards/VisualizationJSONCombinedORM/std": 0.12872010469436646, "step": 5821, "train_speed(iter/s)": 0.068369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 298.75, "completions/min_length": 231.0, "epoch": 4.815550041356493, "grad_norm": 0.1888369768857956, "kl": 0.055908203125, "learning_rate": 4.1404864895659423e-08, "loss": 0.0005590766668319702, "memory(GiB)": 38.15, "reward": 0.7341086268424988, "reward_std": 0.06159024313092232, "rewards/VisualizationJSONCombinedORM/mean": 0.7341086268424988, "rewards/VisualizationJSONCombinedORM/std": 0.12771233916282654, "step": 5822, "train_speed(iter/s)": 0.06836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 279.375, "completions/min_length": 220.0, "epoch": 4.816377171215881, "grad_norm": 0.18355728685855865, "kl": 0.0758056640625, "learning_rate": 4.10348610597211e-08, "loss": 0.0007578097283840179, "memory(GiB)": 38.15, "reward": 0.3519001603126526, "reward_std": 0.0356697142124176, "rewards/VisualizationJSONCombinedORM/mean": 0.3519001603126526, "rewards/VisualizationJSONCombinedORM/std": 0.10514788329601288, "step": 5823, "train_speed(iter/s)": 0.068352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 282.0, "completions/min_length": 223.0, "epoch": 4.817204301075269, "grad_norm": 0.18882638216018677, "kl": 0.1077880859375, "learning_rate": 4.0666511061869804e-08, "loss": 0.0010784678161144257, "memory(GiB)": 38.15, "reward": 0.38980555534362793, "reward_std": 0.04757210239768028, "rewards/VisualizationJSONCombinedORM/mean": 0.38980555534362793, "rewards/VisualizationJSONCombinedORM/std": 0.05490529164671898, "step": 5824, "train_speed(iter/s)": 0.068346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 297.0, "completions/min_length": 212.0, "epoch": 4.818031430934656, "grad_norm": 0.2549644410610199, "kl": 0.0531005859375, "learning_rate": 4.029981502495117e-08, "loss": 0.0005309209227561951, "memory(GiB)": 38.15, "reward": 0.5408190488815308, "reward_std": 0.07161872088909149, "rewards/VisualizationJSONCombinedORM/mean": 0.5408190488815308, "rewards/VisualizationJSONCombinedORM/std": 0.1214166134595871, "step": 5825, "train_speed(iter/s)": 0.068338 }, { "epoch": 4.818031430934656, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 372.875, "eval_completions/mean_length": 310.8854166666667, "eval_completions/min_length": 258.875, "eval_kl": 0.102081298828125, "eval_loss": 0.0010327933123335242, "eval_reward": 0.4551797254631917, "eval_reward_std": 0.05468104581814259, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4551797254631917, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05468104667185495, "eval_runtime": 316.5772, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 5825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 295.875, "completions/min_length": 211.0, "epoch": 4.818858560794045, "grad_norm": 0.16737055778503418, "kl": 0.04571533203125, "learning_rate": 3.99347730712607e-08, "loss": 0.00045765936374664307, "memory(GiB)": 38.15, "reward": 0.5058645009994507, "reward_std": 0.04227724298834801, "rewards/VisualizationJSONCombinedORM/mean": 0.5058645009994507, "rewards/VisualizationJSONCombinedORM/std": 0.07933048903942108, "step": 5826, "train_speed(iter/s)": 0.068075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 318.5625, "completions/min_length": 265.0, "epoch": 4.819685690653433, "grad_norm": 0.18544398248195648, "kl": 0.0848388671875, "learning_rate": 3.957138532254157e-08, "loss": 0.000851750373840332, "memory(GiB)": 38.15, "reward": 0.4166271388530731, "reward_std": 0.03803347051143646, "rewards/VisualizationJSONCombinedORM/mean": 0.4166271388530731, "rewards/VisualizationJSONCombinedORM/std": 0.24467511475086212, "step": 5827, "train_speed(iter/s)": 0.068064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 312.1875, "completions/min_length": 274.0, "epoch": 4.82051282051282, "grad_norm": 0.32156723737716675, "kl": 0.10009765625, "learning_rate": 3.92096518999846e-08, "loss": 0.000999443233013153, "memory(GiB)": 38.15, "reward": 0.5846588611602783, "reward_std": 0.06445205211639404, "rewards/VisualizationJSONCombinedORM/mean": 0.5846588611602783, "rewards/VisualizationJSONCombinedORM/std": 0.07494286447763443, "step": 5828, "train_speed(iter/s)": 0.068054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 305.0, "completions/min_length": 228.0, "epoch": 4.8213399503722085, "grad_norm": 0.8094617128372192, "kl": 0.66259765625, "learning_rate": 3.884957292422997e-08, "loss": 0.00663042813539505, "memory(GiB)": 38.15, "reward": 0.2803961932659149, "reward_std": 0.02670321613550186, "rewards/VisualizationJSONCombinedORM/mean": 0.2803961932659149, "rewards/VisualizationJSONCombinedORM/std": 0.08201176673173904, "step": 5829, "train_speed(iter/s)": 0.06804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 311.4375, "completions/min_length": 271.0, "epoch": 4.822167080231596, "grad_norm": 0.19010865688323975, "kl": 0.053466796875, "learning_rate": 3.8491148515366064e-08, "loss": 0.0005339272320270538, "memory(GiB)": 38.15, "reward": 0.7631233930587769, "reward_std": 0.022328492254018784, "rewards/VisualizationJSONCombinedORM/mean": 0.7631233930587769, "rewards/VisualizationJSONCombinedORM/std": 0.04215414077043533, "step": 5830, "train_speed(iter/s)": 0.068032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 299.5, "completions/min_length": 245.0, "epoch": 4.822994210090984, "grad_norm": 0.18111613392829895, "kl": 0.0989990234375, "learning_rate": 3.8134378792928364e-08, "loss": 0.0009894073009490967, "memory(GiB)": 38.15, "reward": 0.39780309796333313, "reward_std": 0.04403279721736908, "rewards/VisualizationJSONCombinedORM/mean": 0.39780309796333313, "rewards/VisualizationJSONCombinedORM/std": 0.20734329521656036, "step": 5831, "train_speed(iter/s)": 0.068024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 300.3125, "completions/min_length": 232.0, "epoch": 4.823821339950372, "grad_norm": 0.19345510005950928, "kl": 0.05047607421875, "learning_rate": 3.777926387590225e-08, "loss": 0.000504031777381897, "memory(GiB)": 38.15, "reward": 0.6234786510467529, "reward_std": 0.018403373658657074, "rewards/VisualizationJSONCombinedORM/mean": 0.6234786510467529, "rewards/VisualizationJSONCombinedORM/std": 0.13651686906814575, "step": 5832, "train_speed(iter/s)": 0.06801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 322.3125, "completions/min_length": 258.0, "epoch": 4.82464846980976, "grad_norm": 0.18786673247814178, "kl": 0.061279296875, "learning_rate": 3.742580388271966e-08, "loss": 0.0006126537919044495, "memory(GiB)": 38.15, "reward": 0.5322372317314148, "reward_std": 0.04283584654331207, "rewards/VisualizationJSONCombinedORM/mean": 0.5322372317314148, "rewards/VisualizationJSONCombinedORM/std": 0.044663529843091965, "step": 5833, "train_speed(iter/s)": 0.068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 318.3125, "completions/min_length": 242.0, "epoch": 4.825475599669148, "grad_norm": 0.16446849703788757, "kl": 0.04974365234375, "learning_rate": 3.7073998931260737e-08, "loss": 0.0004971176385879517, "memory(GiB)": 38.15, "reward": 0.4871969223022461, "reward_std": 0.05729876831173897, "rewards/VisualizationJSONCombinedORM/mean": 0.4871969223022461, "rewards/VisualizationJSONCombinedORM/std": 0.06242746487259865, "step": 5834, "train_speed(iter/s)": 0.067986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 324.4375, "completions/min_length": 270.0, "epoch": 4.826302729528536, "grad_norm": 0.21293283998966217, "kl": 0.0906982421875, "learning_rate": 3.672384913885441e-08, "loss": 0.0009064376354217529, "memory(GiB)": 38.15, "reward": 0.44141510128974915, "reward_std": 0.052754372358322144, "rewards/VisualizationJSONCombinedORM/mean": 0.44141510128974915, "rewards/VisualizationJSONCombinedORM/std": 0.08488143235445023, "step": 5835, "train_speed(iter/s)": 0.067973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 386.5625, "completions/min_length": 286.0, "epoch": 4.827129859387924, "grad_norm": 0.2676204442977905, "kl": 0.05963134765625, "learning_rate": 3.637535462227782e-08, "loss": 0.0005952343344688416, "memory(GiB)": 38.15, "reward": 0.823470413684845, "reward_std": 0.08520175516605377, "rewards/VisualizationJSONCombinedORM/mean": 0.823470413684845, "rewards/VisualizationJSONCombinedORM/std": 0.08386505395174026, "step": 5836, "train_speed(iter/s)": 0.06796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 299.0625, "completions/min_length": 239.0, "epoch": 4.827956989247312, "grad_norm": 0.2602408230304718, "kl": 0.10595703125, "learning_rate": 3.602851549775521e-08, "loss": 0.001060187816619873, "memory(GiB)": 38.15, "reward": 0.5746898055076599, "reward_std": 0.02414916642010212, "rewards/VisualizationJSONCombinedORM/mean": 0.5746898055076599, "rewards/VisualizationJSONCombinedORM/std": 0.27333182096481323, "step": 5837, "train_speed(iter/s)": 0.06795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 269.75, "completions/min_length": 219.0, "epoch": 4.8287841191067, "grad_norm": 0.2177286297082901, "kl": 0.07733154296875, "learning_rate": 3.5683331880958515e-08, "loss": 0.0007733702659606934, "memory(GiB)": 38.15, "reward": 0.5642931461334229, "reward_std": 0.07010479271411896, "rewards/VisualizationJSONCombinedORM/mean": 0.5642931461334229, "rewards/VisualizationJSONCombinedORM/std": 0.12232537567615509, "step": 5838, "train_speed(iter/s)": 0.067939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 289.1875, "completions/min_length": 217.0, "epoch": 4.829611248966088, "grad_norm": 0.2502676844596863, "kl": 0.129638671875, "learning_rate": 3.533980388700842e-08, "loss": 0.0012926813215017319, "memory(GiB)": 38.15, "reward": 0.3654592037200928, "reward_std": 0.0689062550663948, "rewards/VisualizationJSONCombinedORM/mean": 0.3654592037200928, "rewards/VisualizationJSONCombinedORM/std": 0.07539816200733185, "step": 5839, "train_speed(iter/s)": 0.067932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 318.0, "completions/min_length": 262.0, "epoch": 4.830438378825476, "grad_norm": 0.18720689415931702, "kl": 0.061767578125, "learning_rate": 3.499793163047327e-08, "loss": 0.0006193146109580994, "memory(GiB)": 38.15, "reward": 0.4755884110927582, "reward_std": 0.03439474105834961, "rewards/VisualizationJSONCombinedORM/mean": 0.4755884110927582, "rewards/VisualizationJSONCombinedORM/std": 0.14141084253787994, "step": 5840, "train_speed(iter/s)": 0.067924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 315.125, "completions/min_length": 239.0, "epoch": 4.831265508684863, "grad_norm": 0.1863691657781601, "kl": 0.04351806640625, "learning_rate": 3.465771522536854e-08, "loss": 0.00043545663356781006, "memory(GiB)": 38.15, "reward": 0.3718675374984741, "reward_std": 0.024372462183237076, "rewards/VisualizationJSONCombinedORM/mean": 0.3718675374984741, "rewards/VisualizationJSONCombinedORM/std": 0.03991774469614029, "step": 5841, "train_speed(iter/s)": 0.067915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 288.1875, "completions/min_length": 231.0, "epoch": 4.8320926385442515, "grad_norm": 0.24730201065540314, "kl": 0.046630859375, "learning_rate": 3.431915478515902e-08, "loss": 0.00046598073095083237, "memory(GiB)": 38.15, "reward": 0.5375783443450928, "reward_std": 0.08280740678310394, "rewards/VisualizationJSONCombinedORM/mean": 0.5375783443450928, "rewards/VisualizationJSONCombinedORM/std": 0.17267434298992157, "step": 5842, "train_speed(iter/s)": 0.067906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 322.3125, "completions/min_length": 275.0, "epoch": 4.83291976840364, "grad_norm": 0.24004676938056946, "kl": 0.0791015625, "learning_rate": 3.398225042275494e-08, "loss": 0.0007899999618530273, "memory(GiB)": 38.15, "reward": 0.46256136894226074, "reward_std": 0.05648694932460785, "rewards/VisualizationJSONCombinedORM/mean": 0.46256136894226074, "rewards/VisualizationJSONCombinedORM/std": 0.1513621211051941, "step": 5843, "train_speed(iter/s)": 0.067902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 312.5, "completions/min_length": 256.0, "epoch": 4.833746898263027, "grad_norm": 0.17138275504112244, "kl": 0.04669189453125, "learning_rate": 3.3647002250516424e-08, "loss": 0.00046703219413757324, "memory(GiB)": 38.15, "reward": 0.7976859211921692, "reward_std": 0.04961609095335007, "rewards/VisualizationJSONCombinedORM/mean": 0.7976859211921692, "rewards/VisualizationJSONCombinedORM/std": 0.05998441204428673, "step": 5844, "train_speed(iter/s)": 0.067894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 312.25, "completions/min_length": 243.0, "epoch": 4.834574028122415, "grad_norm": 0.2277478128671646, "kl": 0.1314697265625, "learning_rate": 3.3313410380250157e-08, "loss": 0.0013180822134017944, "memory(GiB)": 38.15, "reward": 0.6113852262496948, "reward_std": 0.038220904767513275, "rewards/VisualizationJSONCombinedORM/mean": 0.6113852262496948, "rewards/VisualizationJSONCombinedORM/std": 0.21478721499443054, "step": 5845, "train_speed(iter/s)": 0.067884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 314.3125, "completions/min_length": 252.0, "epoch": 4.835401157981803, "grad_norm": 0.23832888901233673, "kl": 0.05914306640625, "learning_rate": 3.2981474923210466e-08, "loss": 0.0005912184715270996, "memory(GiB)": 38.15, "reward": 0.4773023724555969, "reward_std": 0.06601995974779129, "rewards/VisualizationJSONCombinedORM/mean": 0.4773023724555969, "rewards/VisualizationJSONCombinedORM/std": 0.09877782315015793, "step": 5846, "train_speed(iter/s)": 0.067874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 334.875, "completions/min_length": 264.0, "epoch": 4.836228287841191, "grad_norm": 0.2316155880689621, "kl": 0.07135009765625, "learning_rate": 3.265119599009936e-08, "loss": 0.0007126331329345703, "memory(GiB)": 38.15, "reward": 0.7351017594337463, "reward_std": 0.07462971657514572, "rewards/VisualizationJSONCombinedORM/mean": 0.7351017594337463, "rewards/VisualizationJSONCombinedORM/std": 0.10520631074905396, "step": 5847, "train_speed(iter/s)": 0.067866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 282.9375, "completions/min_length": 225.0, "epoch": 4.837055417700579, "grad_norm": 0.19738753139972687, "kl": 0.0821533203125, "learning_rate": 3.2322573691066505e-08, "loss": 0.0008224546909332275, "memory(GiB)": 38.15, "reward": 0.522117018699646, "reward_std": 0.07998248189687729, "rewards/VisualizationJSONCombinedORM/mean": 0.522117018699646, "rewards/VisualizationJSONCombinedORM/std": 0.19129610061645508, "step": 5848, "train_speed(iter/s)": 0.067857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 323.6875, "completions/min_length": 280.0, "epoch": 4.837882547559967, "grad_norm": 0.1680588275194168, "kl": 0.07672119140625, "learning_rate": 3.199560813570868e-08, "loss": 0.0007692774524912238, "memory(GiB)": 38.15, "reward": 0.33163982629776, "reward_std": 0.03407997637987137, "rewards/VisualizationJSONCombinedORM/mean": 0.33163982629776, "rewards/VisualizationJSONCombinedORM/std": 0.12022683769464493, "step": 5849, "train_speed(iter/s)": 0.067847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 302.1875, "completions/min_length": 233.0, "epoch": 4.838709677419355, "grad_norm": 0.17958152294158936, "kl": 0.04345703125, "learning_rate": 3.1670299433070315e-08, "loss": 0.00043431296944618225, "memory(GiB)": 38.15, "reward": 0.4493315815925598, "reward_std": 0.014684458263218403, "rewards/VisualizationJSONCombinedORM/mean": 0.4493315815925598, "rewards/VisualizationJSONCombinedORM/std": 0.13884353637695312, "step": 5850, "train_speed(iter/s)": 0.067838 }, { "epoch": 4.838709677419355, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 376.5416666666667, "eval_completions/mean_length": 318.03125, "eval_completions/min_length": 264.125, "eval_kl": 0.08196004231770833, "eval_loss": 0.0008231562678702176, "eval_reward": 0.46063978349169094, "eval_reward_std": 0.053894543496426195, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.46063978349169094, "eval_rewards/VisualizationJSONCombinedORM/std": 0.053894545785927526, "eval_runtime": 318.6472, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 326.625, "completions/min_length": 263.0, "epoch": 4.839536807278742, "grad_norm": 0.19998863339424133, "kl": 0.05517578125, "learning_rate": 3.1346647691644084e-08, "loss": 0.0005510002374649048, "memory(GiB)": 38.15, "reward": 0.7176672220230103, "reward_std": 0.0522754080593586, "rewards/VisualizationJSONCombinedORM/mean": 0.7176672220230103, "rewards/VisualizationJSONCombinedORM/std": 0.06438189744949341, "step": 5851, "train_speed(iter/s)": 0.067578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 286.8125, "completions/min_length": 238.0, "epoch": 4.840363937138131, "grad_norm": 0.23490795493125916, "kl": 0.033935546875, "learning_rate": 3.102465301936919e-08, "loss": 0.0003391839563846588, "memory(GiB)": 38.15, "reward": 0.7274256348609924, "reward_std": 0.05179784819483757, "rewards/VisualizationJSONCombinedORM/mean": 0.7274256348609924, "rewards/VisualizationJSONCombinedORM/std": 0.06268390268087387, "step": 5852, "train_speed(iter/s)": 0.06757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 283.9375, "completions/min_length": 234.0, "epoch": 4.841191066997519, "grad_norm": 0.21070009469985962, "kl": 0.06646728515625, "learning_rate": 3.0704315523631956e-08, "loss": 0.000665418803691864, "memory(GiB)": 38.15, "reward": 0.548707902431488, "reward_std": 0.023984109982848167, "rewards/VisualizationJSONCombinedORM/mean": 0.548707902431488, "rewards/VisualizationJSONCombinedORM/std": 0.22117123007774353, "step": 5853, "train_speed(iter/s)": 0.067562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 286.125, "completions/min_length": 206.0, "epoch": 4.842018196856906, "grad_norm": 0.20784547924995422, "kl": 0.0516357421875, "learning_rate": 3.038563531126637e-08, "loss": 0.0005169734358787537, "memory(GiB)": 38.15, "reward": 0.6057148575782776, "reward_std": 0.04028356075286865, "rewards/VisualizationJSONCombinedORM/mean": 0.6057148575782776, "rewards/VisualizationJSONCombinedORM/std": 0.13049758970737457, "step": 5854, "train_speed(iter/s)": 0.067555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 321.625, "completions/min_length": 231.0, "epoch": 4.8428453267162945, "grad_norm": 0.2128647118806839, "kl": 0.040283203125, "learning_rate": 3.0068612488554084e-08, "loss": 0.00040315836668014526, "memory(GiB)": 38.15, "reward": 0.6639305353164673, "reward_std": 0.08777067810297012, "rewards/VisualizationJSONCombinedORM/mean": 0.6639305353164673, "rewards/VisualizationJSONCombinedORM/std": 0.10009472072124481, "step": 5855, "train_speed(iter/s)": 0.067544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 332.0625, "completions/min_length": 242.0, "epoch": 4.843672456575682, "grad_norm": 0.1843010038137436, "kl": 0.085296630859375, "learning_rate": 2.9753247161223852e-08, "loss": 0.0008520409464836121, "memory(GiB)": 38.15, "reward": 0.3661108613014221, "reward_std": 0.051086731255054474, "rewards/VisualizationJSONCombinedORM/mean": 0.3661108613014221, "rewards/VisualizationJSONCombinedORM/std": 0.1880510151386261, "step": 5856, "train_speed(iter/s)": 0.067532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 321.8125, "completions/min_length": 231.0, "epoch": 4.84449958643507, "grad_norm": 0.19077113270759583, "kl": 0.152587890625, "learning_rate": 2.9439539434450994e-08, "loss": 0.001524273306131363, "memory(GiB)": 38.15, "reward": 0.42557960748672485, "reward_std": 0.07020702213048935, "rewards/VisualizationJSONCombinedORM/mean": 0.42557960748672485, "rewards/VisualizationJSONCombinedORM/std": 0.08772390335798264, "step": 5857, "train_speed(iter/s)": 0.067524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 303.6875, "completions/min_length": 222.0, "epoch": 4.845326716294458, "grad_norm": 0.17903132736682892, "kl": 0.044921875, "learning_rate": 2.9127489412859033e-08, "loss": 0.00044957175850868225, "memory(GiB)": 38.15, "reward": 0.6379980444908142, "reward_std": 0.05531960725784302, "rewards/VisualizationJSONCombinedORM/mean": 0.6379980444908142, "rewards/VisualizationJSONCombinedORM/std": 0.09317421168088913, "step": 5858, "train_speed(iter/s)": 0.06752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 267.5, "completions/min_length": 237.0, "epoch": 4.846153846153846, "grad_norm": 0.1606108397245407, "kl": 0.0987548828125, "learning_rate": 2.8817097200518064e-08, "loss": 0.0009878277778625488, "memory(GiB)": 38.15, "reward": 0.36608999967575073, "reward_std": 0.03615963086485863, "rewards/VisualizationJSONCombinedORM/mean": 0.36608999967575073, "rewards/VisualizationJSONCombinedORM/std": 0.15086886286735535, "step": 5859, "train_speed(iter/s)": 0.06751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 295.75, "completions/min_length": 232.0, "epoch": 4.846980976013234, "grad_norm": 0.24709422886371613, "kl": 0.068115234375, "learning_rate": 2.850836290094472e-08, "loss": 0.000681605190038681, "memory(GiB)": 38.15, "reward": 0.5147110223770142, "reward_std": 0.07088429480791092, "rewards/VisualizationJSONCombinedORM/mean": 0.5147110223770142, "rewards/VisualizationJSONCombinedORM/std": 0.257455050945282, "step": 5860, "train_speed(iter/s)": 0.067504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 306.0625, "completions/min_length": 240.0, "epoch": 4.847808105872622, "grad_norm": 0.16131077706813812, "kl": 0.046630859375, "learning_rate": 2.8201286617103863e-08, "loss": 0.0004659220576286316, "memory(GiB)": 38.15, "reward": 0.6724523305892944, "reward_std": 0.07299286127090454, "rewards/VisualizationJSONCombinedORM/mean": 0.6724523305892944, "rewards/VisualizationJSONCombinedORM/std": 0.14639008045196533, "step": 5861, "train_speed(iter/s)": 0.067498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 319.5625, "completions/min_length": 252.0, "epoch": 4.84863523573201, "grad_norm": 0.19730980694293976, "kl": 0.084716796875, "learning_rate": 2.7895868451406904e-08, "loss": 0.0008453056216239929, "memory(GiB)": 38.15, "reward": 0.4138660728931427, "reward_std": 0.06422501802444458, "rewards/VisualizationJSONCombinedORM/mean": 0.4138660728931427, "rewards/VisualizationJSONCombinedORM/std": 0.12181735783815384, "step": 5862, "train_speed(iter/s)": 0.06749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 301.4375, "completions/min_length": 254.0, "epoch": 4.849462365591398, "grad_norm": 0.20578132569789886, "kl": 0.175048828125, "learning_rate": 2.759210850571181e-08, "loss": 0.0017481204122304916, "memory(GiB)": 38.15, "reward": 0.7305563688278198, "reward_std": 0.06522023677825928, "rewards/VisualizationJSONCombinedORM/mean": 0.7305563688278198, "rewards/VisualizationJSONCombinedORM/std": 0.09047682583332062, "step": 5863, "train_speed(iter/s)": 0.067483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 307.5625, "completions/min_length": 246.0, "epoch": 4.850289495450786, "grad_norm": 0.11349350214004517, "kl": 0.0401611328125, "learning_rate": 2.7290006881324772e-08, "loss": 0.00040069042006507516, "memory(GiB)": 38.15, "reward": 0.5484122037887573, "reward_std": 0.006949794944375753, "rewards/VisualizationJSONCombinedORM/mean": 0.5484122037887573, "rewards/VisualizationJSONCombinedORM/std": 0.19852043688297272, "step": 5864, "train_speed(iter/s)": 0.067474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 324.9375, "completions/min_length": 240.0, "epoch": 4.851116625310174, "grad_norm": 0.21788565814495087, "kl": 0.052490234375, "learning_rate": 2.6989563678996856e-08, "loss": 0.0005258228629827499, "memory(GiB)": 38.15, "reward": 0.4783354699611664, "reward_std": 0.05970548465847969, "rewards/VisualizationJSONCombinedORM/mean": 0.4783354699611664, "rewards/VisualizationJSONCombinedORM/std": 0.21858270466327667, "step": 5865, "train_speed(iter/s)": 0.067461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 307.6875, "completions/min_length": 241.0, "epoch": 4.851943755169562, "grad_norm": 0.25778183341026306, "kl": 0.0472412109375, "learning_rate": 2.669077899892847e-08, "loss": 0.0004719868302345276, "memory(GiB)": 38.15, "reward": 0.5798454880714417, "reward_std": 0.05562974140048027, "rewards/VisualizationJSONCombinedORM/mean": 0.5798454880714417, "rewards/VisualizationJSONCombinedORM/std": 0.1476617455482483, "step": 5866, "train_speed(iter/s)": 0.067453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 320.6875, "completions/min_length": 247.0, "epoch": 4.852770885028949, "grad_norm": 0.20854802429676056, "kl": 0.1258544921875, "learning_rate": 2.63936529407649e-08, "loss": 0.0012568235397338867, "memory(GiB)": 38.15, "reward": 0.45104122161865234, "reward_std": 0.043812721967697144, "rewards/VisualizationJSONCombinedORM/mean": 0.45104122161865234, "rewards/VisualizationJSONCombinedORM/std": 0.15825723111629486, "step": 5867, "train_speed(iter/s)": 0.067442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 339.625, "completions/min_length": 285.0, "epoch": 4.8535980148883375, "grad_norm": 0.16125714778900146, "kl": 0.03460693359375, "learning_rate": 2.6098185603599668e-08, "loss": 0.0003449879586696625, "memory(GiB)": 38.15, "reward": 0.5577879548072815, "reward_std": 0.021376898512244225, "rewards/VisualizationJSONCombinedORM/mean": 0.5577879548072815, "rewards/VisualizationJSONCombinedORM/std": 0.15526026487350464, "step": 5868, "train_speed(iter/s)": 0.067434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 336.75, "completions/min_length": 254.0, "epoch": 4.854425144747726, "grad_norm": 0.1811482459306717, "kl": 0.0772705078125, "learning_rate": 2.5804377085972278e-08, "loss": 0.000773012638092041, "memory(GiB)": 38.15, "reward": 0.5435256361961365, "reward_std": 0.032507628202438354, "rewards/VisualizationJSONCombinedORM/mean": 0.5435256361961365, "rewards/VisualizationJSONCombinedORM/std": 0.2015850841999054, "step": 5869, "train_speed(iter/s)": 0.067425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 300.0, "completions/min_length": 233.0, "epoch": 4.855252274607113, "grad_norm": 0.20305639505386353, "kl": 0.045654296875, "learning_rate": 2.551222748586879e-08, "loss": 0.00045683979988098145, "memory(GiB)": 38.15, "reward": 0.744398832321167, "reward_std": 0.07562266290187836, "rewards/VisualizationJSONCombinedORM/mean": 0.744398832321167, "rewards/VisualizationJSONCombinedORM/std": 0.08257374912500381, "step": 5870, "train_speed(iter/s)": 0.067417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 327.0, "completions/min_length": 267.0, "epoch": 4.856079404466501, "grad_norm": 0.18043668568134308, "kl": 0.141357421875, "learning_rate": 2.522173690072349e-08, "loss": 0.0014148931950330734, "memory(GiB)": 38.15, "reward": 0.4158933758735657, "reward_std": 0.05100256949663162, "rewards/VisualizationJSONCombinedORM/mean": 0.4158933758735657, "rewards/VisualizationJSONCombinedORM/std": 0.0889919176697731, "step": 5871, "train_speed(iter/s)": 0.067406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 298.0, "completions/min_length": 245.0, "epoch": 4.856906534325889, "grad_norm": 0.18400295078754425, "kl": 0.157958984375, "learning_rate": 2.4932905427415553e-08, "loss": 0.0015791505575180054, "memory(GiB)": 38.15, "reward": 0.629155695438385, "reward_std": 0.06043166667222977, "rewards/VisualizationJSONCombinedORM/mean": 0.629155695438385, "rewards/VisualizationJSONCombinedORM/std": 0.0976853147149086, "step": 5872, "train_speed(iter/s)": 0.067396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 279.8125, "completions/min_length": 218.0, "epoch": 4.857733664185277, "grad_norm": 0.2143920361995697, "kl": 0.0684814453125, "learning_rate": 2.4645733162271255e-08, "loss": 0.000685572624206543, "memory(GiB)": 38.15, "reward": 0.5362165570259094, "reward_std": 0.04122301936149597, "rewards/VisualizationJSONCombinedORM/mean": 0.5362165570259094, "rewards/VisualizationJSONCombinedORM/std": 0.11639416217803955, "step": 5873, "train_speed(iter/s)": 0.067389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 279.3125, "completions/min_length": 213.0, "epoch": 4.858560794044665, "grad_norm": 0.3276638388633728, "kl": 0.15081787109375, "learning_rate": 2.4360220201064544e-08, "loss": 0.0014990754425525665, "memory(GiB)": 38.15, "reward": 0.5292472243309021, "reward_std": 0.1108274906873703, "rewards/VisualizationJSONCombinedORM/mean": 0.5292472243309021, "rewards/VisualizationJSONCombinedORM/std": 0.14869891107082367, "step": 5874, "train_speed(iter/s)": 0.067379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 314.5625, "completions/min_length": 199.0, "epoch": 4.859387923904053, "grad_norm": 0.17161840200424194, "kl": 0.0404052734375, "learning_rate": 2.4076366639015914e-08, "loss": 0.0004042908549308777, "memory(GiB)": 38.15, "reward": 0.5333468914031982, "reward_std": 0.05033678561449051, "rewards/VisualizationJSONCombinedORM/mean": 0.5333468914031982, "rewards/VisualizationJSONCombinedORM/std": 0.055980831384658813, "step": 5875, "train_speed(iter/s)": 0.067368 }, { "epoch": 4.859387923904053, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 373.9166666666667, "eval_completions/mean_length": 316.4635416666667, "eval_completions/min_length": 262.7083333333333, "eval_kl": 0.06769816080729167, "eval_loss": 0.00067802396370098, "eval_reward": 0.45941165400048095, "eval_reward_std": 0.049844434135593474, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45941165400048095, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04984443607584884, "eval_runtime": 317.3146, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 5875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 322.5625, "completions/min_length": 244.0, "epoch": 4.860215053763441, "grad_norm": 0.19610533118247986, "kl": 0.0401611328125, "learning_rate": 2.3794172570790198e-08, "loss": 0.00040238723158836365, "memory(GiB)": 38.15, "reward": 0.5459355115890503, "reward_std": 0.07910221070051193, "rewards/VisualizationJSONCombinedORM/mean": 0.5459355115890503, "rewards/VisualizationJSONCombinedORM/std": 0.13645169138908386, "step": 5876, "train_speed(iter/s)": 0.067115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 307.3125, "completions/min_length": 225.0, "epoch": 4.861042183622828, "grad_norm": 0.30510470271110535, "kl": 0.09161376953125, "learning_rate": 2.351363809050211e-08, "loss": 0.0009177401661872864, "memory(GiB)": 38.15, "reward": 0.6935211420059204, "reward_std": 0.05112361162900925, "rewards/VisualizationJSONCombinedORM/mean": 0.6935211420059204, "rewards/VisualizationJSONCombinedORM/std": 0.1798548847436905, "step": 5877, "train_speed(iter/s)": 0.067105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 339.5625, "completions/min_length": 257.0, "epoch": 4.861869313482217, "grad_norm": 0.4303778111934662, "kl": 0.0599365234375, "learning_rate": 2.323476329171015e-08, "loss": 0.0005985572934150696, "memory(GiB)": 38.15, "reward": 0.6055452227592468, "reward_std": 0.05478315427899361, "rewards/VisualizationJSONCombinedORM/mean": 0.6055452227592468, "rewards/VisualizationJSONCombinedORM/std": 0.08974367380142212, "step": 5878, "train_speed(iter/s)": 0.067094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 303.25, "completions/min_length": 247.0, "epoch": 4.862696443341605, "grad_norm": 0.2405054271221161, "kl": 0.06396484375, "learning_rate": 2.2957548267421024e-08, "loss": 0.0006411224603652954, "memory(GiB)": 38.15, "reward": 0.48176780343055725, "reward_std": 0.060882195830345154, "rewards/VisualizationJSONCombinedORM/mean": 0.48176780343055725, "rewards/VisualizationJSONCombinedORM/std": 0.06535327434539795, "step": 5879, "train_speed(iter/s)": 0.067088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 335.625, "completions/min_length": 278.0, "epoch": 4.863523573200992, "grad_norm": 0.20099228620529175, "kl": 0.0565185546875, "learning_rate": 2.26819931100869e-08, "loss": 0.0005648881196975708, "memory(GiB)": 38.15, "reward": 0.4238824248313904, "reward_std": 0.038502879440784454, "rewards/VisualizationJSONCombinedORM/mean": 0.4238824248313904, "rewards/VisualizationJSONCombinedORM/std": 0.1394645869731903, "step": 5880, "train_speed(iter/s)": 0.06708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/mean_length": 311.6875, "completions/min_length": 223.0, "epoch": 4.8643507030603805, "grad_norm": 0.20260795950889587, "kl": 0.2315673828125, "learning_rate": 2.2408097911606473e-08, "loss": 0.002314191311597824, "memory(GiB)": 38.15, "reward": 0.6883057951927185, "reward_std": 0.06869728863239288, "rewards/VisualizationJSONCombinedORM/mean": 0.6883057951927185, "rewards/VisualizationJSONCombinedORM/std": 0.0934055969119072, "step": 5881, "train_speed(iter/s)": 0.067075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 320.4375, "completions/min_length": 236.0, "epoch": 4.865177832919768, "grad_norm": 0.17500539124011993, "kl": 0.085205078125, "learning_rate": 2.2135862763325577e-08, "loss": 0.0008518621325492859, "memory(GiB)": 38.15, "reward": 0.6168823838233948, "reward_std": 0.051044464111328125, "rewards/VisualizationJSONCombinedORM/mean": 0.6168823838233948, "rewards/VisualizationJSONCombinedORM/std": 0.06414469331502914, "step": 5882, "train_speed(iter/s)": 0.067065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 300.6875, "completions/min_length": 231.0, "epoch": 4.866004962779156, "grad_norm": 0.29683801531791687, "kl": 0.143310546875, "learning_rate": 2.1865287756036025e-08, "loss": 0.0014321357011795044, "memory(GiB)": 38.15, "reward": 0.42625319957733154, "reward_std": 0.054028723388910294, "rewards/VisualizationJSONCombinedORM/mean": 0.42625319957733154, "rewards/VisualizationJSONCombinedORM/std": 0.12169316411018372, "step": 5883, "train_speed(iter/s)": 0.06706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 279.625, "completions/min_length": 237.0, "epoch": 4.866832092638544, "grad_norm": 0.18122035264968872, "kl": 0.04559326171875, "learning_rate": 2.159637297997508e-08, "loss": 0.00045584701001644135, "memory(GiB)": 38.15, "reward": 0.3514743745326996, "reward_std": 0.028982356190681458, "rewards/VisualizationJSONCombinedORM/mean": 0.3514743745326996, "rewards/VisualizationJSONCombinedORM/std": 0.04172162711620331, "step": 5884, "train_speed(iter/s)": 0.067053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 304.25, "completions/min_length": 251.0, "epoch": 4.867659222497932, "grad_norm": 0.20470201969146729, "kl": 0.17822265625, "learning_rate": 2.1329118524827662e-08, "loss": 0.001782342791557312, "memory(GiB)": 38.15, "reward": 0.3102187514305115, "reward_std": 0.03297469764947891, "rewards/VisualizationJSONCombinedORM/mean": 0.3102187514305115, "rewards/VisualizationJSONCombinedORM/std": 0.033424533903598785, "step": 5885, "train_speed(iter/s)": 0.067045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 297.6875, "completions/min_length": 234.0, "epoch": 4.86848635235732, "grad_norm": 0.18310141563415527, "kl": 0.0877685546875, "learning_rate": 2.1063524479723594e-08, "loss": 0.0008764266967773438, "memory(GiB)": 38.15, "reward": 0.551827073097229, "reward_std": 0.06400799006223679, "rewards/VisualizationJSONCombinedORM/mean": 0.551827073097229, "rewards/VisualizationJSONCombinedORM/std": 0.28821244835853577, "step": 5886, "train_speed(iter/s)": 0.067036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 322.125, "completions/min_length": 253.0, "epoch": 4.869313482216708, "grad_norm": 0.32758021354675293, "kl": 0.0670166015625, "learning_rate": 2.0799590933241465e-08, "loss": 0.0006709620356559753, "memory(GiB)": 38.15, "reward": 0.6090173125267029, "reward_std": 0.060060054063797, "rewards/VisualizationJSONCombinedORM/mean": 0.6090173125267029, "rewards/VisualizationJSONCombinedORM/std": 0.16496388614177704, "step": 5887, "train_speed(iter/s)": 0.067027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 311.625, "completions/min_length": 248.0, "epoch": 4.870140612076096, "grad_norm": 0.2641761600971222, "kl": 0.0662841796875, "learning_rate": 2.0537317973402526e-08, "loss": 0.0006629601120948792, "memory(GiB)": 38.15, "reward": 0.57296222448349, "reward_std": 0.09746727347373962, "rewards/VisualizationJSONCombinedORM/mean": 0.57296222448349, "rewards/VisualizationJSONCombinedORM/std": 0.10846726596355438, "step": 5888, "train_speed(iter/s)": 0.067019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 314.375, "completions/min_length": 245.0, "epoch": 4.870967741935484, "grad_norm": 0.23885557055473328, "kl": 0.05535888671875, "learning_rate": 2.0276705687676813e-08, "loss": 0.0005531646311283112, "memory(GiB)": 38.15, "reward": 0.6652501821517944, "reward_std": 0.04079456627368927, "rewards/VisualizationJSONCombinedORM/mean": 0.6652501821517944, "rewards/VisualizationJSONCombinedORM/std": 0.16581867635250092, "step": 5889, "train_speed(iter/s)": 0.067007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 318.125, "completions/min_length": 257.0, "epoch": 4.871794871794872, "grad_norm": 0.1800500452518463, "kl": 0.054443359375, "learning_rate": 2.0017754162979795e-08, "loss": 0.0005440860986709595, "memory(GiB)": 38.15, "reward": 0.4084550142288208, "reward_std": 0.03815772011876106, "rewards/VisualizationJSONCombinedORM/mean": 0.4084550142288208, "rewards/VisualizationJSONCombinedORM/std": 0.15253855288028717, "step": 5890, "train_speed(iter/s)": 0.066999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 291.5625, "completions/min_length": 236.0, "epoch": 4.87262200165426, "grad_norm": 0.18515482544898987, "kl": 0.15399169921875, "learning_rate": 1.9760463485672954e-08, "loss": 0.0015409477055072784, "memory(GiB)": 38.15, "reward": 0.6401983499526978, "reward_std": 0.0720151737332344, "rewards/VisualizationJSONCombinedORM/mean": 0.6401983499526978, "rewards/VisualizationJSONCombinedORM/std": 0.22003237903118134, "step": 5891, "train_speed(iter/s)": 0.06699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 310.9375, "completions/min_length": 241.0, "epoch": 4.873449131513648, "grad_norm": 0.23107662796974182, "kl": 0.1251220703125, "learning_rate": 1.950483374156431e-08, "loss": 0.0012528784573078156, "memory(GiB)": 38.15, "reward": 0.30611875653266907, "reward_std": 0.02531353011727333, "rewards/VisualizationJSONCombinedORM/mean": 0.30611875653266907, "rewards/VisualizationJSONCombinedORM/std": 0.025937287136912346, "step": 5892, "train_speed(iter/s)": 0.066981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 294.0625, "completions/min_length": 220.0, "epoch": 4.874276261373035, "grad_norm": 0.252520352602005, "kl": 0.06683349609375, "learning_rate": 1.9250865015906784e-08, "loss": 0.0006687603890895844, "memory(GiB)": 38.15, "reward": 0.6241265535354614, "reward_std": 0.07386958599090576, "rewards/VisualizationJSONCombinedORM/mean": 0.6241265535354614, "rewards/VisualizationJSONCombinedORM/std": 0.12249764055013657, "step": 5893, "train_speed(iter/s)": 0.066976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 289.375, "completions/min_length": 232.0, "epoch": 4.8751033912324235, "grad_norm": 0.21945197880268097, "kl": 0.06097412109375, "learning_rate": 1.8998557393400397e-08, "loss": 0.0006095618009567261, "memory(GiB)": 38.15, "reward": 0.3857543170452118, "reward_std": 0.05018911510705948, "rewards/VisualizationJSONCombinedORM/mean": 0.3857543170452118, "rewards/VisualizationJSONCombinedORM/std": 0.08221683651208878, "step": 5894, "train_speed(iter/s)": 0.066963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 303.5, "completions/min_length": 243.0, "epoch": 4.875930521091812, "grad_norm": 0.20841242372989655, "kl": 0.0740966796875, "learning_rate": 1.8747910958191173e-08, "loss": 0.0007401928305625916, "memory(GiB)": 38.15, "reward": 0.37147632241249084, "reward_std": 0.04157743602991104, "rewards/VisualizationJSONCombinedORM/mean": 0.37147632241249084, "rewards/VisualizationJSONCombinedORM/std": 0.061980120837688446, "step": 5895, "train_speed(iter/s)": 0.066953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/mean_length": 320.25, "completions/min_length": 243.0, "epoch": 4.876757650951199, "grad_norm": 0.20026624202728271, "kl": 0.0836181640625, "learning_rate": 1.849892579387058e-08, "loss": 0.0008358955383300781, "memory(GiB)": 38.15, "reward": 0.7110613584518433, "reward_std": 0.07279447466135025, "rewards/VisualizationJSONCombinedORM/mean": 0.7110613584518433, "rewards/VisualizationJSONCombinedORM/std": 0.07506642490625381, "step": 5896, "train_speed(iter/s)": 0.066939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 314.5, "completions/min_length": 229.0, "epoch": 4.877584780810587, "grad_norm": 0.16920804977416992, "kl": 0.0601806640625, "learning_rate": 1.825160198347664e-08, "loss": 0.0006024222820997238, "memory(GiB)": 38.15, "reward": 0.6558442115783691, "reward_std": 0.03981940075755119, "rewards/VisualizationJSONCombinedORM/mean": 0.6558442115783691, "rewards/VisualizationJSONCombinedORM/std": 0.19575364887714386, "step": 5897, "train_speed(iter/s)": 0.066931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 295.0, "completions/min_length": 235.0, "epoch": 4.878411910669975, "grad_norm": 0.21611963212490082, "kl": 0.1131591796875, "learning_rate": 1.8005939609492817e-08, "loss": 0.0011278130114078522, "memory(GiB)": 38.15, "reward": 0.5337594747543335, "reward_std": 0.03927275538444519, "rewards/VisualizationJSONCombinedORM/mean": 0.5337594747543335, "rewards/VisualizationJSONCombinedORM/std": 0.1297682374715805, "step": 5898, "train_speed(iter/s)": 0.066924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/mean_length": 289.4375, "completions/min_length": 229.0, "epoch": 4.879239040529363, "grad_norm": 0.21815477311611176, "kl": 0.104736328125, "learning_rate": 1.7761938753848574e-08, "loss": 0.001048251986503601, "memory(GiB)": 38.15, "reward": 0.5326417684555054, "reward_std": 0.04848536103963852, "rewards/VisualizationJSONCombinedORM/mean": 0.5326417684555054, "rewards/VisualizationJSONCombinedORM/std": 0.05354661867022514, "step": 5899, "train_speed(iter/s)": 0.066917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 321.25, "completions/min_length": 234.0, "epoch": 4.880066170388751, "grad_norm": 0.18693596124649048, "kl": 0.11846923828125, "learning_rate": 1.7519599497919926e-08, "loss": 0.0011833885218948126, "memory(GiB)": 38.15, "reward": 0.4480210542678833, "reward_std": 0.041360609233379364, "rewards/VisualizationJSONCombinedORM/mean": 0.4480210542678833, "rewards/VisualizationJSONCombinedORM/std": 0.060447417199611664, "step": 5900, "train_speed(iter/s)": 0.066909 }, { "epoch": 4.880066170388751, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 366.375, "eval_completions/mean_length": 312.1770833333333, "eval_completions/min_length": 264.9166666666667, "eval_kl": 0.092437744140625, "eval_loss": 0.000923637009691447, "eval_reward": 0.45013903205593425, "eval_reward_std": 0.04881275615965327, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45013903205593425, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04881275813871374, "eval_runtime": 312.8372, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 5900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 313.75, "completions/min_length": 250.0, "epoch": 4.880893300248139, "grad_norm": 0.23077166080474854, "kl": 0.04150390625, "learning_rate": 1.7278921922527224e-08, "loss": 0.0004142923280596733, "memory(GiB)": 38.15, "reward": 0.5914239883422852, "reward_std": 0.07423517107963562, "rewards/VisualizationJSONCombinedORM/mean": 0.5914239883422852, "rewards/VisualizationJSONCombinedORM/std": 0.07356952875852585, "step": 5901, "train_speed(iter/s)": 0.066663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 296.0625, "completions/min_length": 241.0, "epoch": 4.881720430107527, "grad_norm": 0.21256023645401, "kl": 0.0906982421875, "learning_rate": 1.703990610793793e-08, "loss": 0.0009048469364643097, "memory(GiB)": 38.15, "reward": 0.3954373598098755, "reward_std": 0.03165587782859802, "rewards/VisualizationJSONCombinedORM/mean": 0.3954373598098755, "rewards/VisualizationJSONCombinedORM/std": 0.1485457718372345, "step": 5902, "train_speed(iter/s)": 0.066655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 274.9375, "completions/min_length": 234.0, "epoch": 4.882547559966914, "grad_norm": 0.22015278041362762, "kl": 0.06805419921875, "learning_rate": 1.6802552133865502e-08, "loss": 0.0006809160113334656, "memory(GiB)": 38.15, "reward": 0.5383546352386475, "reward_std": 0.07781830430030823, "rewards/VisualizationJSONCombinedORM/mean": 0.5383546352386475, "rewards/VisualizationJSONCombinedORM/std": 0.11103779077529907, "step": 5903, "train_speed(iter/s)": 0.066643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 326.0625, "completions/min_length": 278.0, "epoch": 4.883374689826303, "grad_norm": 0.19214105606079102, "kl": 0.0792236328125, "learning_rate": 1.6566860079468284e-08, "loss": 0.0007936421898193657, "memory(GiB)": 38.15, "reward": 0.6118205785751343, "reward_std": 0.01906772330403328, "rewards/VisualizationJSONCombinedORM/mean": 0.6118205785751343, "rewards/VisualizationJSONCombinedORM/std": 0.05785570293664932, "step": 5904, "train_speed(iter/s)": 0.066637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 307.25, "completions/min_length": 237.0, "epoch": 4.884201819685691, "grad_norm": 0.15956440567970276, "kl": 0.0509033203125, "learning_rate": 1.6332830023350065e-08, "loss": 0.0005097463726997375, "memory(GiB)": 38.15, "reward": 0.8092032670974731, "reward_std": 0.05301595479249954, "rewards/VisualizationJSONCombinedORM/mean": 0.8092032670974731, "rewards/VisualizationJSONCombinedORM/std": 0.07143900543451309, "step": 5905, "train_speed(iter/s)": 0.066628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 334.625, "completions/min_length": 285.0, "epoch": 4.885028949545078, "grad_norm": 0.18041114509105682, "kl": 0.1312255859375, "learning_rate": 1.610046204356175e-08, "loss": 0.001312538981437683, "memory(GiB)": 38.15, "reward": 0.6991572976112366, "reward_std": 0.07588718831539154, "rewards/VisualizationJSONCombinedORM/mean": 0.6991572976112366, "rewards/VisualizationJSONCombinedORM/std": 0.14681649208068848, "step": 5906, "train_speed(iter/s)": 0.066619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 283.9375, "completions/min_length": 229.0, "epoch": 4.8858560794044665, "grad_norm": 0.18552866578102112, "kl": 0.05206298828125, "learning_rate": 1.5869756217598563e-08, "loss": 0.0005203112959861755, "memory(GiB)": 38.15, "reward": 0.6267861723899841, "reward_std": 0.045649200677871704, "rewards/VisualizationJSONCombinedORM/mean": 0.6267861723899841, "rewards/VisualizationJSONCombinedORM/std": 0.10714515298604965, "step": 5907, "train_speed(iter/s)": 0.066612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 331.5625, "completions/min_length": 267.0, "epoch": 4.886683209263855, "grad_norm": 0.14743874967098236, "kl": 0.15924072265625, "learning_rate": 1.56407126224023e-08, "loss": 0.0015901140868663788, "memory(GiB)": 38.15, "reward": 0.39523935317993164, "reward_std": 0.056661996990442276, "rewards/VisualizationJSONCombinedORM/mean": 0.39523935317993164, "rewards/VisualizationJSONCombinedORM/std": 0.1537485122680664, "step": 5908, "train_speed(iter/s)": 0.066605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 278.0625, "completions/min_length": 220.0, "epoch": 4.887510339123242, "grad_norm": 0.20866329967975616, "kl": 0.0679931640625, "learning_rate": 1.541333133436018e-08, "loss": 0.0006801849231123924, "memory(GiB)": 38.15, "reward": 0.6251674294471741, "reward_std": 0.10254303365945816, "rewards/VisualizationJSONCombinedORM/mean": 0.6251674294471741, "rewards/VisualizationJSONCombinedORM/std": 0.1965450644493103, "step": 5909, "train_speed(iter/s)": 0.066599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 300.9375, "completions/min_length": 238.0, "epoch": 4.88833746898263, "grad_norm": 0.23545557260513306, "kl": 0.04888916015625, "learning_rate": 1.5187612429304887e-08, "loss": 0.0004884973168373108, "memory(GiB)": 38.15, "reward": 0.7295321226119995, "reward_std": 0.03869141638278961, "rewards/VisualizationJSONCombinedORM/mean": 0.7295321226119995, "rewards/VisualizationJSONCombinedORM/std": 0.1607811003923416, "step": 5910, "train_speed(iter/s)": 0.06659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 328.3125, "completions/min_length": 240.0, "epoch": 4.889164598842018, "grad_norm": 0.1951264590024948, "kl": 0.06414794921875, "learning_rate": 1.4963555982514532e-08, "loss": 0.0006431182846426964, "memory(GiB)": 38.15, "reward": 0.3848942518234253, "reward_std": 0.009729281067848206, "rewards/VisualizationJSONCombinedORM/mean": 0.3848942518234253, "rewards/VisualizationJSONCombinedORM/std": 0.1420394480228424, "step": 5911, "train_speed(iter/s)": 0.066584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 273.9375, "completions/min_length": 227.0, "epoch": 4.889991728701406, "grad_norm": 0.1437881886959076, "kl": 0.042816162109375, "learning_rate": 1.4741162068712677e-08, "loss": 0.0004274696111679077, "memory(GiB)": 38.15, "reward": 0.5110763311386108, "reward_std": 0.035423122346401215, "rewards/VisualizationJSONCombinedORM/mean": 0.5110763311386108, "rewards/VisualizationJSONCombinedORM/std": 0.21201474964618683, "step": 5912, "train_speed(iter/s)": 0.066576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 312.1875, "completions/min_length": 253.0, "epoch": 4.890818858560794, "grad_norm": 0.18002790212631226, "kl": 0.1324462890625, "learning_rate": 1.4520430762069438e-08, "loss": 0.0013254787772893906, "memory(GiB)": 38.15, "reward": 0.6549302339553833, "reward_std": 0.058201104402542114, "rewards/VisualizationJSONCombinedORM/mean": 0.6549302339553833, "rewards/VisualizationJSONCombinedORM/std": 0.11838000267744064, "step": 5913, "train_speed(iter/s)": 0.066569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 293.75, "completions/min_length": 238.0, "epoch": 4.891645988420182, "grad_norm": 0.18969249725341797, "kl": 0.075927734375, "learning_rate": 1.430136213619926e-08, "loss": 0.0007584244012832642, "memory(GiB)": 38.15, "reward": 0.49927717447280884, "reward_std": 0.027090877294540405, "rewards/VisualizationJSONCombinedORM/mean": 0.49927717447280884, "rewards/VisualizationJSONCombinedORM/std": 0.25870972871780396, "step": 5914, "train_speed(iter/s)": 0.066563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 306.1875, "completions/min_length": 251.0, "epoch": 4.89247311827957, "grad_norm": 0.2019696831703186, "kl": 0.06390380859375, "learning_rate": 1.408395626416259e-08, "loss": 0.0006392896175384521, "memory(GiB)": 38.15, "reward": 0.5126060843467712, "reward_std": 0.05403273552656174, "rewards/VisualizationJSONCombinedORM/mean": 0.5126060843467712, "rewards/VisualizationJSONCombinedORM/std": 0.23769360780715942, "step": 5915, "train_speed(iter/s)": 0.066556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 329.875, "completions/min_length": 248.0, "epoch": 4.893300248138958, "grad_norm": 0.2941281497478485, "kl": 0.0662841796875, "learning_rate": 1.3868213218465876e-08, "loss": 0.00066409632563591, "memory(GiB)": 38.15, "reward": 0.723974347114563, "reward_std": 0.08143416792154312, "rewards/VisualizationJSONCombinedORM/mean": 0.723974347114563, "rewards/VisualizationJSONCombinedORM/std": 0.07926557213068008, "step": 5916, "train_speed(iter/s)": 0.066546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/mean_length": 341.0625, "completions/min_length": 297.0, "epoch": 4.894127377998346, "grad_norm": 0.1775728017091751, "kl": 0.03717041015625, "learning_rate": 1.3654133071059894e-08, "loss": 0.00037195533514022827, "memory(GiB)": 38.15, "reward": 0.6310588717460632, "reward_std": 0.05503913760185242, "rewards/VisualizationJSONCombinedORM/mean": 0.6310588717460632, "rewards/VisualizationJSONCombinedORM/std": 0.11092952638864517, "step": 5917, "train_speed(iter/s)": 0.066538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 339.9375, "completions/min_length": 279.0, "epoch": 4.894954507857734, "grad_norm": 0.16140346229076385, "kl": 0.06488037109375, "learning_rate": 1.3441715893341422e-08, "loss": 0.0006499961018562317, "memory(GiB)": 38.15, "reward": 0.59294593334198, "reward_std": 0.046087559312582016, "rewards/VisualizationJSONCombinedORM/mean": 0.59294593334198, "rewards/VisualizationJSONCombinedORM/std": 0.139382466673851, "step": 5918, "train_speed(iter/s)": 0.066529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 319.375, "completions/min_length": 259.0, "epoch": 4.895781637717121, "grad_norm": 0.2546910047531128, "kl": 0.069091796875, "learning_rate": 1.3230961756152683e-08, "loss": 0.0006902497261762619, "memory(GiB)": 38.15, "reward": 0.5978392362594604, "reward_std": 0.03954155370593071, "rewards/VisualizationJSONCombinedORM/mean": 0.5978392362594604, "rewards/VisualizationJSONCombinedORM/std": 0.105378158390522, "step": 5919, "train_speed(iter/s)": 0.066518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 284.4375, "completions/min_length": 213.0, "epoch": 4.8966087675765095, "grad_norm": 0.2281571477651596, "kl": 0.0794677734375, "learning_rate": 1.3021870729780783e-08, "loss": 0.000793948769569397, "memory(GiB)": 38.15, "reward": 0.4430444836616516, "reward_std": 0.05277864262461662, "rewards/VisualizationJSONCombinedORM/mean": 0.4430444836616516, "rewards/VisualizationJSONCombinedORM/std": 0.09227995574474335, "step": 5920, "train_speed(iter/s)": 0.066508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 311.75, "completions/min_length": 233.0, "epoch": 4.897435897435898, "grad_norm": 0.17995741963386536, "kl": 0.03668212890625, "learning_rate": 1.2814442883959388e-08, "loss": 0.00036670267581939697, "memory(GiB)": 38.15, "reward": 0.5630550384521484, "reward_std": 0.07955902069807053, "rewards/VisualizationJSONCombinedORM/mean": 0.5630550384521484, "rewards/VisualizationJSONCombinedORM/std": 0.18967077136039734, "step": 5921, "train_speed(iter/s)": 0.066497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 305.3125, "completions/min_length": 254.0, "epoch": 4.898263027295285, "grad_norm": 0.16697801649570465, "kl": 0.04217529296875, "learning_rate": 1.2608678287865383e-08, "loss": 0.00042159855365753174, "memory(GiB)": 38.15, "reward": 0.7620814442634583, "reward_std": 0.05168711021542549, "rewards/VisualizationJSONCombinedORM/mean": 0.7620814442634583, "rewards/VisualizationJSONCombinedORM/std": 0.0593082569539547, "step": 5922, "train_speed(iter/s)": 0.06649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 315.3125, "completions/min_length": 235.0, "epoch": 4.899090157154673, "grad_norm": 0.19129984080791473, "kl": 0.1317138671875, "learning_rate": 1.2404577010123875e-08, "loss": 0.0013185739517211914, "memory(GiB)": 38.15, "reward": 0.5835321545600891, "reward_std": 0.07798057049512863, "rewards/VisualizationJSONCombinedORM/mean": 0.5835321545600891, "rewards/VisualizationJSONCombinedORM/std": 0.12076186388731003, "step": 5923, "train_speed(iter/s)": 0.06648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 344.9375, "completions/min_length": 279.0, "epoch": 4.899917287014061, "grad_norm": 0.1814335137605667, "kl": 0.06500244140625, "learning_rate": 1.2202139118802636e-08, "loss": 0.0006501562893390656, "memory(GiB)": 38.15, "reward": 0.5397080183029175, "reward_std": 0.056979626417160034, "rewards/VisualizationJSONCombinedORM/mean": 0.5397080183029175, "rewards/VisualizationJSONCombinedORM/std": 0.10238857567310333, "step": 5924, "train_speed(iter/s)": 0.066472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 345.0625, "completions/min_length": 230.0, "epoch": 4.900744416873449, "grad_norm": 0.20501954853534698, "kl": 0.1065673828125, "learning_rate": 1.200136468141544e-08, "loss": 0.0010658949613571167, "memory(GiB)": 38.15, "reward": 0.5214368104934692, "reward_std": 0.044032447040081024, "rewards/VisualizationJSONCombinedORM/mean": 0.5214368104934692, "rewards/VisualizationJSONCombinedORM/std": 0.31554993987083435, "step": 5925, "train_speed(iter/s)": 0.066457 }, { "epoch": 4.900744416873449, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 372.2916666666667, "eval_completions/mean_length": 312.8958333333333, "eval_completions/min_length": 262.0833333333333, "eval_kl": 0.09276326497395833, "eval_loss": 0.0009238123893737793, "eval_reward": 0.4505008328706026, "eval_reward_std": 0.05910702385396386, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4505008328706026, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05910702364053577, "eval_runtime": 316.4639, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 5925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 307.125, "completions/min_length": 238.0, "epoch": 4.901571546732837, "grad_norm": 0.2411877065896988, "kl": 0.0411376953125, "learning_rate": 1.18022537649215e-08, "loss": 0.0004124939441680908, "memory(GiB)": 38.15, "reward": 0.4287819266319275, "reward_std": 0.0390804298222065, "rewards/VisualizationJSONCombinedORM/mean": 0.4287819266319275, "rewards/VisualizationJSONCombinedORM/std": 0.09129684418439865, "step": 5926, "train_speed(iter/s)": 0.066216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 299.625, "completions/min_length": 254.0, "epoch": 4.902398676592225, "grad_norm": 0.21704792976379395, "kl": 0.0576171875, "learning_rate": 1.1604806435726035e-08, "loss": 0.0005750339478254318, "memory(GiB)": 38.15, "reward": 0.6704164147377014, "reward_std": 0.08799372613430023, "rewards/VisualizationJSONCombinedORM/mean": 0.6704164147377014, "rewards/VisualizationJSONCombinedORM/std": 0.10188388079404831, "step": 5927, "train_speed(iter/s)": 0.066208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 308.875, "completions/min_length": 242.0, "epoch": 4.903225806451613, "grad_norm": 0.18947188556194305, "kl": 0.1292724609375, "learning_rate": 1.1409022759678034e-08, "loss": 0.001292504370212555, "memory(GiB)": 38.15, "reward": 0.6378132104873657, "reward_std": 0.048389144241809845, "rewards/VisualizationJSONCombinedORM/mean": 0.6378132104873657, "rewards/VisualizationJSONCombinedORM/std": 0.06532464921474457, "step": 5928, "train_speed(iter/s)": 0.0662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 285.0, "completions/min_length": 217.0, "epoch": 4.904052936311, "grad_norm": 0.1621691733598709, "kl": 0.08685302734375, "learning_rate": 1.1214902802072491e-08, "loss": 0.0008691810071468353, "memory(GiB)": 38.15, "reward": 0.7454319000244141, "reward_std": 0.03865163400769234, "rewards/VisualizationJSONCombinedORM/mean": 0.7454319000244141, "rewards/VisualizationJSONCombinedORM/std": 0.04451275244355202, "step": 5929, "train_speed(iter/s)": 0.066194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 319.5625, "completions/min_length": 240.0, "epoch": 4.904880066170389, "grad_norm": 0.18969808518886566, "kl": 0.054412841796875, "learning_rate": 1.1022446627649286e-08, "loss": 0.0005451031029224396, "memory(GiB)": 38.15, "reward": 0.6699355840682983, "reward_std": 0.04959847405552864, "rewards/VisualizationJSONCombinedORM/mean": 0.6699355840682983, "rewards/VisualizationJSONCombinedORM/std": 0.14635604619979858, "step": 5930, "train_speed(iter/s)": 0.066184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 317.4375, "completions/min_length": 235.0, "epoch": 4.905707196029777, "grad_norm": 0.21324558556079865, "kl": 0.069580078125, "learning_rate": 1.0831654300593186e-08, "loss": 0.0006972849369049072, "memory(GiB)": 38.15, "reward": 0.575011670589447, "reward_std": 0.039898697286844254, "rewards/VisualizationJSONCombinedORM/mean": 0.575011670589447, "rewards/VisualizationJSONCombinedORM/std": 0.2582721710205078, "step": 5931, "train_speed(iter/s)": 0.066174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 319.5625, "completions/min_length": 242.0, "epoch": 4.906534325889164, "grad_norm": 0.1459425985813141, "kl": 0.113525390625, "learning_rate": 1.0642525884534405e-08, "loss": 0.0011345910606905818, "memory(GiB)": 38.15, "reward": 0.5720921754837036, "reward_std": 0.02998715080320835, "rewards/VisualizationJSONCombinedORM/mean": 0.5720921754837036, "rewards/VisualizationJSONCombinedORM/std": 0.09241494536399841, "step": 5932, "train_speed(iter/s)": 0.066163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 327.5, "completions/min_length": 261.0, "epoch": 4.9073614557485525, "grad_norm": 0.20343270897865295, "kl": 0.111083984375, "learning_rate": 1.0455061442548597e-08, "loss": 0.001111733727157116, "memory(GiB)": 38.15, "reward": 0.5118834972381592, "reward_std": 0.051215093582868576, "rewards/VisualizationJSONCombinedORM/mean": 0.5118834972381592, "rewards/VisualizationJSONCombinedORM/std": 0.22322002053260803, "step": 5933, "train_speed(iter/s)": 0.066158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 314.875, "completions/min_length": 255.0, "epoch": 4.908188585607941, "grad_norm": 0.3578415811061859, "kl": 0.080810546875, "learning_rate": 1.0269261037155197e-08, "loss": 0.000807229895144701, "memory(GiB)": 38.15, "reward": 0.5388505458831787, "reward_std": 0.07840759307146072, "rewards/VisualizationJSONCombinedORM/mean": 0.5388505458831787, "rewards/VisualizationJSONCombinedORM/std": 0.12327093631029129, "step": 5934, "train_speed(iter/s)": 0.066148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 332.125, "completions/min_length": 269.0, "epoch": 4.909015715467328, "grad_norm": 0.17229487001895905, "kl": 0.09967041015625, "learning_rate": 1.008512473032075e-08, "loss": 0.0009967666119337082, "memory(GiB)": 38.15, "reward": 0.4529576897621155, "reward_std": 0.04069436341524124, "rewards/VisualizationJSONCombinedORM/mean": 0.4529576897621155, "rewards/VisualizationJSONCombinedORM/std": 0.0836096927523613, "step": 5935, "train_speed(iter/s)": 0.066141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 285.9375, "completions/min_length": 227.0, "epoch": 4.909842845326716, "grad_norm": 0.18849769234657288, "kl": 0.07781982421875, "learning_rate": 9.902652583454463e-09, "loss": 0.0007771477103233337, "memory(GiB)": 38.15, "reward": 0.41610056161880493, "reward_std": 0.03850685432553291, "rewards/VisualizationJSONCombinedORM/mean": 0.41610056161880493, "rewards/VisualizationJSONCombinedORM/std": 0.05261693149805069, "step": 5936, "train_speed(iter/s)": 0.066135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 310.375, "completions/min_length": 216.0, "epoch": 4.910669975186105, "grad_norm": 0.3774755001068115, "kl": 0.106201171875, "learning_rate": 9.72184465741155e-09, "loss": 0.0010648518800735474, "memory(GiB)": 38.15, "reward": 0.531516432762146, "reward_std": 0.0774339959025383, "rewards/VisualizationJSONCombinedORM/mean": 0.531516432762146, "rewards/VisualizationJSONCombinedORM/std": 0.1244133859872818, "step": 5937, "train_speed(iter/s)": 0.066127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 323.6875, "completions/min_length": 259.0, "epoch": 4.911497105045492, "grad_norm": 0.1804387867450714, "kl": 0.051513671875, "learning_rate": 9.542701012493771e-09, "loss": 0.0005152225494384766, "memory(GiB)": 38.15, "reward": 0.6520551443099976, "reward_std": 0.0807214081287384, "rewards/VisualizationJSONCombinedORM/mean": 0.6520551443099976, "rewards/VisualizationJSONCombinedORM/std": 0.08595827221870422, "step": 5938, "train_speed(iter/s)": 0.06612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 315.0, "completions/min_length": 259.0, "epoch": 4.91232423490488, "grad_norm": 0.20775499939918518, "kl": 0.1292724609375, "learning_rate": 9.365221708445006e-09, "loss": 0.001291818916797638, "memory(GiB)": 38.15, "reward": 0.488960325717926, "reward_std": 0.04140974208712578, "rewards/VisualizationJSONCombinedORM/mean": 0.488960325717926, "rewards/VisualizationJSONCombinedORM/std": 0.17692235112190247, "step": 5939, "train_speed(iter/s)": 0.066114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 314.5, "completions/min_length": 251.0, "epoch": 4.913151364764268, "grad_norm": 0.19348293542861938, "kl": 0.0634765625, "learning_rate": 9.18940680445568e-09, "loss": 0.0006339550018310547, "memory(GiB)": 38.15, "reward": 0.5971238613128662, "reward_std": 0.06100628525018692, "rewards/VisualizationJSONCombinedORM/mean": 0.5971238613128662, "rewards/VisualizationJSONCombinedORM/std": 0.09155116230249405, "step": 5940, "train_speed(iter/s)": 0.066108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 307.5625, "completions/min_length": 224.0, "epoch": 4.913978494623656, "grad_norm": 0.18368136882781982, "kl": 0.04644775390625, "learning_rate": 9.015256359161118e-09, "loss": 0.00046375393867492676, "memory(GiB)": 38.15, "reward": 0.7410786747932434, "reward_std": 0.08526947349309921, "rewards/VisualizationJSONCombinedORM/mean": 0.7410786747932434, "rewards/VisualizationJSONCombinedORM/std": 0.08360495418310165, "step": 5941, "train_speed(iter/s)": 0.066097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 289.75, "completions/min_length": 206.0, "epoch": 4.914805624483044, "grad_norm": 0.152792289853096, "kl": 0.03863525390625, "learning_rate": 8.842770430640968e-09, "loss": 0.0003847479820251465, "memory(GiB)": 38.15, "reward": 0.6427048444747925, "reward_std": 0.030617257580161095, "rewards/VisualizationJSONCombinedORM/mean": 0.6427048444747925, "rewards/VisualizationJSONCombinedORM/std": 0.21150854229927063, "step": 5942, "train_speed(iter/s)": 0.066089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 295.5, "completions/min_length": 238.0, "epoch": 4.915632754342432, "grad_norm": 0.18678948283195496, "kl": 0.08074951171875, "learning_rate": 8.671949076420883e-09, "loss": 0.0008072778582572937, "memory(GiB)": 38.15, "reward": 0.5686328411102295, "reward_std": 0.07160129398107529, "rewards/VisualizationJSONCombinedORM/mean": 0.5686328411102295, "rewards/VisualizationJSONCombinedORM/std": 0.07364167273044586, "step": 5943, "train_speed(iter/s)": 0.066083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 290.0625, "completions/min_length": 229.0, "epoch": 4.91645988420182, "grad_norm": 0.20757202804088593, "kl": 0.0543212890625, "learning_rate": 8.50279235346918e-09, "loss": 0.0005420930683612823, "memory(GiB)": 38.15, "reward": 0.5752362012863159, "reward_std": 0.04219191521406174, "rewards/VisualizationJSONCombinedORM/mean": 0.5752362012863159, "rewards/VisualizationJSONCombinedORM/std": 0.18175877630710602, "step": 5944, "train_speed(iter/s)": 0.066076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 338.0625, "completions/min_length": 247.0, "epoch": 4.917287014061207, "grad_norm": 0.2822508215904236, "kl": 0.1265869140625, "learning_rate": 8.335300318201844e-09, "loss": 0.001266341656446457, "memory(GiB)": 38.15, "reward": 0.7767603397369385, "reward_std": 0.07629885524511337, "rewards/VisualizationJSONCombinedORM/mean": 0.7767603397369385, "rewards/VisualizationJSONCombinedORM/std": 0.10033752769231796, "step": 5945, "train_speed(iter/s)": 0.066062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 289.6875, "completions/min_length": 235.0, "epoch": 4.9181141439205955, "grad_norm": 0.36567214131355286, "kl": 0.12823486328125, "learning_rate": 8.169473026478082e-09, "loss": 0.0012826472520828247, "memory(GiB)": 38.15, "reward": 0.6965160965919495, "reward_std": 0.14069534838199615, "rewards/VisualizationJSONCombinedORM/mean": 0.6965160965919495, "rewards/VisualizationJSONCombinedORM/std": 0.137437105178833, "step": 5946, "train_speed(iter/s)": 0.066056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 332.3125, "completions/min_length": 249.0, "epoch": 4.918941273779984, "grad_norm": 0.1770542412996292, "kl": 0.042236328125, "learning_rate": 8.005310533600875e-09, "loss": 0.00042227283120155334, "memory(GiB)": 38.15, "reward": 0.698647141456604, "reward_std": 0.05972790718078613, "rewards/VisualizationJSONCombinedORM/mean": 0.698647141456604, "rewards/VisualizationJSONCombinedORM/std": 0.08713214099407196, "step": 5947, "train_speed(iter/s)": 0.066043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 322.0, "completions/min_length": 258.0, "epoch": 4.919768403639371, "grad_norm": 0.18113429844379425, "kl": 0.109375, "learning_rate": 7.842812894320318e-09, "loss": 0.0010945852845907211, "memory(GiB)": 38.15, "reward": 0.7689310312271118, "reward_std": 0.05542171746492386, "rewards/VisualizationJSONCombinedORM/mean": 0.7689310312271118, "rewards/VisualizationJSONCombinedORM/std": 0.05542837455868721, "step": 5948, "train_speed(iter/s)": 0.066034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 287.8125, "completions/min_length": 242.0, "epoch": 4.920595533498759, "grad_norm": 0.17603787779808044, "kl": 0.04913330078125, "learning_rate": 7.681980162830283e-09, "loss": 0.0004918687045574188, "memory(GiB)": 38.15, "reward": 0.47830814123153687, "reward_std": 0.04563041031360626, "rewards/VisualizationJSONCombinedORM/mean": 0.47830814123153687, "rewards/VisualizationJSONCombinedORM/std": 0.1576889306306839, "step": 5949, "train_speed(iter/s)": 0.066028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 329.4375, "completions/min_length": 247.0, "epoch": 4.921422663358147, "grad_norm": 0.20463576912879944, "kl": 0.06036376953125, "learning_rate": 7.52281239276842e-09, "loss": 0.000605463981628418, "memory(GiB)": 38.15, "reward": 0.4883025884628296, "reward_std": 0.05510155111551285, "rewards/VisualizationJSONCombinedORM/mean": 0.4883025884628296, "rewards/VisualizationJSONCombinedORM/std": 0.21393033862113953, "step": 5950, "train_speed(iter/s)": 0.066017 }, { "epoch": 4.921422663358147, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 376.4166666666667, "eval_completions/mean_length": 312.5885416666667, "eval_completions/min_length": 257.25, "eval_kl": 0.07446797688802083, "eval_loss": 0.0007497121696360409, "eval_reward": 0.4632231829067071, "eval_reward_std": 0.050334679312072694, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4632231829067071, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05033468132993827, "eval_runtime": 318.8141, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 5950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 316.9375, "completions/min_length": 247.0, "epoch": 4.922249793217535, "grad_norm": 0.18557579815387726, "kl": 0.07373046875, "learning_rate": 7.3653096372178216e-09, "loss": 0.000737898051738739, "memory(GiB)": 38.15, "reward": 0.3329457938671112, "reward_std": 0.04243484139442444, "rewards/VisualizationJSONCombinedORM/mean": 0.3329457938671112, "rewards/VisualizationJSONCombinedORM/std": 0.04777035117149353, "step": 5951, "train_speed(iter/s)": 0.065774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 301.125, "completions/min_length": 246.0, "epoch": 4.923076923076923, "grad_norm": 0.2273872196674347, "kl": 0.0623779296875, "learning_rate": 7.20947194870758e-09, "loss": 0.000624045729637146, "memory(GiB)": 38.15, "reward": 0.5856391191482544, "reward_std": 0.04134954512119293, "rewards/VisualizationJSONCombinedORM/mean": 0.5856391191482544, "rewards/VisualizationJSONCombinedORM/std": 0.1145055815577507, "step": 5952, "train_speed(iter/s)": 0.065765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 329.125, "completions/min_length": 247.0, "epoch": 4.923904052936311, "grad_norm": 0.19876570999622345, "kl": 0.072509765625, "learning_rate": 7.055299379208902e-09, "loss": 0.0007247254252433777, "memory(GiB)": 38.15, "reward": 0.5244550704956055, "reward_std": 0.08272214978933334, "rewards/VisualizationJSONCombinedORM/mean": 0.5244550704956055, "rewards/VisualizationJSONCombinedORM/std": 0.09970133751630783, "step": 5953, "train_speed(iter/s)": 0.065757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 321.75, "completions/min_length": 271.0, "epoch": 4.924731182795699, "grad_norm": 0.21138526499271393, "kl": 0.107666015625, "learning_rate": 6.9027919801401e-09, "loss": 0.0010768547654151917, "memory(GiB)": 38.15, "reward": 0.5198566317558289, "reward_std": 0.05348363518714905, "rewards/VisualizationJSONCombinedORM/mean": 0.5198566317558289, "rewards/VisualizationJSONCombinedORM/std": 0.11800899356603622, "step": 5954, "train_speed(iter/s)": 0.06575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 314.625, "completions/min_length": 256.0, "epoch": 4.925558312655086, "grad_norm": 0.2288721203804016, "kl": 0.0924072265625, "learning_rate": 6.751949802362712e-09, "loss": 0.0009242501109838486, "memory(GiB)": 38.15, "reward": 0.4133181869983673, "reward_std": 0.040777381509542465, "rewards/VisualizationJSONCombinedORM/mean": 0.4133181869983673, "rewards/VisualizationJSONCombinedORM/std": 0.21782159805297852, "step": 5955, "train_speed(iter/s)": 0.065745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 335.5625, "completions/min_length": 239.0, "epoch": 4.926385442514475, "grad_norm": 0.20536838471889496, "kl": 0.09393310546875, "learning_rate": 6.602772896183163e-09, "loss": 0.0009403415024280548, "memory(GiB)": 38.15, "reward": 0.4243166148662567, "reward_std": 0.053183384239673615, "rewards/VisualizationJSONCombinedORM/mean": 0.4243166148662567, "rewards/VisualizationJSONCombinedORM/std": 0.06987938284873962, "step": 5956, "train_speed(iter/s)": 0.065737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 298.5625, "completions/min_length": 233.0, "epoch": 4.927212572373863, "grad_norm": 0.20052595436573029, "kl": 0.11883544921875, "learning_rate": 6.455261311352767e-09, "loss": 0.0011841580271720886, "memory(GiB)": 38.15, "reward": 0.31426721811294556, "reward_std": 0.03079169988632202, "rewards/VisualizationJSONCombinedORM/mean": 0.31426721811294556, "rewards/VisualizationJSONCombinedORM/std": 0.07276008278131485, "step": 5957, "train_speed(iter/s)": 0.065731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 324.75, "completions/min_length": 258.0, "epoch": 4.92803970223325, "grad_norm": 0.24749237298965454, "kl": 0.0679931640625, "learning_rate": 6.309415097067728e-09, "loss": 0.0006784778088331223, "memory(GiB)": 38.15, "reward": 0.4346303343772888, "reward_std": 0.04027874022722244, "rewards/VisualizationJSONCombinedORM/mean": 0.4346303343772888, "rewards/VisualizationJSONCombinedORM/std": 0.10795651376247406, "step": 5958, "train_speed(iter/s)": 0.065724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 299.0625, "completions/min_length": 255.0, "epoch": 4.9288668320926385, "grad_norm": 0.2687141001224518, "kl": 0.04400634765625, "learning_rate": 6.165234301967471e-09, "loss": 0.00043926388025283813, "memory(GiB)": 38.15, "reward": 0.5776462554931641, "reward_std": 0.06508257985115051, "rewards/VisualizationJSONCombinedORM/mean": 0.5776462554931641, "rewards/VisualizationJSONCombinedORM/std": 0.18305416405200958, "step": 5959, "train_speed(iter/s)": 0.065719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 289.5, "completions/min_length": 238.0, "epoch": 4.929693961952027, "grad_norm": 0.1819581240415573, "kl": 0.0894775390625, "learning_rate": 6.022718974137976e-09, "loss": 0.0008954405784606934, "memory(GiB)": 38.15, "reward": 0.6755402088165283, "reward_std": 0.053897708654403687, "rewards/VisualizationJSONCombinedORM/mean": 0.6755402088165283, "rewards/VisualizationJSONCombinedORM/std": 0.14156274497509003, "step": 5960, "train_speed(iter/s)": 0.065713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 333.8125, "completions/min_length": 246.0, "epoch": 4.930521091811414, "grad_norm": 0.19016289710998535, "kl": 0.06549072265625, "learning_rate": 5.881869161108444e-09, "loss": 0.0006554722785949707, "memory(GiB)": 38.15, "reward": 0.5551018714904785, "reward_std": 0.04932006448507309, "rewards/VisualizationJSONCombinedORM/mean": 0.5551018714904785, "rewards/VisualizationJSONCombinedORM/std": 0.16266009211540222, "step": 5961, "train_speed(iter/s)": 0.065706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 301.0, "completions/min_length": 230.0, "epoch": 4.9313482216708024, "grad_norm": 0.20879334211349487, "kl": 0.03997802734375, "learning_rate": 5.7426849098529695e-09, "loss": 0.00039906054735183716, "memory(GiB)": 38.15, "reward": 0.48714086413383484, "reward_std": 0.059114161878824234, "rewards/VisualizationJSONCombinedORM/mean": 0.48714086413383484, "rewards/VisualizationJSONCombinedORM/std": 0.08947864919900894, "step": 5962, "train_speed(iter/s)": 0.065699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/mean_length": 330.875, "completions/min_length": 245.0, "epoch": 4.932175351530191, "grad_norm": 0.17482897639274597, "kl": 0.11962890625, "learning_rate": 5.605166266789419e-09, "loss": 0.001195266842842102, "memory(GiB)": 38.15, "reward": 0.744686484336853, "reward_std": 0.04939000681042671, "rewards/VisualizationJSONCombinedORM/mean": 0.744686484336853, "rewards/VisualizationJSONCombinedORM/std": 0.0966683179140091, "step": 5963, "train_speed(iter/s)": 0.065692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 303.8125, "completions/min_length": 236.0, "epoch": 4.933002481389578, "grad_norm": 0.1555386632680893, "kl": 0.1185302734375, "learning_rate": 5.469313277782218e-09, "loss": 0.0011857934296131134, "memory(GiB)": 38.15, "reward": 0.5134277939796448, "reward_std": 0.03687996417284012, "rewards/VisualizationJSONCombinedORM/mean": 0.5134277939796448, "rewards/VisualizationJSONCombinedORM/std": 0.11388834565877914, "step": 5964, "train_speed(iter/s)": 0.065685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 292.4375, "completions/min_length": 225.0, "epoch": 4.933829611248966, "grad_norm": 0.2543942630290985, "kl": 0.0877685546875, "learning_rate": 5.3351259881379016e-09, "loss": 0.0008761957287788391, "memory(GiB)": 38.15, "reward": 0.566594123840332, "reward_std": 0.08219832181930542, "rewards/VisualizationJSONCombinedORM/mean": 0.566594123840332, "rewards/VisualizationJSONCombinedORM/std": 0.22481770813465118, "step": 5965, "train_speed(iter/s)": 0.065681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 266.875, "completions/min_length": 214.0, "epoch": 4.934656741108354, "grad_norm": 0.21457697451114655, "kl": 0.04620361328125, "learning_rate": 5.202604442609005e-09, "loss": 0.00046300143003463745, "memory(GiB)": 38.15, "reward": 0.47949931025505066, "reward_std": 0.06431250274181366, "rewards/VisualizationJSONCombinedORM/mean": 0.47949931025505066, "rewards/VisualizationJSONCombinedORM/std": 0.1714470088481903, "step": 5966, "train_speed(iter/s)": 0.065674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 318.6875, "completions/min_length": 255.0, "epoch": 4.935483870967742, "grad_norm": 0.1717083752155304, "kl": 0.030029296875, "learning_rate": 5.0717486853918415e-09, "loss": 0.00030040740966796875, "memory(GiB)": 38.15, "reward": 0.5723631381988525, "reward_std": 0.0658712089061737, "rewards/VisualizationJSONCombinedORM/mean": 0.5723631381988525, "rewards/VisualizationJSONCombinedORM/std": 0.06469053030014038, "step": 5967, "train_speed(iter/s)": 0.065664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 335.25, "completions/min_length": 267.0, "epoch": 4.93631100082713, "grad_norm": 0.18694375455379486, "kl": 0.0633544921875, "learning_rate": 4.942558760128169e-09, "loss": 0.0006333328783512115, "memory(GiB)": 38.15, "reward": 0.5851283073425293, "reward_std": 0.07722242176532745, "rewards/VisualizationJSONCombinedORM/mean": 0.5851283073425293, "rewards/VisualizationJSONCombinedORM/std": 0.1353408545255661, "step": 5968, "train_speed(iter/s)": 0.065655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 315.4375, "completions/min_length": 247.0, "epoch": 4.937138130686518, "grad_norm": 0.2638629972934723, "kl": 0.14373779296875, "learning_rate": 4.815034709902411e-09, "loss": 0.0014326348900794983, "memory(GiB)": 38.15, "reward": 0.29731956124305725, "reward_std": 0.033733341842889786, "rewards/VisualizationJSONCombinedORM/mean": 0.29731956124305725, "rewards/VisualizationJSONCombinedORM/std": 0.09592489153146744, "step": 5969, "train_speed(iter/s)": 0.065647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 322.0625, "completions/min_length": 242.0, "epoch": 4.937965260545906, "grad_norm": 0.1776835024356842, "kl": 0.037933349609375, "learning_rate": 4.689176577244992e-09, "loss": 0.00037993118166923523, "memory(GiB)": 38.15, "reward": 0.5821532011032104, "reward_std": 0.0329495333135128, "rewards/VisualizationJSONCombinedORM/mean": 0.5821532011032104, "rewards/VisualizationJSONCombinedORM/std": 0.26742056012153625, "step": 5970, "train_speed(iter/s)": 0.065638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 324.0, "completions/min_length": 269.0, "epoch": 4.938792390405293, "grad_norm": 0.16204790771007538, "kl": 0.0428466796875, "learning_rate": 4.564984404130113e-09, "loss": 0.0004301890730857849, "memory(GiB)": 38.15, "reward": 0.7115325927734375, "reward_std": 0.011618008837103844, "rewards/VisualizationJSONCombinedORM/mean": 0.7115325927734375, "rewards/VisualizationJSONCombinedORM/std": 0.05426519736647606, "step": 5971, "train_speed(iter/s)": 0.065629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 299.0625, "completions/min_length": 241.0, "epoch": 4.9396195202646815, "grad_norm": 0.22189919650554657, "kl": 0.05560302734375, "learning_rate": 4.44245823197631e-09, "loss": 0.0005554035305976868, "memory(GiB)": 38.15, "reward": 0.4423784017562866, "reward_std": 0.038152288645505905, "rewards/VisualizationJSONCombinedORM/mean": 0.4423784017562866, "rewards/VisualizationJSONCombinedORM/std": 0.19193856418132782, "step": 5972, "train_speed(iter/s)": 0.06562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 300.3125, "completions/min_length": 245.0, "epoch": 4.94044665012407, "grad_norm": 0.23564627766609192, "kl": 0.1346435546875, "learning_rate": 4.321598101647007e-09, "loss": 0.0013479441404342651, "memory(GiB)": 38.15, "reward": 0.6149148941040039, "reward_std": 0.056281983852386475, "rewards/VisualizationJSONCombinedORM/mean": 0.6149148941040039, "rewards/VisualizationJSONCombinedORM/std": 0.06213013082742691, "step": 5973, "train_speed(iter/s)": 0.065612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 333.5625, "completions/min_length": 267.0, "epoch": 4.941273779983457, "grad_norm": 0.17944328486919403, "kl": 0.06512451171875, "learning_rate": 4.202404053449405e-09, "loss": 0.0006517544388771057, "memory(GiB)": 38.15, "reward": 0.5142556428909302, "reward_std": 0.034101832658052444, "rewards/VisualizationJSONCombinedORM/mean": 0.5142556428909302, "rewards/VisualizationJSONCombinedORM/std": 0.30999746918678284, "step": 5974, "train_speed(iter/s)": 0.065604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 314.0625, "completions/min_length": 246.0, "epoch": 4.9421009098428454, "grad_norm": 0.1883568912744522, "kl": 0.07354736328125, "learning_rate": 4.0848761271350405e-09, "loss": 0.0007353890687227249, "memory(GiB)": 38.15, "reward": 0.48586446046829224, "reward_std": 0.05444641411304474, "rewards/VisualizationJSONCombinedORM/mean": 0.48586446046829224, "rewards/VisualizationJSONCombinedORM/std": 0.0964195653796196, "step": 5975, "train_speed(iter/s)": 0.065598 }, { "epoch": 4.9421009098428454, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 374.4583333333333, "eval_completions/mean_length": 311.9791666666667, "eval_completions/min_length": 257.5833333333333, "eval_kl": 0.0810394287109375, "eval_loss": 0.0008208056096918881, "eval_reward": 0.44653211534023285, "eval_reward_std": 0.051555352518334985, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.44653211534023285, "eval_rewards/VisualizationJSONCombinedORM/std": 0.051555353430255, "eval_runtime": 317.6277, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 5975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 313.3125, "completions/min_length": 250.0, "epoch": 4.942928039702233, "grad_norm": 0.19698035717010498, "kl": 0.06976318359375, "learning_rate": 3.969014361900336e-09, "loss": 0.0006957873702049255, "memory(GiB)": 38.15, "reward": 0.741422176361084, "reward_std": 0.08261118829250336, "rewards/VisualizationJSONCombinedORM/mean": 0.741422176361084, "rewards/VisualizationJSONCombinedORM/std": 0.09573325514793396, "step": 5976, "train_speed(iter/s)": 0.065361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 311.5, "completions/min_length": 245.0, "epoch": 4.943755169561621, "grad_norm": 0.20164136588573456, "kl": 0.0543212890625, "learning_rate": 3.854818796385495e-09, "loss": 0.0005433186888694763, "memory(GiB)": 38.15, "reward": 0.5455667972564697, "reward_std": 0.023109402507543564, "rewards/VisualizationJSONCombinedORM/mean": 0.5455667972564697, "rewards/VisualizationJSONCombinedORM/std": 0.21851308643817902, "step": 5977, "train_speed(iter/s)": 0.065354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 310.8125, "completions/min_length": 229.0, "epoch": 4.944582299421009, "grad_norm": 0.21098390221595764, "kl": 0.0784912109375, "learning_rate": 3.742289468675053e-09, "loss": 0.0007839873433113098, "memory(GiB)": 38.15, "reward": 0.43424883484840393, "reward_std": 0.02105897106230259, "rewards/VisualizationJSONCombinedORM/mean": 0.43424883484840393, "rewards/VisualizationJSONCombinedORM/std": 0.07265650480985641, "step": 5978, "train_speed(iter/s)": 0.065348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/mean_length": 310.1875, "completions/min_length": 252.0, "epoch": 4.945409429280397, "grad_norm": 0.17374758422374725, "kl": 0.06396484375, "learning_rate": 3.6314264162989887e-09, "loss": 0.0006401166319847107, "memory(GiB)": 38.15, "reward": 0.33609792590141296, "reward_std": 0.023368995636701584, "rewards/VisualizationJSONCombinedORM/mean": 0.33609792590141296, "rewards/VisualizationJSONCombinedORM/std": 0.02944200113415718, "step": 5979, "train_speed(iter/s)": 0.06534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 316.75, "completions/min_length": 242.0, "epoch": 4.946236559139785, "grad_norm": 0.18800057470798492, "kl": 0.0433349609375, "learning_rate": 3.522229676229949e-09, "loss": 0.0004325695335865021, "memory(GiB)": 38.15, "reward": 0.6942185163497925, "reward_std": 0.07117312401533127, "rewards/VisualizationJSONCombinedORM/mean": 0.6942185163497925, "rewards/VisualizationJSONCombinedORM/std": 0.12626244127750397, "step": 5980, "train_speed(iter/s)": 0.065331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 316.5, "completions/min_length": 231.0, "epoch": 4.947063688999172, "grad_norm": 0.2076060026884079, "kl": 0.05322265625, "learning_rate": 3.41469928488547e-09, "loss": 0.0005323439836502075, "memory(GiB)": 38.15, "reward": 0.5913429856300354, "reward_std": 0.06719876825809479, "rewards/VisualizationJSONCombinedORM/mean": 0.5913429856300354, "rewards/VisualizationJSONCombinedORM/std": 0.1924622505903244, "step": 5981, "train_speed(iter/s)": 0.065322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 292.875, "completions/min_length": 227.0, "epoch": 4.947890818858561, "grad_norm": 0.16735292971134186, "kl": 0.05389404296875, "learning_rate": 3.3088352781279753e-09, "loss": 0.0005385950207710266, "memory(GiB)": 38.15, "reward": 0.3617701530456543, "reward_std": 0.0322822704911232, "rewards/VisualizationJSONCombinedORM/mean": 0.3617701530456543, "rewards/VisualizationJSONCombinedORM/std": 0.061099253594875336, "step": 5982, "train_speed(iter/s)": 0.065316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 324.3125, "completions/min_length": 267.0, "epoch": 4.948717948717949, "grad_norm": 0.18330909311771393, "kl": 0.09295654296875, "learning_rate": 3.2046376912631127e-09, "loss": 0.00093097984790802, "memory(GiB)": 38.15, "reward": 0.37896206974983215, "reward_std": 0.03793054074048996, "rewards/VisualizationJSONCombinedORM/mean": 0.37896206974983215, "rewards/VisualizationJSONCombinedORM/std": 0.05489351972937584, "step": 5983, "train_speed(iter/s)": 0.065306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/mean_length": 344.3125, "completions/min_length": 291.0, "epoch": 4.949545078577336, "grad_norm": 0.18417400121688843, "kl": 0.103271484375, "learning_rate": 3.1021065590414177e-09, "loss": 0.0010314993560314178, "memory(GiB)": 38.15, "reward": 0.5412541627883911, "reward_std": 0.05268063396215439, "rewards/VisualizationJSONCombinedORM/mean": 0.5412541627883911, "rewards/VisualizationJSONCombinedORM/std": 0.17304231226444244, "step": 5984, "train_speed(iter/s)": 0.065297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 314.75, "completions/min_length": 229.0, "epoch": 4.9503722084367245, "grad_norm": 0.17648202180862427, "kl": 0.061279296875, "learning_rate": 3.0012419156572047e-09, "loss": 0.0006134398281574249, "memory(GiB)": 38.15, "reward": 0.6334377527236938, "reward_std": 0.05022510141134262, "rewards/VisualizationJSONCombinedORM/mean": 0.6334377527236938, "rewards/VisualizationJSONCombinedORM/std": 0.08081862330436707, "step": 5985, "train_speed(iter/s)": 0.065291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 331.375, "completions/min_length": 284.0, "epoch": 4.951199338296113, "grad_norm": 0.22045259177684784, "kl": 0.05517578125, "learning_rate": 2.9020437947502313e-09, "loss": 0.0005511678755283356, "memory(GiB)": 38.15, "reward": 0.5904910564422607, "reward_std": 0.05154237151145935, "rewards/VisualizationJSONCombinedORM/mean": 0.5904910564422607, "rewards/VisualizationJSONCombinedORM/std": 0.23645846545696259, "step": 5986, "train_speed(iter/s)": 0.065281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/mean_length": 301.625, "completions/min_length": 215.0, "epoch": 4.9520264681555, "grad_norm": 0.20638418197631836, "kl": 0.073974609375, "learning_rate": 2.8045122294023675e-09, "loss": 0.0007394403219223022, "memory(GiB)": 38.15, "reward": 0.5912182927131653, "reward_std": 0.05060546100139618, "rewards/VisualizationJSONCombinedORM/mean": 0.5912182927131653, "rewards/VisualizationJSONCombinedORM/std": 0.15462026000022888, "step": 5987, "train_speed(iter/s)": 0.065274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 322.8125, "completions/min_length": 255.0, "epoch": 4.9528535980148884, "grad_norm": 0.21438266336917877, "kl": 0.0775146484375, "learning_rate": 2.708647252142038e-09, "loss": 0.0007754471153020859, "memory(GiB)": 38.15, "reward": 0.3763919472694397, "reward_std": 0.045454636216163635, "rewards/VisualizationJSONCombinedORM/mean": 0.3763919472694397, "rewards/VisualizationJSONCombinedORM/std": 0.0627717450261116, "step": 5988, "train_speed(iter/s)": 0.065266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 310.75, "completions/min_length": 230.0, "epoch": 4.953680727874277, "grad_norm": 0.19977353513240814, "kl": 0.05419921875, "learning_rate": 2.6144488949392253e-09, "loss": 0.000542372465133667, "memory(GiB)": 38.15, "reward": 0.5581343770027161, "reward_std": 0.045094817876815796, "rewards/VisualizationJSONCombinedORM/mean": 0.5581343770027161, "rewards/VisualizationJSONCombinedORM/std": 0.06762100756168365, "step": 5989, "train_speed(iter/s)": 0.06526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 314.25, "completions/min_length": 267.0, "epoch": 4.954507857733664, "grad_norm": 0.19153138995170593, "kl": 0.134033203125, "learning_rate": 2.5219171892110207e-09, "loss": 0.0013400502502918243, "memory(GiB)": 38.15, "reward": 0.32322603464126587, "reward_std": 0.027990536764264107, "rewards/VisualizationJSONCombinedORM/mean": 0.32322603464126587, "rewards/VisualizationJSONCombinedORM/std": 0.03815879672765732, "step": 5990, "train_speed(iter/s)": 0.065249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 299.4375, "completions/min_length": 240.0, "epoch": 4.955334987593052, "grad_norm": 0.4725308120250702, "kl": 0.4393310546875, "learning_rate": 2.4310521658160722e-09, "loss": 0.0044095478951931, "memory(GiB)": 38.15, "reward": 0.5588024854660034, "reward_std": 0.06095818802714348, "rewards/VisualizationJSONCombinedORM/mean": 0.5588024854660034, "rewards/VisualizationJSONCombinedORM/std": 0.09435825049877167, "step": 5991, "train_speed(iter/s)": 0.065243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 320.25, "completions/min_length": 271.0, "epoch": 4.95616211745244, "grad_norm": 0.19917316734790802, "kl": 0.0628662109375, "learning_rate": 2.3418538550590285e-09, "loss": 0.000627666711807251, "memory(GiB)": 38.15, "reward": 0.5984646081924438, "reward_std": 0.06181804835796356, "rewards/VisualizationJSONCombinedORM/mean": 0.5984646081924438, "rewards/VisualizationJSONCombinedORM/std": 0.0836649164557457, "step": 5992, "train_speed(iter/s)": 0.065235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/mean_length": 307.1875, "completions/min_length": 241.0, "epoch": 4.956989247311828, "grad_norm": 0.23639258742332458, "kl": 0.0611572265625, "learning_rate": 2.254322286687205e-09, "loss": 0.0006119310855865479, "memory(GiB)": 38.15, "reward": 0.4740666449069977, "reward_std": 0.08826767653226852, "rewards/VisualizationJSONCombinedORM/mean": 0.4740666449069977, "rewards/VisualizationJSONCombinedORM/std": 0.14831626415252686, "step": 5993, "train_speed(iter/s)": 0.065228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 300.625, "completions/min_length": 253.0, "epoch": 4.957816377171216, "grad_norm": 0.18310174345970154, "kl": 0.1077880859375, "learning_rate": 2.168457489893916e-09, "loss": 0.0010771006345748901, "memory(GiB)": 38.15, "reward": 0.5451916456222534, "reward_std": 0.03889115899801254, "rewards/VisualizationJSONCombinedORM/mean": 0.5451916456222534, "rewards/VisualizationJSONCombinedORM/std": 0.16542783379554749, "step": 5994, "train_speed(iter/s)": 0.065221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 334.5, "completions/min_length": 278.0, "epoch": 4.958643507030604, "grad_norm": 0.18254028260707855, "kl": 0.03961181640625, "learning_rate": 2.0842594933140338e-09, "loss": 0.00039676204323768616, "memory(GiB)": 38.15, "reward": 0.6178001165390015, "reward_std": 0.08034737408161163, "rewards/VisualizationJSONCombinedORM/mean": 0.6178001165390015, "rewards/VisualizationJSONCombinedORM/std": 0.16190753877162933, "step": 5995, "train_speed(iter/s)": 0.06521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 319.3125, "completions/min_length": 273.0, "epoch": 4.959470636889992, "grad_norm": 0.18018962442874908, "kl": 0.026397705078125, "learning_rate": 2.001728325028984e-09, "loss": 0.0002642683684825897, "memory(GiB)": 38.15, "reward": 0.6486045122146606, "reward_std": 0.051064301282167435, "rewards/VisualizationJSONCombinedORM/mean": 0.6486045122146606, "rewards/VisualizationJSONCombinedORM/std": 0.23371626436710358, "step": 5996, "train_speed(iter/s)": 0.065204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 297.0625, "completions/min_length": 253.0, "epoch": 4.960297766749379, "grad_norm": 0.13971947133541107, "kl": 0.0731201171875, "learning_rate": 1.9208640125628618e-09, "loss": 0.0007280632853507996, "memory(GiB)": 38.15, "reward": 0.37375837564468384, "reward_std": 0.022347765043377876, "rewards/VisualizationJSONCombinedORM/mean": 0.37375837564468384, "rewards/VisualizationJSONCombinedORM/std": 0.062762551009655, "step": 5997, "train_speed(iter/s)": 0.065197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 324.75, "completions/min_length": 240.0, "epoch": 4.9611248966087675, "grad_norm": 0.18370361626148224, "kl": 0.08441162109375, "learning_rate": 1.8416665828846493e-09, "loss": 0.0008441172540187836, "memory(GiB)": 38.15, "reward": 0.7821418046951294, "reward_std": 0.08842294663190842, "rewards/VisualizationJSONCombinedORM/mean": 0.7821418046951294, "rewards/VisualizationJSONCombinedORM/std": 0.0970340445637703, "step": 5998, "train_speed(iter/s)": 0.065188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 288.75, "completions/min_length": 232.0, "epoch": 4.961952026468156, "grad_norm": 0.25909286737442017, "kl": 0.06072998046875, "learning_rate": 1.7641360624071068e-09, "loss": 0.0006073117256164551, "memory(GiB)": 38.15, "reward": 0.3884740471839905, "reward_std": 0.03652854263782501, "rewards/VisualizationJSONCombinedORM/mean": 0.3884740471839905, "rewards/VisualizationJSONCombinedORM/std": 0.14983858168125153, "step": 5999, "train_speed(iter/s)": 0.065181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 301.375, "completions/min_length": 242.0, "epoch": 4.962779156327543, "grad_norm": 0.27666255831718445, "kl": 0.05902099609375, "learning_rate": 1.688272476986219e-09, "loss": 0.000589326024055481, "memory(GiB)": 38.15, "reward": 0.5064765810966492, "reward_std": 0.05756598711013794, "rewards/VisualizationJSONCombinedORM/mean": 0.5064765810966492, "rewards/VisualizationJSONCombinedORM/std": 0.09193436056375504, "step": 6000, "train_speed(iter/s)": 0.065171 }, { "epoch": 4.962779156327543, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 372.4583333333333, "eval_completions/mean_length": 309.6145833333333, "eval_completions/min_length": 255.58333333333334, "eval_kl": 0.08870442708333333, "eval_loss": 0.0008968642796389759, "eval_reward": 0.4492592637737592, "eval_reward_std": 0.04867793208298584, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4492592637737592, "eval_rewards/VisualizationJSONCombinedORM/std": 0.04867793254864713, "eval_runtime": 316.0338, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.009, "step": 6000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 294.375, "completions/min_length": 247.0, "epoch": 4.9636062861869314, "grad_norm": 0.1575014293193817, "kl": 0.05438232421875, "learning_rate": 1.6140758519239685e-09, "loss": 0.0005438923835754395, "memory(GiB)": 38.15, "reward": 0.7919590473175049, "reward_std": 0.027018729597330093, "rewards/VisualizationJSONCombinedORM/mean": 0.7919590473175049, "rewards/VisualizationJSONCombinedORM/std": 0.036520346999168396, "step": 6001, "train_speed(iter/s)": 0.064941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 296.4375, "completions/min_length": 237.0, "epoch": 4.964433416046319, "grad_norm": 0.212689608335495, "kl": 0.083251953125, "learning_rate": 1.541546211964451e-09, "loss": 0.0008295625448226929, "memory(GiB)": 38.15, "reward": 0.5342814922332764, "reward_std": 0.051157549023628235, "rewards/VisualizationJSONCombinedORM/mean": 0.5342814922332764, "rewards/VisualizationJSONCombinedORM/std": 0.17605246603488922, "step": 6002, "train_speed(iter/s)": 0.064933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 328.5, "completions/min_length": 270.0, "epoch": 4.965260545905707, "grad_norm": 0.18366852402687073, "kl": 0.06597900390625, "learning_rate": 1.4706835812966502e-09, "loss": 0.0006592795252799988, "memory(GiB)": 38.15, "reward": 0.37476468086242676, "reward_std": 0.01726955734193325, "rewards/VisualizationJSONCombinedORM/mean": 0.37476468086242676, "rewards/VisualizationJSONCombinedORM/std": 0.18722587823867798, "step": 6003, "train_speed(iter/s)": 0.064922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 308.4375, "completions/min_length": 245.0, "epoch": 4.966087675765095, "grad_norm": 0.2112223207950592, "kl": 0.101806640625, "learning_rate": 1.4014879835544393e-09, "loss": 0.0010179020464420319, "memory(GiB)": 38.15, "reward": 0.4120720624923706, "reward_std": 0.05932479724287987, "rewards/VisualizationJSONCombinedORM/mean": 0.4120720624923706, "rewards/VisualizationJSONCombinedORM/std": 0.1473277062177658, "step": 6004, "train_speed(iter/s)": 0.064915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 327.5625, "completions/min_length": 254.0, "epoch": 4.966914805624483, "grad_norm": 0.28013327717781067, "kl": 0.1280517578125, "learning_rate": 1.3339594418138036e-09, "loss": 0.0012782365083694458, "memory(GiB)": 38.15, "reward": 0.3937720060348511, "reward_std": 0.029746994376182556, "rewards/VisualizationJSONCombinedORM/mean": 0.3937720060348511, "rewards/VisualizationJSONCombinedORM/std": 0.10195358097553253, "step": 6005, "train_speed(iter/s)": 0.064908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 289.125, "completions/min_length": 229.0, "epoch": 4.967741935483871, "grad_norm": 0.26465314626693726, "kl": 0.06787109375, "learning_rate": 1.2680979785961722e-09, "loss": 0.0006783250719308853, "memory(GiB)": 38.15, "reward": 0.6281099319458008, "reward_std": 0.09723646193742752, "rewards/VisualizationJSONCombinedORM/mean": 0.6281099319458008, "rewards/VisualizationJSONCombinedORM/std": 0.1293727159500122, "step": 6006, "train_speed(iter/s)": 0.064903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 305.5, "completions/min_length": 259.0, "epoch": 4.968569065343259, "grad_norm": 0.22986260056495667, "kl": 0.05218505859375, "learning_rate": 1.2039036158673078e-09, "loss": 0.0005224496126174927, "memory(GiB)": 38.15, "reward": 0.6974358558654785, "reward_std": 0.0788385421037674, "rewards/VisualizationJSONCombinedORM/mean": 0.6974358558654785, "rewards/VisualizationJSONCombinedORM/std": 0.08153865486383438, "step": 6007, "train_speed(iter/s)": 0.064892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 313.8125, "completions/min_length": 251.0, "epoch": 4.969396195202647, "grad_norm": 0.21190592646598816, "kl": 0.155029296875, "learning_rate": 1.141376375035641e-09, "loss": 0.0015511587262153625, "memory(GiB)": 38.15, "reward": 0.36328238248825073, "reward_std": 0.03335600346326828, "rewards/VisualizationJSONCombinedORM/mean": 0.36328238248825073, "rewards/VisualizationJSONCombinedORM/std": 0.12496078759431839, "step": 6008, "train_speed(iter/s)": 0.064886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 315.625, "completions/min_length": 244.0, "epoch": 4.970223325062035, "grad_norm": 0.22969335317611694, "kl": 0.146728515625, "learning_rate": 1.0805162769544908e-09, "loss": 0.0014666654169559479, "memory(GiB)": 38.15, "reward": 0.49527862668037415, "reward_std": 0.06619437783956528, "rewards/VisualizationJSONCombinedORM/mean": 0.49527862668037415, "rewards/VisualizationJSONCombinedORM/std": 0.1020493358373642, "step": 6009, "train_speed(iter/s)": 0.064879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 278.6875, "completions/min_length": 212.0, "epoch": 4.971050454921422, "grad_norm": 0.24027641117572784, "kl": 0.25537109375, "learning_rate": 1.0213233419203994e-09, "loss": 0.0025528259575366974, "memory(GiB)": 38.15, "reward": 0.5456535220146179, "reward_std": 0.09373563528060913, "rewards/VisualizationJSONCombinedORM/mean": 0.5456535220146179, "rewards/VisualizationJSONCombinedORM/std": 0.1347636878490448, "step": 6010, "train_speed(iter/s)": 0.064875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 323.375, "completions/min_length": 263.0, "epoch": 4.9718775847808105, "grad_norm": 0.21979187428951263, "kl": 0.0467529296875, "learning_rate": 9.637975896759077e-10, "loss": 0.00046769389882683754, "memory(GiB)": 38.15, "reward": 0.5993940830230713, "reward_std": 0.06220853701233864, "rewards/VisualizationJSONCombinedORM/mean": 0.5993940830230713, "rewards/VisualizationJSONCombinedORM/std": 0.13998885452747345, "step": 6011, "train_speed(iter/s)": 0.064868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 295.5625, "completions/min_length": 209.0, "epoch": 4.972704714640199, "grad_norm": 0.20420655608177185, "kl": 0.0521240234375, "learning_rate": 9.079390394045596e-10, "loss": 0.0005205906927585602, "memory(GiB)": 38.15, "reward": 0.3769516348838806, "reward_std": 0.03892771154642105, "rewards/VisualizationJSONCombinedORM/mean": 0.3769516348838806, "rewards/VisualizationJSONCombinedORM/std": 0.09375227987766266, "step": 6012, "train_speed(iter/s)": 0.064856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 279.0625, "completions/min_length": 224.0, "epoch": 4.973531844499586, "grad_norm": 0.21817301213741302, "kl": 0.04498291015625, "learning_rate": 8.537477097364522e-10, "loss": 0.0004494413733482361, "memory(GiB)": 38.15, "reward": 0.7760408520698547, "reward_std": 0.050069406628608704, "rewards/VisualizationJSONCombinedORM/mean": 0.7760408520698547, "rewards/VisualizationJSONCombinedORM/std": 0.051365263760089874, "step": 6013, "train_speed(iter/s)": 0.064848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 271.1875, "completions/min_length": 225.0, "epoch": 4.9743589743589745, "grad_norm": 0.1721685826778412, "kl": 0.1131591796875, "learning_rate": 8.012236187443511e-10, "loss": 0.0011330097913742065, "memory(GiB)": 38.15, "reward": 0.5256131887435913, "reward_std": 0.04362773150205612, "rewards/VisualizationJSONCombinedORM/mean": 0.5256131887435913, "rewards/VisualizationJSONCombinedORM/std": 0.17436015605926514, "step": 6014, "train_speed(iter/s)": 0.064843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 282.6875, "completions/min_length": 227.0, "epoch": 4.975186104218363, "grad_norm": 0.18368709087371826, "kl": 0.06243896484375, "learning_rate": 7.503667839453555e-10, "loss": 0.0006243474781513214, "memory(GiB)": 38.15, "reward": 0.784182071685791, "reward_std": 0.071881502866745, "rewards/VisualizationJSONCombinedORM/mean": 0.784182071685791, "rewards/VisualizationJSONCombinedORM/std": 0.0943346619606018, "step": 6015, "train_speed(iter/s)": 0.064832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 297.3125, "completions/min_length": 229.0, "epoch": 4.97601323407775, "grad_norm": 0.23631933331489563, "kl": 0.06219482421875, "learning_rate": 7.011772223003421e-10, "loss": 0.0006206296384334564, "memory(GiB)": 38.15, "reward": 0.558529257774353, "reward_std": 0.042381562292575836, "rewards/VisualizationJSONCombinedORM/mean": 0.558529257774353, "rewards/VisualizationJSONCombinedORM/std": 0.09515970200300217, "step": 6016, "train_speed(iter/s)": 0.064827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 316.375, "completions/min_length": 256.0, "epoch": 4.976840363937138, "grad_norm": 0.29438525438308716, "kl": 0.0628662109375, "learning_rate": 6.536549502145218e-10, "loss": 0.0006310120224952698, "memory(GiB)": 38.15, "reward": 0.4954977035522461, "reward_std": 0.062388233840465546, "rewards/VisualizationJSONCombinedORM/mean": 0.4954977035522461, "rewards/VisualizationJSONCombinedORM/std": 0.0978415459394455, "step": 6017, "train_speed(iter/s)": 0.064819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 330.125, "completions/min_length": 258.0, "epoch": 4.977667493796526, "grad_norm": 0.20681560039520264, "kl": 0.0576171875, "learning_rate": 6.077999835363279e-10, "loss": 0.0005742888897657394, "memory(GiB)": 38.15, "reward": 0.4467146098613739, "reward_std": 0.025466768071055412, "rewards/VisualizationJSONCombinedORM/mean": 0.4467146098613739, "rewards/VisualizationJSONCombinedORM/std": 0.300231009721756, "step": 6018, "train_speed(iter/s)": 0.064811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 315.4375, "completions/min_length": 234.0, "epoch": 4.978494623655914, "grad_norm": 0.19996555149555206, "kl": 0.07684326171875, "learning_rate": 5.636123375585279e-10, "loss": 0.0007676780223846436, "memory(GiB)": 38.15, "reward": 0.45414572954177856, "reward_std": 0.05273345857858658, "rewards/VisualizationJSONCombinedORM/mean": 0.45414572954177856, "rewards/VisualizationJSONCombinedORM/std": 0.24822981655597687, "step": 6019, "train_speed(iter/s)": 0.064802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 311.875, "completions/min_length": 235.0, "epoch": 4.979321753515302, "grad_norm": 0.18176253139972687, "kl": 0.0958251953125, "learning_rate": 5.210920270187769e-10, "loss": 0.0009591914713382721, "memory(GiB)": 38.15, "reward": 0.2771017551422119, "reward_std": 0.02199457585811615, "rewards/VisualizationJSONCombinedORM/mean": 0.2771017551422119, "rewards/VisualizationJSONCombinedORM/std": 0.030336804687976837, "step": 6020, "train_speed(iter/s)": 0.064791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 339.6875, "completions/min_length": 280.0, "epoch": 4.98014888337469, "grad_norm": 0.1676860898733139, "kl": 0.052978515625, "learning_rate": 4.802390660968437e-10, "loss": 0.0005293339490890503, "memory(GiB)": 38.15, "reward": 0.5262283086776733, "reward_std": 0.04482901096343994, "rewards/VisualizationJSONCombinedORM/mean": 0.5262283086776733, "rewards/VisualizationJSONCombinedORM/std": 0.10093791037797928, "step": 6021, "train_speed(iter/s)": 0.064783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 320.1875, "completions/min_length": 259.0, "epoch": 4.980976013234078, "grad_norm": 0.15343579649925232, "kl": 0.04864501953125, "learning_rate": 4.4105346841794014e-10, "loss": 0.00048701465129852295, "memory(GiB)": 38.15, "reward": 0.8338021636009216, "reward_std": 0.03475472331047058, "rewards/VisualizationJSONCombinedORM/mean": 0.8338021636009216, "rewards/VisualizationJSONCombinedORM/std": 0.0337323360145092, "step": 6022, "train_speed(iter/s)": 0.064776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 311.8125, "completions/min_length": 234.0, "epoch": 4.981803143093465, "grad_norm": 0.17537204921245575, "kl": 0.04205322265625, "learning_rate": 4.0353524705050164e-10, "loss": 0.00042050331830978394, "memory(GiB)": 38.15, "reward": 0.4192846417427063, "reward_std": 0.03908153623342514, "rewards/VisualizationJSONCombinedORM/mean": 0.4192846417427063, "rewards/VisualizationJSONCombinedORM/std": 0.042888086289167404, "step": 6023, "train_speed(iter/s)": 0.064769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 317.9375, "completions/min_length": 258.0, "epoch": 4.9826302729528535, "grad_norm": 0.22295649349689484, "kl": 0.08740234375, "learning_rate": 3.6768441450729663e-10, "loss": 0.0008720923215150833, "memory(GiB)": 38.15, "reward": 0.4900539517402649, "reward_std": 0.05713490769267082, "rewards/VisualizationJSONCombinedORM/mean": 0.4900539517402649, "rewards/VisualizationJSONCombinedORM/std": 0.18370063602924347, "step": 6024, "train_speed(iter/s)": 0.064765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 300.875, "completions/min_length": 224.0, "epoch": 4.983457402812242, "grad_norm": 0.1875896006822586, "kl": 0.1103515625, "learning_rate": 3.335009827437619e-10, "loss": 0.001104786992073059, "memory(GiB)": 38.15, "reward": 0.3087664246559143, "reward_std": 0.03765741363167763, "rewards/VisualizationJSONCombinedORM/mean": 0.3087664246559143, "rewards/VisualizationJSONCombinedORM/std": 0.11245383322238922, "step": 6025, "train_speed(iter/s)": 0.064753 }, { "epoch": 4.983457402812242, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 379.6666666666667, "eval_completions/mean_length": 315.9166666666667, "eval_completions/min_length": 260.8333333333333, "eval_kl": 0.09649658203125, "eval_loss": 0.0009666917030699551, "eval_reward": 0.4483294704308112, "eval_reward_std": 0.0544489478925243, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.4483294704308112, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05444895115215331, "eval_runtime": 321.7562, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 6025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 313.125, "completions/min_length": 263.0, "epoch": 4.984284532671629, "grad_norm": 0.22521093487739563, "kl": 0.05712890625, "learning_rate": 3.0098496316188776e-10, "loss": 0.0005716979503631592, "memory(GiB)": 38.15, "reward": 0.43780767917633057, "reward_std": 0.04112870991230011, "rewards/VisualizationJSONCombinedORM/mean": 0.43780767917633057, "rewards/VisualizationJSONCombinedORM/std": 0.15923374891281128, "step": 6026, "train_speed(iter/s)": 0.064525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 312.6875, "completions/min_length": 252.0, "epoch": 4.9851116625310175, "grad_norm": 0.15775735676288605, "kl": 0.13720703125, "learning_rate": 2.701363666041124e-10, "loss": 0.0013686791062355042, "memory(GiB)": 38.15, "reward": 0.5717803239822388, "reward_std": 0.0493779256939888, "rewards/VisualizationJSONCombinedORM/mean": 0.5717803239822388, "rewards/VisualizationJSONCombinedORM/std": 0.09555043280124664, "step": 6027, "train_speed(iter/s)": 0.064517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 295.6875, "completions/min_length": 224.0, "epoch": 4.985938792390405, "grad_norm": 0.21194784343242645, "kl": 0.101806640625, "learning_rate": 2.4095520335998266e-10, "loss": 0.001017741858959198, "memory(GiB)": 38.15, "reward": 0.5955581068992615, "reward_std": 0.05639095604419708, "rewards/VisualizationJSONCombinedORM/mean": 0.5955581068992615, "rewards/VisualizationJSONCombinedORM/std": 0.24153490364551544, "step": 6028, "train_speed(iter/s)": 0.06451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/mean_length": 298.4375, "completions/min_length": 232.0, "epoch": 4.986765922249793, "grad_norm": 0.25638246536254883, "kl": 0.06378173828125, "learning_rate": 2.1344148316060352e-10, "loss": 0.0006392449140548706, "memory(GiB)": 38.15, "reward": 0.4587441682815552, "reward_std": 0.10127340257167816, "rewards/VisualizationJSONCombinedORM/mean": 0.4587441682815552, "rewards/VisualizationJSONCombinedORM/std": 0.1241634264588356, "step": 6029, "train_speed(iter/s)": 0.0645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 350.0, "completions/min_length": 263.0, "epoch": 4.987593052109181, "grad_norm": 0.19785821437835693, "kl": 0.1962890625, "learning_rate": 1.8759521518307845e-10, "loss": 0.001964658498764038, "memory(GiB)": 38.15, "reward": 0.3247668147087097, "reward_std": 0.021202536299824715, "rewards/VisualizationJSONCombinedORM/mean": 0.3247668147087097, "rewards/VisualizationJSONCombinedORM/std": 0.05765209347009659, "step": 6030, "train_speed(iter/s)": 0.064494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 329.625, "completions/min_length": 258.0, "epoch": 4.988420181968569, "grad_norm": 0.2047170251607895, "kl": 0.2359619140625, "learning_rate": 1.6341640804606872e-10, "loss": 0.00235661119222641, "memory(GiB)": 38.15, "reward": 0.468467116355896, "reward_std": 0.03840099275112152, "rewards/VisualizationJSONCombinedORM/mean": 0.468467116355896, "rewards/VisualizationJSONCombinedORM/std": 0.07327667623758316, "step": 6031, "train_speed(iter/s)": 0.064487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 327.3125, "completions/min_length": 254.0, "epoch": 4.989247311827957, "grad_norm": 0.17628805339336395, "kl": 0.1104736328125, "learning_rate": 1.4090506981367934e-10, "loss": 0.0011041909456253052, "memory(GiB)": 38.15, "reward": 0.6024926900863647, "reward_std": 0.08594890683889389, "rewards/VisualizationJSONCombinedORM/mean": 0.6024926900863647, "rewards/VisualizationJSONCombinedORM/std": 0.10832291096448898, "step": 6032, "train_speed(iter/s)": 0.064477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 308.0, "completions/min_length": 257.0, "epoch": 4.990074441687345, "grad_norm": 0.17895667254924774, "kl": 0.04632568359375, "learning_rate": 1.200612079937935e-10, "loss": 0.0004630591720342636, "memory(GiB)": 38.15, "reward": 0.6785976886749268, "reward_std": 0.06960998475551605, "rewards/VisualizationJSONCombinedORM/mean": 0.6785976886749268, "rewards/VisualizationJSONCombinedORM/std": 0.07420626282691956, "step": 6033, "train_speed(iter/s)": 0.064469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 306.125, "completions/min_length": 260.0, "epoch": 4.990901571546733, "grad_norm": 0.17672541737556458, "kl": 0.0648193359375, "learning_rate": 1.0088482953751754e-10, "loss": 0.0006475672125816345, "memory(GiB)": 38.15, "reward": 0.37896421551704407, "reward_std": 0.03450150042772293, "rewards/VisualizationJSONCombinedORM/mean": 0.37896421551704407, "rewards/VisualizationJSONCombinedORM/std": 0.09412240982055664, "step": 6034, "train_speed(iter/s)": 0.064461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 330.8125, "completions/min_length": 251.0, "epoch": 4.991728701406121, "grad_norm": 0.4091775417327881, "kl": 0.07196044921875, "learning_rate": 8.337594084084633e-11, "loss": 0.0007194355130195618, "memory(GiB)": 38.15, "reward": 0.37533828616142273, "reward_std": 0.058115530759096146, "rewards/VisualizationJSONCombinedORM/mean": 0.37533828616142273, "rewards/VisualizationJSONCombinedORM/std": 0.05995307117700577, "step": 6035, "train_speed(iter/s)": 0.064447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 313.625, "completions/min_length": 244.0, "epoch": 4.992555831265509, "grad_norm": 0.2386581003665924, "kl": 0.108642578125, "learning_rate": 6.753454774244272e-11, "loss": 0.0010892227292060852, "memory(GiB)": 38.15, "reward": 0.38446417450904846, "reward_std": 0.0452231764793396, "rewards/VisualizationJSONCombinedORM/mean": 0.38446417450904846, "rewards/VisualizationJSONCombinedORM/std": 0.07890524715185165, "step": 6036, "train_speed(iter/s)": 0.064441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 316.9375, "completions/min_length": 235.0, "epoch": 4.9933829611248965, "grad_norm": 0.1882610023021698, "kl": 0.04632568359375, "learning_rate": 5.336065552641323e-11, "loss": 0.0004630237817764282, "memory(GiB)": 38.15, "reward": 0.7189874649047852, "reward_std": 0.042828354984521866, "rewards/VisualizationJSONCombinedORM/mean": 0.7189874649047852, "rewards/VisualizationJSONCombinedORM/std": 0.052006036043167114, "step": 6037, "train_speed(iter/s)": 0.064434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 351.75, "completions/min_length": 260.0, "epoch": 4.994210090984285, "grad_norm": 0.1651468127965927, "kl": 0.0704345703125, "learning_rate": 4.0854268918422145e-11, "loss": 0.0007042661309242249, "memory(GiB)": 38.15, "reward": 0.43391484022140503, "reward_std": 0.04654868692159653, "rewards/VisualizationJSONCombinedORM/mean": 0.43391484022140503, "rewards/VisualizationJSONCombinedORM/std": 0.0817156434059143, "step": 6038, "train_speed(iter/s)": 0.064425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 314.25, "completions/min_length": 254.0, "epoch": 4.995037220843672, "grad_norm": 0.17735229432582855, "kl": 0.04876708984375, "learning_rate": 3.0015392090687603e-11, "loss": 0.00048597902059555054, "memory(GiB)": 38.15, "reward": 0.5019304156303406, "reward_std": 0.03046821802854538, "rewards/VisualizationJSONCombinedORM/mean": 0.5019304156303406, "rewards/VisualizationJSONCombinedORM/std": 0.1490679830312729, "step": 6039, "train_speed(iter/s)": 0.064418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 312.6875, "completions/min_length": 264.0, "epoch": 4.9958643507030605, "grad_norm": 0.17252318561077118, "kl": 0.0802001953125, "learning_rate": 2.084402865754065e-11, "loss": 0.0007989704608917236, "memory(GiB)": 38.15, "reward": 0.5644674301147461, "reward_std": 0.040231600403785706, "rewards/VisualizationJSONCombinedORM/mean": 0.5644674301147461, "rewards/VisualizationJSONCombinedORM/std": 0.08467186987400055, "step": 6040, "train_speed(iter/s)": 0.064407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 306.6875, "completions/min_length": 249.0, "epoch": 4.996691480562449, "grad_norm": 0.32115283608436584, "kl": 0.05865478515625, "learning_rate": 1.3340181677645724e-11, "loss": 0.0005875285714864731, "memory(GiB)": 38.15, "reward": 0.3527697026729584, "reward_std": 0.04750170186161995, "rewards/VisualizationJSONCombinedORM/mean": 0.3527697026729584, "rewards/VisualizationJSONCombinedORM/std": 0.1125858947634697, "step": 6041, "train_speed(iter/s)": 0.064397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 329.125, "completions/min_length": 258.0, "epoch": 4.997518610421836, "grad_norm": 0.21998152136802673, "kl": 0.129638671875, "learning_rate": 7.503853653445525e-12, "loss": 0.00129694864153862, "memory(GiB)": 38.15, "reward": 0.35590100288391113, "reward_std": 0.0510701984167099, "rewards/VisualizationJSONCombinedORM/mean": 0.35590100288391113, "rewards/VisualizationJSONCombinedORM/std": 0.052261821925640106, "step": 6042, "train_speed(iter/s)": 0.064389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 295.3125, "completions/min_length": 220.0, "epoch": 4.998345740281224, "grad_norm": 0.18874408304691315, "kl": 0.09423828125, "learning_rate": 3.335046531716124e-12, "loss": 0.0009422563016414642, "memory(GiB)": 38.15, "reward": 0.42993563413619995, "reward_std": 0.034692030400037766, "rewards/VisualizationJSONCombinedORM/mean": 0.42993563413619995, "rewards/VisualizationJSONCombinedORM/std": 0.19258475303649902, "step": 6043, "train_speed(iter/s)": 0.064385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 317.1875, "completions/min_length": 245.0, "epoch": 4.999172870140612, "grad_norm": 0.2584044635295868, "kl": 0.061767578125, "learning_rate": 8.33761702456748e-13, "loss": 0.0006164908409118652, "memory(GiB)": 38.15, "reward": 0.550040602684021, "reward_std": 0.052717842161655426, "rewards/VisualizationJSONCombinedORM/mean": 0.550040602684021, "rewards/VisualizationJSONCombinedORM/std": 0.22870904207229614, "step": 6044, "train_speed(iter/s)": 0.064376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 315.9375, "completions/min_length": 248.0, "epoch": 5.0, "grad_norm": 0.22514377534389496, "kl": 0.0833740234375, "learning_rate": 0.0, "loss": 0.0008340775966644287, "memory(GiB)": 38.15, "reward": 0.46580827236175537, "reward_std": 0.04544491693377495, "rewards/VisualizationJSONCombinedORM/mean": 0.46580827236175537, "rewards/VisualizationJSONCombinedORM/std": 0.29172518849372864, "step": 6045, "train_speed(iter/s)": 0.06437 }, { "epoch": 5.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 368.1666666666667, "eval_completions/mean_length": 309.3385416666667, "eval_completions/min_length": 259.1666666666667, "eval_kl": 0.08336385091145833, "eval_loss": 0.0008374601602554321, "eval_reward": 0.45094586970905465, "eval_reward_std": 0.05418863537488505, "eval_rewards/VisualizationJSONCombinedORM/mean": 0.45094586970905465, "eval_rewards/VisualizationJSONCombinedORM/std": 0.05418863663605104, "eval_runtime": 314.1559, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 6045 } ], "logging_steps": 1, "max_steps": 6045, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }