trainer_state.json
374.3 KB · 12214 lines · json Raw
1 {
2 "best_metric": null,
3 "best_model_checkpoint": null,
4 "epoch": 10.0,
5 "eval_steps": 500,
6 "global_step": 870,
7 "is_hyper_param_search": false,
8 "is_local_process_zero": true,
9 "is_world_process_zero": true,
10 "log_history": [
11 {
12 "clip_ratio": 0.0,
13 "completion_length": 107.88542175292969,
14 "epoch": 0.011494252873563218,
15 "grad_norm": 1.3213221828105846,
16 "kl": 0.0,
17 "learning_rate": 9.988505747126437e-07,
18 "loss": 0.0,
19 "reward": 1.8197596073150635,
20 "reward_std": 0.06271697580814362,
21 "rewards/accuracy_reward": 0.8243168592453003,
22 "rewards/format_reward": 0.9954427480697632,
23 "step": 1
24 },
25 {
26 "clip_ratio": 0.0,
27 "completion_length": 109.89388275146484,
28 "epoch": 0.022988505747126436,
29 "grad_norm": 1.006446520082249,
30 "kl": 0.000926971435546875,
31 "learning_rate": 9.977011494252872e-07,
32 "loss": 0.0001,
33 "reward": 1.7921943664550781,
34 "reward_std": 0.06994771957397461,
35 "rewards/accuracy_reward": 0.7961006164550781,
36 "rewards/format_reward": 0.99609375,
37 "step": 2
38 },
39 {
40 "clip_ratio": 0.0,
41 "completion_length": 109.96745300292969,
42 "epoch": 0.034482758620689655,
43 "grad_norm": 1.4023786331817765,
44 "kl": 0.0013885498046875,
45 "learning_rate": 9.96551724137931e-07,
46 "loss": 0.0001,
47 "reward": 1.8005397319793701,
48 "reward_std": 0.06632152944803238,
49 "rewards/accuracy_reward": 0.8037948608398438,
50 "rewards/format_reward": 0.9967448115348816,
51 "step": 3
52 },
53 {
54 "clip_ratio": 0.0,
55 "completion_length": 108.48958587646484,
56 "epoch": 0.04597701149425287,
57 "grad_norm": 1.0375104386573222,
58 "kl": 0.00238037109375,
59 "learning_rate": 9.954022988505747e-07,
60 "loss": 0.0001,
61 "reward": 1.801190733909607,
62 "reward_std": 0.06357432901859283,
63 "rewards/accuracy_reward": 0.8044459819793701,
64 "rewards/format_reward": 0.9967448115348816,
65 "step": 4
66 },
67 {
68 "clip_ratio": 0.0,
69 "completion_length": 109.017578125,
70 "epoch": 0.05747126436781609,
71 "grad_norm": 1.2136231517307197,
72 "kl": 0.0023956298828125,
73 "learning_rate": 9.942528735632182e-07,
74 "loss": 0.0001,
75 "reward": 1.7834184169769287,
76 "reward_std": 0.06750813126564026,
77 "rewards/accuracy_reward": 0.7886267900466919,
78 "rewards/format_reward": 0.9947916865348816,
79 "step": 5
80 },
81 {
82 "clip_ratio": 0.0,
83 "completion_length": 111.845703125,
84 "epoch": 0.06896551724137931,
85 "grad_norm": 1.9568797025276756,
86 "kl": 0.003631591796875,
87 "learning_rate": 9.93103448275862e-07,
88 "loss": 0.0001,
89 "reward": 1.816709280014038,
90 "reward_std": 0.06217224523425102,
91 "rewards/accuracy_reward": 0.8199643492698669,
92 "rewards/format_reward": 0.9967448115348816,
93 "step": 6
94 },
95 {
96 "clip_ratio": 0.0,
97 "completion_length": 110.53450775146484,
98 "epoch": 0.08045977011494253,
99 "grad_norm": 1.0517811222524571,
100 "kl": 0.00457763671875,
101 "learning_rate": 9.919540229885057e-07,
102 "loss": 0.0002,
103 "reward": 1.8246958255767822,
104 "reward_std": 0.05271482467651367,
105 "rewards/accuracy_reward": 0.8246957063674927,
106 "rewards/format_reward": 1.0,
107 "step": 7
108 },
109 {
110 "clip_ratio": 0.0,
111 "completion_length": 111.068359375,
112 "epoch": 0.09195402298850575,
113 "grad_norm": 1.1645981671220067,
114 "kl": 0.004638671875,
115 "learning_rate": 9.908045977011493e-07,
116 "loss": 0.0002,
117 "reward": 1.8276481628417969,
118 "reward_std": 0.05832577869296074,
119 "rewards/accuracy_reward": 0.8296012878417969,
120 "rewards/format_reward": 0.998046875,
121 "step": 8
122 },
123 {
124 "clip_ratio": 0.0,
125 "completion_length": 111.41471862792969,
126 "epoch": 0.10344827586206896,
127 "grad_norm": 1.323752481080967,
128 "kl": 0.0045166015625,
129 "learning_rate": 9.89655172413793e-07,
130 "loss": 0.0002,
131 "reward": 1.7714653015136719,
132 "reward_std": 0.05968251824378967,
133 "rewards/accuracy_reward": 0.7734185457229614,
134 "rewards/format_reward": 0.998046875,
135 "step": 9
136 },
137 {
138 "clip_ratio": 0.0,
139 "completion_length": 107.76042175292969,
140 "epoch": 0.11494252873563218,
141 "grad_norm": 1.412974502439727,
142 "kl": 0.00445556640625,
143 "learning_rate": 9.885057471264367e-07,
144 "loss": 0.0002,
145 "reward": 1.8277686834335327,
146 "reward_std": 0.05361395701766014,
147 "rewards/accuracy_reward": 0.8284196853637695,
148 "rewards/format_reward": 0.9993489980697632,
149 "step": 10
150 },
151 {
152 "clip_ratio": 0.0,
153 "completion_length": 107.87630462646484,
154 "epoch": 0.12643678160919541,
155 "grad_norm": 2.075304847500042,
156 "kl": 0.00518798828125,
157 "learning_rate": 9.873563218390805e-07,
158 "loss": 0.0002,
159 "reward": 1.8043527603149414,
160 "reward_std": 0.05421295762062073,
161 "rewards/accuracy_reward": 0.805003821849823,
162 "rewards/format_reward": 0.9993489980697632,
163 "step": 11
164 },
165 {
166 "clip_ratio": 0.0,
167 "completion_length": 108.30989837646484,
168 "epoch": 0.13793103448275862,
169 "grad_norm": 1.4906362546711622,
170 "kl": 0.005401611328125,
171 "learning_rate": 9.86206896551724e-07,
172 "loss": 0.0002,
173 "reward": 1.804455280303955,
174 "reward_std": 0.054727546870708466,
175 "rewards/accuracy_reward": 0.805106520652771,
176 "rewards/format_reward": 0.9993489980697632,
177 "step": 12
178 },
179 {
180 "clip_ratio": 0.0,
181 "completion_length": 106.990234375,
182 "epoch": 0.14942528735632185,
183 "grad_norm": 1.5774307568902088,
184 "kl": 0.007354736328125,
185 "learning_rate": 9.850574712643678e-07,
186 "loss": 0.0003,
187 "reward": 1.8098227977752686,
188 "reward_std": 0.05695592984557152,
189 "rewards/accuracy_reward": 0.8098229169845581,
190 "rewards/format_reward": 1.0,
191 "step": 13
192 },
193 {
194 "clip_ratio": 0.0,
195 "completion_length": 105.994140625,
196 "epoch": 0.16091954022988506,
197 "grad_norm": 1.1798175513976064,
198 "kl": 0.00909423828125,
199 "learning_rate": 9.839080459770115e-07,
200 "loss": 0.0004,
201 "reward": 1.804555892944336,
202 "reward_std": 0.054406534880399704,
203 "rewards/accuracy_reward": 0.8045558929443359,
204 "rewards/format_reward": 1.0,
205 "step": 14
206 },
207 {
208 "clip_ratio": 0.0,
209 "completion_length": 103.54622650146484,
210 "epoch": 0.1724137931034483,
211 "grad_norm": 1.4179734216380988,
212 "kl": 0.00946044921875,
213 "learning_rate": 9.82758620689655e-07,
214 "loss": 0.0004,
215 "reward": 1.842616081237793,
216 "reward_std": 0.04714566469192505,
217 "rewards/accuracy_reward": 0.8432672023773193,
218 "rewards/format_reward": 0.9993489980697632,
219 "step": 15
220 },
221 {
222 "clip_ratio": 0.0,
223 "completion_length": 105.39974212646484,
224 "epoch": 0.1839080459770115,
225 "grad_norm": 4.716568487745124,
226 "kl": 0.01055908203125,
227 "learning_rate": 9.816091954022988e-07,
228 "loss": 0.0004,
229 "reward": 1.7991819381713867,
230 "reward_std": 0.05012369155883789,
231 "rewards/accuracy_reward": 0.7991819381713867,
232 "rewards/format_reward": 1.0,
233 "step": 16
234 },
235 {
236 "clip_ratio": 0.0,
237 "completion_length": 104.38216400146484,
238 "epoch": 0.19540229885057472,
239 "grad_norm": 1.288358836642226,
240 "kl": 0.01361083984375,
241 "learning_rate": 9.804597701149425e-07,
242 "loss": 0.0006,
243 "reward": 1.8223531246185303,
244 "reward_std": 0.05100415274500847,
245 "rewards/accuracy_reward": 0.8223528861999512,
246 "rewards/format_reward": 1.0,
247 "step": 17
248 },
249 {
250 "clip_ratio": 0.0,
251 "completion_length": 100.85612487792969,
252 "epoch": 0.20689655172413793,
253 "grad_norm": 1.4234083835613787,
254 "kl": 0.01214599609375,
255 "learning_rate": 9.79310344827586e-07,
256 "loss": 0.0005,
257 "reward": 1.8194246292114258,
258 "reward_std": 0.04791136831045151,
259 "rewards/accuracy_reward": 0.8200756311416626,
260 "rewards/format_reward": 0.9993489980697632,
261 "step": 18
262 },
263 {
264 "clip_ratio": 0.0,
265 "completion_length": 103.123046875,
266 "epoch": 0.21839080459770116,
267 "grad_norm": 1.4919735757668362,
268 "kl": 0.0126953125,
269 "learning_rate": 9.781609195402298e-07,
270 "loss": 0.0006,
271 "reward": 1.7989381551742554,
272 "reward_std": 0.05323593318462372,
273 "rewards/accuracy_reward": 0.7989381551742554,
274 "rewards/format_reward": 1.0,
275 "step": 19
276 },
277 {
278 "clip_ratio": 0.0,
279 "completion_length": 101.22917175292969,
280 "epoch": 0.22988505747126436,
281 "grad_norm": 1.149828657266046,
282 "kl": 0.012939453125,
283 "learning_rate": 9.770114942528735e-07,
284 "loss": 0.0006,
285 "reward": 1.8272727727890015,
286 "reward_std": 0.04600293189287186,
287 "rewards/accuracy_reward": 0.8272727727890015,
288 "rewards/format_reward": 1.0,
289 "step": 20
290 },
291 {
292 "clip_ratio": 0.0,
293 "completion_length": 102.38411712646484,
294 "epoch": 0.2413793103448276,
295 "grad_norm": 4.856552679666189,
296 "kl": 0.01416015625,
297 "learning_rate": 9.75862068965517e-07,
298 "loss": 0.0006,
299 "reward": 1.8167914152145386,
300 "reward_std": 0.04727660119533539,
301 "rewards/accuracy_reward": 0.816791296005249,
302 "rewards/format_reward": 1.0,
303 "step": 21
304 },
305 {
306 "clip_ratio": 0.0,
307 "completion_length": 102.02734375,
308 "epoch": 0.25287356321839083,
309 "grad_norm": 1.4678458148251887,
310 "kl": 0.0172119140625,
311 "learning_rate": 9.747126436781608e-07,
312 "loss": 0.0007,
313 "reward": 1.8238710165023804,
314 "reward_std": 0.04785631224513054,
315 "rewards/accuracy_reward": 0.8245220184326172,
316 "rewards/format_reward": 0.9993489980697632,
317 "step": 22
318 },
319 {
320 "clip_ratio": 0.0,
321 "completion_length": 101.50521087646484,
322 "epoch": 0.26436781609195403,
323 "grad_norm": 2.388579996164326,
324 "kl": 0.0181884765625,
325 "learning_rate": 9.735632183908046e-07,
326 "loss": 0.0007,
327 "reward": 1.8233423233032227,
328 "reward_std": 0.04244537279009819,
329 "rewards/accuracy_reward": 0.8233422040939331,
330 "rewards/format_reward": 1.0,
331 "step": 23
332 },
333 {
334 "clip_ratio": 0.0,
335 "completion_length": 101.33138275146484,
336 "epoch": 0.27586206896551724,
337 "grad_norm": 1.5276786773430984,
338 "kl": 0.01806640625,
339 "learning_rate": 9.72413793103448e-07,
340 "loss": 0.0008,
341 "reward": 1.837747573852539,
342 "reward_std": 0.041725195944309235,
343 "rewards/accuracy_reward": 0.8383986353874207,
344 "rewards/format_reward": 0.9993489980697632,
345 "step": 24
346 },
347 {
348 "clip_ratio": 0.0,
349 "completion_length": 101.095703125,
350 "epoch": 0.28735632183908044,
351 "grad_norm": 2.321983309381386,
352 "kl": 0.0213623046875,
353 "learning_rate": 9.712643678160918e-07,
354 "loss": 0.0009,
355 "reward": 1.7901400327682495,
356 "reward_std": 0.04634283110499382,
357 "rewards/accuracy_reward": 0.7901400923728943,
358 "rewards/format_reward": 1.0,
359 "step": 25
360 },
361 {
362 "clip_ratio": 0.0,
363 "completion_length": 101.55859375,
364 "epoch": 0.2988505747126437,
365 "grad_norm": 2.351843921213514,
366 "kl": 0.023681640625,
367 "learning_rate": 9.701149425287356e-07,
368 "loss": 0.001,
369 "reward": 1.804998755455017,
370 "reward_std": 0.0408855676651001,
371 "rewards/accuracy_reward": 0.8049987554550171,
372 "rewards/format_reward": 1.0,
373 "step": 26
374 },
375 {
376 "clip_ratio": 0.0,
377 "completion_length": 102.91732025146484,
378 "epoch": 0.3103448275862069,
379 "grad_norm": 1.1953728712395675,
380 "kl": 0.023193359375,
381 "learning_rate": 9.689655172413793e-07,
382 "loss": 0.001,
383 "reward": 1.8027284145355225,
384 "reward_std": 0.04391753673553467,
385 "rewards/accuracy_reward": 0.8033794164657593,
386 "rewards/format_reward": 0.9993489980697632,
387 "step": 27
388 },
389 {
390 "clip_ratio": 0.0,
391 "completion_length": 100.53841400146484,
392 "epoch": 0.3218390804597701,
393 "grad_norm": 1.4473686865312074,
394 "kl": 0.0211181640625,
395 "learning_rate": 9.678160919540228e-07,
396 "loss": 0.0009,
397 "reward": 1.8385992050170898,
398 "reward_std": 0.038850087672472,
399 "rewards/accuracy_reward": 0.8385992050170898,
400 "rewards/format_reward": 1.0,
401 "step": 28
402 },
403 {
404 "clip_ratio": 0.0,
405 "completion_length": 101.99153900146484,
406 "epoch": 0.3333333333333333,
407 "grad_norm": 1.7266188910571618,
408 "kl": 0.02099609375,
409 "learning_rate": 9.666666666666666e-07,
410 "loss": 0.0009,
411 "reward": 1.8396878242492676,
412 "reward_std": 0.03943753242492676,
413 "rewards/accuracy_reward": 0.840338945388794,
414 "rewards/format_reward": 0.9993489980697632,
415 "step": 29
416 },
417 {
418 "clip_ratio": 0.0,
419 "completion_length": 103.44140625,
420 "epoch": 0.3448275862068966,
421 "grad_norm": 1.264532198419213,
422 "kl": 0.0211181640625,
423 "learning_rate": 9.655172413793103e-07,
424 "loss": 0.0009,
425 "reward": 1.819528341293335,
426 "reward_std": 0.04290936142206192,
427 "rewards/accuracy_reward": 0.8195282816886902,
428 "rewards/format_reward": 1.0,
429 "step": 30
430 },
431 {
432 "clip_ratio": 0.0,
433 "completion_length": 103.53255462646484,
434 "epoch": 0.3563218390804598,
435 "grad_norm": 1.7079785748793026,
436 "kl": 0.0220947265625,
437 "learning_rate": 9.643678160919539e-07,
438 "loss": 0.0009,
439 "reward": 1.8193325996398926,
440 "reward_std": 0.039525143802165985,
441 "rewards/accuracy_reward": 0.8193327188491821,
442 "rewards/format_reward": 1.0,
443 "step": 31
444 },
445 {
446 "clip_ratio": 0.0,
447 "completion_length": 105.01237487792969,
448 "epoch": 0.367816091954023,
449 "grad_norm": 1.1050680979361267,
450 "kl": 0.018310546875,
451 "learning_rate": 9.632183908045976e-07,
452 "loss": 0.0008,
453 "reward": 1.8326280117034912,
454 "reward_std": 0.03926192969083786,
455 "rewards/accuracy_reward": 0.833279013633728,
456 "rewards/format_reward": 0.9993489980697632,
457 "step": 32
458 },
459 {
460 "clip_ratio": 0.0,
461 "completion_length": 107.38542175292969,
462 "epoch": 0.3793103448275862,
463 "grad_norm": 1.9594290453169385,
464 "kl": 0.018310546875,
465 "learning_rate": 9.620689655172413e-07,
466 "loss": 0.0008,
467 "reward": 1.830482840538025,
468 "reward_std": 0.03912848234176636,
469 "rewards/accuracy_reward": 0.8304828405380249,
470 "rewards/format_reward": 1.0,
471 "step": 33
472 },
473 {
474 "clip_ratio": 0.0,
475 "completion_length": 108.35482025146484,
476 "epoch": 0.39080459770114945,
477 "grad_norm": 1.1967253632472934,
478 "kl": 0.01806640625,
479 "learning_rate": 9.609195402298849e-07,
480 "loss": 0.0007,
481 "reward": 1.8504703044891357,
482 "reward_std": 0.03594374656677246,
483 "rewards/accuracy_reward": 0.8504700660705566,
484 "rewards/format_reward": 1.0,
485 "step": 34
486 },
487 {
488 "clip_ratio": 0.0,
489 "completion_length": 107.44271087646484,
490 "epoch": 0.40229885057471265,
491 "grad_norm": 1.4075688908737447,
492 "kl": 0.020751953125,
493 "learning_rate": 9.597701149425286e-07,
494 "loss": 0.0009,
495 "reward": 1.8235161304473877,
496 "reward_std": 0.03952939808368683,
497 "rewards/accuracy_reward": 0.8241671919822693,
498 "rewards/format_reward": 0.9993489980697632,
499 "step": 35
500 },
501 {
502 "clip_ratio": 0.0,
503 "completion_length": 106.69010925292969,
504 "epoch": 0.41379310344827586,
505 "grad_norm": 1.1584071923987709,
506 "kl": 0.0166015625,
507 "learning_rate": 9.586206896551724e-07,
508 "loss": 0.0007,
509 "reward": 1.8258438110351562,
510 "reward_std": 0.03740541636943817,
511 "rewards/accuracy_reward": 0.8258438110351562,
512 "rewards/format_reward": 1.0,
513 "step": 36
514 },
515 {
516 "clip_ratio": 0.0,
517 "completion_length": 106.52083587646484,
518 "epoch": 0.42528735632183906,
519 "grad_norm": 3.559550579405242,
520 "kl": 0.017578125,
521 "learning_rate": 9.57471264367816e-07,
522 "loss": 0.0007,
523 "reward": 1.8376644849777222,
524 "reward_std": 0.03800010681152344,
525 "rewards/accuracy_reward": 0.8376644849777222,
526 "rewards/format_reward": 1.0,
527 "step": 37
528 },
529 {
530 "clip_ratio": 0.0,
531 "completion_length": 104.80078125,
532 "epoch": 0.4367816091954023,
533 "grad_norm": 1.414167289320497,
534 "kl": 0.017333984375,
535 "learning_rate": 9.563218390804596e-07,
536 "loss": 0.0007,
537 "reward": 1.8298163414001465,
538 "reward_std": 0.03874580189585686,
539 "rewards/accuracy_reward": 0.8304674029350281,
540 "rewards/format_reward": 0.9993489980697632,
541 "step": 38
542 },
543 {
544 "clip_ratio": 0.0,
545 "completion_length": 104.865234375,
546 "epoch": 0.4482758620689655,
547 "grad_norm": 3.3277943720668635,
548 "kl": 0.01806640625,
549 "learning_rate": 9.551724137931034e-07,
550 "loss": 0.0008,
551 "reward": 1.8412444591522217,
552 "reward_std": 0.0331571064889431,
553 "rewards/accuracy_reward": 0.8412443995475769,
554 "rewards/format_reward": 1.0,
555 "step": 39
556 },
557 {
558 "clip_ratio": 0.0,
559 "completion_length": 104.65625,
560 "epoch": 0.45977011494252873,
561 "grad_norm": 1.3938070271983027,
562 "kl": 0.01904296875,
563 "learning_rate": 9.540229885057471e-07,
564 "loss": 0.0008,
565 "reward": 1.829883098602295,
566 "reward_std": 0.03403263911604881,
567 "rewards/accuracy_reward": 0.8298830986022949,
568 "rewards/format_reward": 1.0,
569 "step": 40
570 },
571 {
572 "clip_ratio": 0.0,
573 "completion_length": 103.31315612792969,
574 "epoch": 0.47126436781609193,
575 "grad_norm": 2.4737739620251444,
576 "kl": 0.01806640625,
577 "learning_rate": 9.528735632183908e-07,
578 "loss": 0.0008,
579 "reward": 1.8481197357177734,
580 "reward_std": 0.03319514915347099,
581 "rewards/accuracy_reward": 0.8481197357177734,
582 "rewards/format_reward": 1.0,
583 "step": 41
584 },
585 {
586 "clip_ratio": 0.0,
587 "completion_length": 105.20247650146484,
588 "epoch": 0.4827586206896552,
589 "grad_norm": 3.381834125397947,
590 "kl": 0.0181884765625,
591 "learning_rate": 9.517241379310345e-07,
592 "loss": 0.0008,
593 "reward": 1.8192237615585327,
594 "reward_std": 0.03670245409011841,
595 "rewards/accuracy_reward": 0.8198747634887695,
596 "rewards/format_reward": 0.9993489980697632,
597 "step": 42
598 },
599 {
600 "clip_ratio": 0.0,
601 "completion_length": 103.3125,
602 "epoch": 0.4942528735632184,
603 "grad_norm": 1.8762875005665294,
604 "kl": 0.0189208984375,
605 "learning_rate": 9.505747126436781e-07,
606 "loss": 0.0008,
607 "reward": 1.866302251815796,
608 "reward_std": 0.029023345559835434,
609 "rewards/accuracy_reward": 0.8663021922111511,
610 "rewards/format_reward": 1.0,
611 "step": 43
612 },
613 {
614 "clip_ratio": 0.0,
615 "completion_length": 103.919921875,
616 "epoch": 0.5057471264367817,
617 "grad_norm": 1.364282450282579,
618 "kl": 0.0186767578125,
619 "learning_rate": 9.494252873563218e-07,
620 "loss": 0.0008,
621 "reward": 1.8467683792114258,
622 "reward_std": 0.032513901591300964,
623 "rewards/accuracy_reward": 0.8467683792114258,
624 "rewards/format_reward": 1.0,
625 "step": 44
626 },
627 {
628 "clip_ratio": 0.0,
629 "completion_length": 105.521484375,
630 "epoch": 0.5172413793103449,
631 "grad_norm": 1.26529494442379,
632 "kl": 0.0184326171875,
633 "learning_rate": 9.482758620689655e-07,
634 "loss": 0.0008,
635 "reward": 1.8102507591247559,
636 "reward_std": 0.03791056573390961,
637 "rewards/accuracy_reward": 0.8115529417991638,
638 "rewards/format_reward": 0.9986979365348816,
639 "step": 45
640 },
641 {
642 "clip_ratio": 0.0,
643 "completion_length": 104.720703125,
644 "epoch": 0.5287356321839081,
645 "grad_norm": 1.9592533419058873,
646 "kl": 0.0203857421875,
647 "learning_rate": 9.471264367816092e-07,
648 "loss": 0.0009,
649 "reward": 1.8435447216033936,
650 "reward_std": 0.03397071361541748,
651 "rewards/accuracy_reward": 0.844195544719696,
652 "rewards/format_reward": 0.9993489980697632,
653 "step": 46
654 },
655 {
656 "clip_ratio": 0.0,
657 "completion_length": 105.55013275146484,
658 "epoch": 0.5402298850574713,
659 "grad_norm": 1.434106375840011,
660 "kl": 0.01953125,
661 "learning_rate": 9.459770114942528e-07,
662 "loss": 0.0008,
663 "reward": 1.8275771141052246,
664 "reward_std": 0.039521463215351105,
665 "rewards/accuracy_reward": 0.8288793563842773,
666 "rewards/format_reward": 0.9986979365348816,
667 "step": 47
668 },
669 {
670 "clip_ratio": 0.0,
671 "completion_length": 108.88346862792969,
672 "epoch": 0.5517241379310345,
673 "grad_norm": 1.3331417373343877,
674 "kl": 0.019775390625,
675 "learning_rate": 9.448275862068965e-07,
676 "loss": 0.0008,
677 "reward": 1.847245216369629,
678 "reward_std": 0.031749702990055084,
679 "rewards/accuracy_reward": 0.8472453951835632,
680 "rewards/format_reward": 1.0,
681 "step": 48
682 },
683 {
684 "clip_ratio": 0.0,
685 "completion_length": 106.53841400146484,
686 "epoch": 0.5632183908045977,
687 "grad_norm": 1.2065807063642335,
688 "kl": 0.0191650390625,
689 "learning_rate": 9.436781609195402e-07,
690 "loss": 0.0008,
691 "reward": 1.8429206609725952,
692 "reward_std": 0.030585885047912598,
693 "rewards/accuracy_reward": 0.8435718417167664,
694 "rewards/format_reward": 0.9993489980697632,
695 "step": 49
696 },
697 {
698 "clip_ratio": 0.0,
699 "completion_length": 105.23503112792969,
700 "epoch": 0.5747126436781609,
701 "grad_norm": 1.8805762744730514,
702 "kl": 0.0216064453125,
703 "learning_rate": 9.425287356321838e-07,
704 "loss": 0.0009,
705 "reward": 1.8460264205932617,
706 "reward_std": 0.030056733638048172,
707 "rewards/accuracy_reward": 0.8460264205932617,
708 "rewards/format_reward": 1.0,
709 "step": 50
710 },
711 {
712 "clip_ratio": 0.0,
713 "completion_length": 107.20964050292969,
714 "epoch": 0.5862068965517241,
715 "grad_norm": 2.416272655538971,
716 "kl": 0.031494140625,
717 "learning_rate": 9.413793103448276e-07,
718 "loss": 0.0013,
719 "reward": 1.8473304510116577,
720 "reward_std": 0.028418144211173058,
721 "rewards/accuracy_reward": 0.8473303318023682,
722 "rewards/format_reward": 1.0,
723 "step": 51
724 },
725 {
726 "clip_ratio": 0.0,
727 "completion_length": 106.59245300292969,
728 "epoch": 0.5977011494252874,
729 "grad_norm": 1.7021853778739644,
730 "kl": 0.0206298828125,
731 "learning_rate": 9.402298850574713e-07,
732 "loss": 0.0009,
733 "reward": 1.837215781211853,
734 "reward_std": 0.027952462434768677,
735 "rewards/accuracy_reward": 0.837215781211853,
736 "rewards/format_reward": 1.0,
737 "step": 52
738 },
739 {
740 "clip_ratio": 0.0,
741 "completion_length": 106.181640625,
742 "epoch": 0.6091954022988506,
743 "grad_norm": 1.5829777832061342,
744 "kl": 0.020263671875,
745 "learning_rate": 9.390804597701148e-07,
746 "loss": 0.0009,
747 "reward": 1.8315935134887695,
748 "reward_std": 0.03717650845646858,
749 "rewards/accuracy_reward": 0.8315935134887695,
750 "rewards/format_reward": 1.0,
751 "step": 53
752 },
753 {
754 "clip_ratio": 0.0,
755 "completion_length": 105.23112487792969,
756 "epoch": 0.6206896551724138,
757 "grad_norm": 1.5951363696277052,
758 "kl": 0.027099609375,
759 "learning_rate": 9.379310344827586e-07,
760 "loss": 0.0011,
761 "reward": 1.8406846523284912,
762 "reward_std": 0.031024938449263573,
763 "rewards/accuracy_reward": 0.8413355350494385,
764 "rewards/format_reward": 0.9993489980697632,
765 "step": 54
766 },
767 {
768 "clip_ratio": 0.0,
769 "completion_length": 105.03190612792969,
770 "epoch": 0.632183908045977,
771 "grad_norm": 1.3835743922707338,
772 "kl": 0.018798828125,
773 "learning_rate": 9.367816091954023e-07,
774 "loss": 0.0009,
775 "reward": 1.8285382986068726,
776 "reward_std": 0.02894814871251583,
777 "rewards/accuracy_reward": 0.8285383582115173,
778 "rewards/format_reward": 1.0,
779 "step": 55
780 },
781 {
782 "clip_ratio": 0.0,
783 "completion_length": 104.86849212646484,
784 "epoch": 0.6436781609195402,
785 "grad_norm": 1.3591671428361543,
786 "kl": 0.019287109375,
787 "learning_rate": 9.356321839080458e-07,
788 "loss": 0.0008,
789 "reward": 1.8349835872650146,
790 "reward_std": 0.033723484724760056,
791 "rewards/accuracy_reward": 0.8356344699859619,
792 "rewards/format_reward": 0.9993489980697632,
793 "step": 56
794 },
795 {
796 "clip_ratio": 0.0,
797 "completion_length": 105.513671875,
798 "epoch": 0.6551724137931034,
799 "grad_norm": 2.359691828231059,
800 "kl": 0.02392578125,
801 "learning_rate": 9.344827586206896e-07,
802 "loss": 0.001,
803 "reward": 1.8344968557357788,
804 "reward_std": 0.03300865739583969,
805 "rewards/accuracy_reward": 0.83514803647995,
806 "rewards/format_reward": 0.9993489980697632,
807 "step": 57
808 },
809 {
810 "clip_ratio": 0.0,
811 "completion_length": 107.46875,
812 "epoch": 0.6666666666666666,
813 "grad_norm": 1.4850162206462427,
814 "kl": 0.020263671875,
815 "learning_rate": 9.333333333333333e-07,
816 "loss": 0.0009,
817 "reward": 1.8346723318099976,
818 "reward_std": 0.03354714438319206,
819 "rewards/accuracy_reward": 0.8346724510192871,
820 "rewards/format_reward": 1.0,
821 "step": 58
822 },
823 {
824 "clip_ratio": 0.0,
825 "completion_length": 106.7109375,
826 "epoch": 0.6781609195402298,
827 "grad_norm": 1.2483974787682879,
828 "kl": 0.02197265625,
829 "learning_rate": 9.321839080459771e-07,
830 "loss": 0.0009,
831 "reward": 1.8298664093017578,
832 "reward_std": 0.03340629115700722,
833 "rewards/accuracy_reward": 0.8305175304412842,
834 "rewards/format_reward": 0.9993489980697632,
835 "step": 59
836 },
837 {
838 "clip_ratio": 0.0,
839 "completion_length": 106.80078125,
840 "epoch": 0.6896551724137931,
841 "grad_norm": 2.363012214927635,
842 "kl": 0.02978515625,
843 "learning_rate": 9.310344827586206e-07,
844 "loss": 0.0012,
845 "reward": 1.831284761428833,
846 "reward_std": 0.03303222730755806,
847 "rewards/accuracy_reward": 0.8319358229637146,
848 "rewards/format_reward": 0.9993489980697632,
849 "step": 60
850 },
851 {
852 "clip_ratio": 0.0,
853 "completion_length": 109.81771087646484,
854 "epoch": 0.7011494252873564,
855 "grad_norm": 4.0916014790353445,
856 "kl": 0.0196533203125,
857 "learning_rate": 9.298850574712643e-07,
858 "loss": 0.0008,
859 "reward": 1.839867353439331,
860 "reward_std": 0.031664229929447174,
861 "rewards/accuracy_reward": 0.8398674726486206,
862 "rewards/format_reward": 1.0,
863 "step": 61
864 },
865 {
866 "clip_ratio": 0.0,
867 "completion_length": 108.52214050292969,
868 "epoch": 0.7126436781609196,
869 "grad_norm": 3.2309398976005297,
870 "kl": 0.022216796875,
871 "learning_rate": 9.287356321839081e-07,
872 "loss": 0.0009,
873 "reward": 1.854928970336914,
874 "reward_std": 0.031918901950120926,
875 "rewards/accuracy_reward": 0.8549291491508484,
876 "rewards/format_reward": 1.0,
877 "step": 62
878 },
879 {
880 "clip_ratio": 0.0,
881 "completion_length": 108.703125,
882 "epoch": 0.7241379310344828,
883 "grad_norm": 1.9168568902881002,
884 "kl": 0.019287109375,
885 "learning_rate": 9.275862068965516e-07,
886 "loss": 0.0008,
887 "reward": 1.8610057830810547,
888 "reward_std": 0.02559598907828331,
889 "rewards/accuracy_reward": 0.8610057234764099,
890 "rewards/format_reward": 1.0,
891 "step": 63
892 },
893 {
894 "clip_ratio": 0.0,
895 "completion_length": 108.36067962646484,
896 "epoch": 0.735632183908046,
897 "grad_norm": 1.3566272735502378,
898 "kl": 0.0191650390625,
899 "learning_rate": 9.264367816091954e-07,
900 "loss": 0.0008,
901 "reward": 1.8530144691467285,
902 "reward_std": 0.0317344069480896,
903 "rewards/accuracy_reward": 0.8530145883560181,
904 "rewards/format_reward": 1.0,
905 "step": 64
906 },
907 {
908 "clip_ratio": 0.0,
909 "completion_length": 106.51888275146484,
910 "epoch": 0.7471264367816092,
911 "grad_norm": 1.9418693251170596,
912 "kl": 0.01806640625,
913 "learning_rate": 9.252873563218391e-07,
914 "loss": 0.0008,
915 "reward": 1.8553061485290527,
916 "reward_std": 0.028751468285918236,
917 "rewards/accuracy_reward": 0.8553061485290527,
918 "rewards/format_reward": 1.0,
919 "step": 65
920 },
921 {
922 "clip_ratio": 0.0,
923 "completion_length": 108.462890625,
924 "epoch": 0.7586206896551724,
925 "grad_norm": 1.6584510097653566,
926 "kl": 0.0179443359375,
927 "learning_rate": 9.241379310344826e-07,
928 "loss": 0.0008,
929 "reward": 1.8472495079040527,
930 "reward_std": 0.029529428109526634,
931 "rewards/accuracy_reward": 0.8472495079040527,
932 "rewards/format_reward": 1.0,
933 "step": 66
934 },
935 {
936 "clip_ratio": 0.0,
937 "completion_length": 109.38671875,
938 "epoch": 0.7701149425287356,
939 "grad_norm": 1.5373869592745633,
940 "kl": 0.0185546875,
941 "learning_rate": 9.229885057471264e-07,
942 "loss": 0.0008,
943 "reward": 1.8314546346664429,
944 "reward_std": 0.03092101775109768,
945 "rewards/accuracy_reward": 0.8314546346664429,
946 "rewards/format_reward": 1.0,
947 "step": 67
948 },
949 {
950 "clip_ratio": 0.0,
951 "completion_length": 106.568359375,
952 "epoch": 0.7816091954022989,
953 "grad_norm": 1.4559609132563736,
954 "kl": 0.01806640625,
955 "learning_rate": 9.218390804597701e-07,
956 "loss": 0.0008,
957 "reward": 1.8663041591644287,
958 "reward_std": 0.026553651317954063,
959 "rewards/accuracy_reward": 0.8663042783737183,
960 "rewards/format_reward": 1.0,
961 "step": 68
962 },
963 {
964 "clip_ratio": 0.0,
965 "completion_length": 104.46419525146484,
966 "epoch": 0.7931034482758621,
967 "grad_norm": 1.2008629627793295,
968 "kl": 0.0186767578125,
969 "learning_rate": 9.206896551724138e-07,
970 "loss": 0.0008,
971 "reward": 1.8613271713256836,
972 "reward_std": 0.02855812758207321,
973 "rewards/accuracy_reward": 0.8613271713256836,
974 "rewards/format_reward": 1.0,
975 "step": 69
976 },
977 {
978 "clip_ratio": 0.0,
979 "completion_length": 104.99153900146484,
980 "epoch": 0.8045977011494253,
981 "grad_norm": 4.9275967882281275,
982 "kl": 0.020751953125,
983 "learning_rate": 9.195402298850574e-07,
984 "loss": 0.0009,
985 "reward": 1.8591216802597046,
986 "reward_std": 0.028761819005012512,
987 "rewards/accuracy_reward": 0.8591216802597046,
988 "rewards/format_reward": 1.0,
989 "step": 70
990 },
991 {
992 "clip_ratio": 0.0,
993 "completion_length": 105.91536712646484,
994 "epoch": 0.8160919540229885,
995 "grad_norm": 9.850480698430928,
996 "kl": 0.01953125,
997 "learning_rate": 9.183908045977011e-07,
998 "loss": 0.0008,
999 "reward": 1.8541549444198608,
1000 "reward_std": 0.028836514800786972,
1001 "rewards/accuracy_reward": 0.8548059463500977,
1002 "rewards/format_reward": 0.9993489980697632,
1003 "step": 71
1004 },
1005 {
1006 "clip_ratio": 0.0,
1007 "completion_length": 102.697265625,
1008 "epoch": 0.8275862068965517,
1009 "grad_norm": 1.210498300407391,
1010 "kl": 0.0201416015625,
1011 "learning_rate": 9.172413793103448e-07,
1012 "loss": 0.0009,
1013 "reward": 1.843854308128357,
1014 "reward_std": 0.030750762671232224,
1015 "rewards/accuracy_reward": 0.8438543677330017,
1016 "rewards/format_reward": 1.0,
1017 "step": 72
1018 },
1019 {
1020 "clip_ratio": 0.0,
1021 "completion_length": 104.44075775146484,
1022 "epoch": 0.8390804597701149,
1023 "grad_norm": 1.8321539372725062,
1024 "kl": 0.0205078125,
1025 "learning_rate": 9.160919540229884e-07,
1026 "loss": 0.0009,
1027 "reward": 1.8534529209136963,
1028 "reward_std": 0.028409739956259727,
1029 "rewards/accuracy_reward": 0.8534530401229858,
1030 "rewards/format_reward": 1.0,
1031 "step": 73
1032 },
1033 {
1034 "clip_ratio": 0.0,
1035 "completion_length": 102.00065612792969,
1036 "epoch": 0.8505747126436781,
1037 "grad_norm": 1.1934111085412287,
1038 "kl": 0.0218505859375,
1039 "learning_rate": 9.149425287356322e-07,
1040 "loss": 0.0009,
1041 "reward": 1.8261594772338867,
1042 "reward_std": 0.028989605605602264,
1043 "rewards/accuracy_reward": 0.8268105387687683,
1044 "rewards/format_reward": 0.9993489980697632,
1045 "step": 74
1046 },
1047 {
1048 "clip_ratio": 0.0,
1049 "completion_length": 102.86458587646484,
1050 "epoch": 0.8620689655172413,
1051 "grad_norm": 1.1973438361715183,
1052 "kl": 0.02294921875,
1053 "learning_rate": 9.137931034482759e-07,
1054 "loss": 0.001,
1055 "reward": 1.8427714109420776,
1056 "reward_std": 0.028574138879776,
1057 "rewards/accuracy_reward": 0.8427714109420776,
1058 "rewards/format_reward": 1.0,
1059 "step": 75
1060 },
1061 {
1062 "clip_ratio": 0.0,
1063 "completion_length": 104.2109375,
1064 "epoch": 0.8735632183908046,
1065 "grad_norm": 1.3720840552975169,
1066 "kl": 0.0208740234375,
1067 "learning_rate": 9.126436781609194e-07,
1068 "loss": 0.0009,
1069 "reward": 1.8438997268676758,
1070 "reward_std": 0.034771427512168884,
1071 "rewards/accuracy_reward": 0.8452019095420837,
1072 "rewards/format_reward": 0.9986979365348816,
1073 "step": 76
1074 },
1075 {
1076 "clip_ratio": 0.0,
1077 "completion_length": 103.87825775146484,
1078 "epoch": 0.8850574712643678,
1079 "grad_norm": 1.2733923638265987,
1080 "kl": 0.021240234375,
1081 "learning_rate": 9.114942528735632e-07,
1082 "loss": 0.0009,
1083 "reward": 1.8525419235229492,
1084 "reward_std": 0.02576223388314247,
1085 "rewards/accuracy_reward": 0.8525419235229492,
1086 "rewards/format_reward": 1.0,
1087 "step": 77
1088 },
1089 {
1090 "clip_ratio": 0.0,
1091 "completion_length": 105.30599212646484,
1092 "epoch": 0.896551724137931,
1093 "grad_norm": 1.5683715987496294,
1094 "kl": 0.022705078125,
1095 "learning_rate": 9.103448275862069e-07,
1096 "loss": 0.0009,
1097 "reward": 1.8584070205688477,
1098 "reward_std": 0.026499662548303604,
1099 "rewards/accuracy_reward": 0.8590580821037292,
1100 "rewards/format_reward": 0.9993489980697632,
1101 "step": 78
1102 },
1103 {
1104 "clip_ratio": 0.0,
1105 "completion_length": 108.30208587646484,
1106 "epoch": 0.9080459770114943,
1107 "grad_norm": 1.7144360008338546,
1108 "kl": 0.0206298828125,
1109 "learning_rate": 9.091954022988505e-07,
1110 "loss": 0.0009,
1111 "reward": 1.836766004562378,
1112 "reward_std": 0.03153412044048309,
1113 "rewards/accuracy_reward": 0.8374168276786804,
1114 "rewards/format_reward": 0.9993489980697632,
1115 "step": 79
1116 },
1117 {
1118 "clip_ratio": 0.0,
1119 "completion_length": 106.78646087646484,
1120 "epoch": 0.9195402298850575,
1121 "grad_norm": 1.4249106766015625,
1122 "kl": 0.021484375,
1123 "learning_rate": 9.080459770114942e-07,
1124 "loss": 0.0009,
1125 "reward": 1.8593868017196655,
1126 "reward_std": 0.030518915504217148,
1127 "rewards/accuracy_reward": 0.859386682510376,
1128 "rewards/format_reward": 1.0,
1129 "step": 80
1130 },
1131 {
1132 "clip_ratio": 0.0,
1133 "completion_length": 108.84700775146484,
1134 "epoch": 0.9310344827586207,
1135 "grad_norm": 1.6378679289752103,
1136 "kl": 0.0205078125,
1137 "learning_rate": 9.068965517241379e-07,
1138 "loss": 0.0009,
1139 "reward": 1.8533859252929688,
1140 "reward_std": 0.02878081053495407,
1141 "rewards/accuracy_reward": 0.8533859252929688,
1142 "rewards/format_reward": 1.0,
1143 "step": 81
1144 },
1145 {
1146 "clip_ratio": 0.0,
1147 "completion_length": 110.23567962646484,
1148 "epoch": 0.9425287356321839,
1149 "grad_norm": 1.412691105042645,
1150 "kl": 0.021484375,
1151 "learning_rate": 9.057471264367816e-07,
1152 "loss": 0.0009,
1153 "reward": 1.847813606262207,
1154 "reward_std": 0.03038616292178631,
1155 "rewards/accuracy_reward": 0.8484646677970886,
1156 "rewards/format_reward": 0.9993489980697632,
1157 "step": 82
1158 },
1159 {
1160 "clip_ratio": 0.0,
1161 "completion_length": 108.27474212646484,
1162 "epoch": 0.9540229885057471,
1163 "grad_norm": 2.0344799768549784,
1164 "kl": 0.0244140625,
1165 "learning_rate": 9.045977011494252e-07,
1166 "loss": 0.001,
1167 "reward": 1.8461253643035889,
1168 "reward_std": 0.03104616329073906,
1169 "rewards/accuracy_reward": 0.8467763662338257,
1170 "rewards/format_reward": 0.9993489980697632,
1171 "step": 83
1172 },
1173 {
1174 "clip_ratio": 0.0,
1175 "completion_length": 109.6640625,
1176 "epoch": 0.9655172413793104,
1177 "grad_norm": 1.5604062393784244,
1178 "kl": 0.0211181640625,
1179 "learning_rate": 9.034482758620689e-07,
1180 "loss": 0.0009,
1181 "reward": 1.8446769714355469,
1182 "reward_std": 0.025367258116602898,
1183 "rewards/accuracy_reward": 0.8446769714355469,
1184 "rewards/format_reward": 1.0,
1185 "step": 84
1186 },
1187 {
1188 "clip_ratio": 0.0,
1189 "completion_length": 108.02734375,
1190 "epoch": 0.9770114942528736,
1191 "grad_norm": 1.362117300400181,
1192 "kl": 0.0216064453125,
1193 "learning_rate": 9.022988505747126e-07,
1194 "loss": 0.0009,
1195 "reward": 1.850293517112732,
1196 "reward_std": 0.027060410007834435,
1197 "rewards/accuracy_reward": 0.8502935171127319,
1198 "rewards/format_reward": 1.0,
1199 "step": 85
1200 },
1201 {
1202 "clip_ratio": 0.0,
1203 "completion_length": 108.21159362792969,
1204 "epoch": 0.9885057471264368,
1205 "grad_norm": 4.622693800475361,
1206 "kl": 0.0196533203125,
1207 "learning_rate": 9.011494252873562e-07,
1208 "loss": 0.0008,
1209 "reward": 1.8619520664215088,
1210 "reward_std": 0.02936243824660778,
1211 "rewards/accuracy_reward": 0.8626030683517456,
1212 "rewards/format_reward": 0.9993489980697632,
1213 "step": 86
1214 },
1215 {
1216 "clip_ratio": 0.0,
1217 "completion_length": 102.17837524414062,
1218 "epoch": 1.0,
1219 "grad_norm": 1.1739865165675025,
1220 "kl": 0.0206298828125,
1221 "learning_rate": 9e-07,
1222 "loss": 0.0009,
1223 "reward": 1.865034818649292,
1224 "reward_std": 0.024553239345550537,
1225 "rewards/accuracy_reward": 0.8650349974632263,
1226 "rewards/format_reward": 1.0,
1227 "step": 87
1228 },
1229 {
1230 "clip_ratio": 0.0,
1231 "completion_length": 110.279296875,
1232 "epoch": 1.0114942528735633,
1233 "grad_norm": 1.7212140289390774,
1234 "kl": 0.022216796875,
1235 "learning_rate": 8.988505747126436e-07,
1236 "loss": 0.001,
1237 "reward": 1.844975471496582,
1238 "reward_std": 0.031139438971877098,
1239 "rewards/accuracy_reward": 0.8449755907058716,
1240 "rewards/format_reward": 1.0,
1241 "step": 88
1242 },
1243 {
1244 "clip_ratio": 0.0,
1245 "completion_length": 109.97721862792969,
1246 "epoch": 1.0229885057471264,
1247 "grad_norm": 2.1208124047907018,
1248 "kl": 0.02294921875,
1249 "learning_rate": 8.977011494252873e-07,
1250 "loss": 0.001,
1251 "reward": 1.8310832977294922,
1252 "reward_std": 0.030721893534064293,
1253 "rewards/accuracy_reward": 0.8310832381248474,
1254 "rewards/format_reward": 1.0,
1255 "step": 89
1256 },
1257 {
1258 "clip_ratio": 0.0,
1259 "completion_length": 109.72982025146484,
1260 "epoch": 1.0344827586206897,
1261 "grad_norm": 1.7904458786353679,
1262 "kl": 0.025390625,
1263 "learning_rate": 8.96551724137931e-07,
1264 "loss": 0.0011,
1265 "reward": 1.8376115560531616,
1266 "reward_std": 0.03200274333357811,
1267 "rewards/accuracy_reward": 0.8382627367973328,
1268 "rewards/format_reward": 0.9993489980697632,
1269 "step": 90
1270 },
1271 {
1272 "clip_ratio": 0.0,
1273 "completion_length": 109.40495300292969,
1274 "epoch": 1.0459770114942528,
1275 "grad_norm": 1.6518476245455913,
1276 "kl": 0.0216064453125,
1277 "learning_rate": 8.954022988505747e-07,
1278 "loss": 0.0009,
1279 "reward": 1.8409684896469116,
1280 "reward_std": 0.02987781912088394,
1281 "rewards/accuracy_reward": 0.8416194915771484,
1282 "rewards/format_reward": 0.9993489980697632,
1283 "step": 91
1284 },
1285 {
1286 "clip_ratio": 0.0,
1287 "completion_length": 108.744140625,
1288 "epoch": 1.0574712643678161,
1289 "grad_norm": 2.410653089678698,
1290 "kl": 0.021484375,
1291 "learning_rate": 8.942528735632184e-07,
1292 "loss": 0.0009,
1293 "reward": 1.8022823333740234,
1294 "reward_std": 0.03017442300915718,
1295 "rewards/accuracy_reward": 0.8022822141647339,
1296 "rewards/format_reward": 1.0,
1297 "step": 92
1298 },
1299 {
1300 "clip_ratio": 0.0,
1301 "completion_length": 109.03255462646484,
1302 "epoch": 1.0689655172413792,
1303 "grad_norm": 1.070833843767388,
1304 "kl": 0.0230712890625,
1305 "learning_rate": 8.93103448275862e-07,
1306 "loss": 0.001,
1307 "reward": 1.8557833433151245,
1308 "reward_std": 0.030436282977461815,
1309 "rewards/accuracy_reward": 0.8570854663848877,
1310 "rewards/format_reward": 0.9986979365348816,
1311 "step": 93
1312 },
1313 {
1314 "clip_ratio": 0.0,
1315 "completion_length": 106.84635925292969,
1316 "epoch": 1.0804597701149425,
1317 "grad_norm": 1.2651791750173826,
1318 "kl": 0.0216064453125,
1319 "learning_rate": 8.919540229885057e-07,
1320 "loss": 0.0009,
1321 "reward": 1.8365752696990967,
1322 "reward_std": 0.025262486189603806,
1323 "rewards/accuracy_reward": 0.8372262716293335,
1324 "rewards/format_reward": 0.9993489980697632,
1325 "step": 94
1326 },
1327 {
1328 "clip_ratio": 0.0,
1329 "completion_length": 106.35091400146484,
1330 "epoch": 1.0919540229885056,
1331 "grad_norm": 2.5454147273169463,
1332 "kl": 0.0213623046875,
1333 "learning_rate": 8.908045977011494e-07,
1334 "loss": 0.0009,
1335 "reward": 1.8451789617538452,
1336 "reward_std": 0.02507897838950157,
1337 "rewards/accuracy_reward": 0.8451790809631348,
1338 "rewards/format_reward": 1.0,
1339 "step": 95
1340 },
1341 {
1342 "clip_ratio": 0.0,
1343 "completion_length": 106.16862487792969,
1344 "epoch": 1.103448275862069,
1345 "grad_norm": 1.5956876645494014,
1346 "kl": 0.020263671875,
1347 "learning_rate": 8.896551724137931e-07,
1348 "loss": 0.0009,
1349 "reward": 1.8393771648406982,
1350 "reward_std": 0.02759551629424095,
1351 "rewards/accuracy_reward": 0.8393771052360535,
1352 "rewards/format_reward": 1.0,
1353 "step": 96
1354 },
1355 {
1356 "clip_ratio": 0.0,
1357 "completion_length": 105.232421875,
1358 "epoch": 1.1149425287356323,
1359 "grad_norm": 1.6538617605670678,
1360 "kl": 0.0230712890625,
1361 "learning_rate": 8.885057471264368e-07,
1362 "loss": 0.001,
1363 "reward": 1.8187679052352905,
1364 "reward_std": 0.028722627088427544,
1365 "rewards/accuracy_reward": 0.8187679052352905,
1366 "rewards/format_reward": 1.0,
1367 "step": 97
1368 },
1369 {
1370 "clip_ratio": 0.0,
1371 "completion_length": 104.37109375,
1372 "epoch": 1.1264367816091954,
1373 "grad_norm": 2.466954207815668,
1374 "kl": 0.020263671875,
1375 "learning_rate": 8.873563218390804e-07,
1376 "loss": 0.0009,
1377 "reward": 1.8639543056488037,
1378 "reward_std": 0.02820052206516266,
1379 "rewards/accuracy_reward": 0.8646053075790405,
1380 "rewards/format_reward": 0.9993489980697632,
1381 "step": 98
1382 },
1383 {
1384 "clip_ratio": 0.0,
1385 "completion_length": 105.6171875,
1386 "epoch": 1.1379310344827587,
1387 "grad_norm": 1.252778063732612,
1388 "kl": 0.0211181640625,
1389 "learning_rate": 8.862068965517241e-07,
1390 "loss": 0.0009,
1391 "reward": 1.8373963832855225,
1392 "reward_std": 0.027982115745544434,
1393 "rewards/accuracy_reward": 0.837396502494812,
1394 "rewards/format_reward": 1.0,
1395 "step": 99
1396 },
1397 {
1398 "clip_ratio": 0.0,
1399 "completion_length": 104.939453125,
1400 "epoch": 1.1494252873563218,
1401 "grad_norm": 1.5031364905308602,
1402 "kl": 0.021240234375,
1403 "learning_rate": 8.850574712643678e-07,
1404 "loss": 0.0009,
1405 "reward": 1.8384751081466675,
1406 "reward_std": 0.02889677882194519,
1407 "rewards/accuracy_reward": 0.8384751081466675,
1408 "rewards/format_reward": 1.0,
1409 "step": 100
1410 },
1411 {
1412 "clip_ratio": 0.0,
1413 "completion_length": 105.173828125,
1414 "epoch": 1.160919540229885,
1415 "grad_norm": 3.2098798594469957,
1416 "kl": 0.022216796875,
1417 "learning_rate": 8.839080459770114e-07,
1418 "loss": 0.001,
1419 "reward": 1.858123779296875,
1420 "reward_std": 0.027859613299369812,
1421 "rewards/accuracy_reward": 0.8581236600875854,
1422 "rewards/format_reward": 1.0,
1423 "step": 101
1424 },
1425 {
1426 "clip_ratio": 0.0,
1427 "completion_length": 103.611328125,
1428 "epoch": 1.1724137931034484,
1429 "grad_norm": 1.6395969811083824,
1430 "kl": 0.022705078125,
1431 "learning_rate": 8.827586206896551e-07,
1432 "loss": 0.001,
1433 "reward": 1.8613924980163574,
1434 "reward_std": 0.030063219368457794,
1435 "rewards/accuracy_reward": 0.8626946806907654,
1436 "rewards/format_reward": 0.9986979365348816,
1437 "step": 102
1438 },
1439 {
1440 "clip_ratio": 0.0,
1441 "completion_length": 105.9609375,
1442 "epoch": 1.1839080459770115,
1443 "grad_norm": 1.5759802737821602,
1444 "kl": 0.0218505859375,
1445 "learning_rate": 8.816091954022988e-07,
1446 "loss": 0.001,
1447 "reward": 1.87135648727417,
1448 "reward_std": 0.025703629478812218,
1449 "rewards/accuracy_reward": 0.8713564872741699,
1450 "rewards/format_reward": 1.0,
1451 "step": 103
1452 },
1453 {
1454 "clip_ratio": 0.0,
1455 "completion_length": 106.47591400146484,
1456 "epoch": 1.1954022988505748,
1457 "grad_norm": 2.1153792944695753,
1458 "kl": 0.0218505859375,
1459 "learning_rate": 8.804597701149424e-07,
1460 "loss": 0.001,
1461 "reward": 1.8344066143035889,
1462 "reward_std": 0.02677982673048973,
1463 "rewards/accuracy_reward": 0.8344065546989441,
1464 "rewards/format_reward": 1.0,
1465 "step": 104
1466 },
1467 {
1468 "clip_ratio": 0.0,
1469 "completion_length": 106.17513275146484,
1470 "epoch": 1.206896551724138,
1471 "grad_norm": 1.5729705960989235,
1472 "kl": 0.0223388671875,
1473 "learning_rate": 8.793103448275862e-07,
1474 "loss": 0.0009,
1475 "reward": 1.8618988990783691,
1476 "reward_std": 0.025562942028045654,
1477 "rewards/accuracy_reward": 0.8618988990783691,
1478 "rewards/format_reward": 1.0,
1479 "step": 105
1480 },
1481 {
1482 "clip_ratio": 0.0,
1483 "completion_length": 107.10417175292969,
1484 "epoch": 1.2183908045977012,
1485 "grad_norm": 1.5280787952862458,
1486 "kl": 0.021484375,
1487 "learning_rate": 8.781609195402299e-07,
1488 "loss": 0.0009,
1489 "reward": 1.8250049352645874,
1490 "reward_std": 0.030421411618590355,
1491 "rewards/accuracy_reward": 0.8263069987297058,
1492 "rewards/format_reward": 0.9986979365348816,
1493 "step": 106
1494 },
1495 {
1496 "clip_ratio": 0.0,
1497 "completion_length": 108.99674987792969,
1498 "epoch": 1.2298850574712643,
1499 "grad_norm": 1.3417425340508164,
1500 "kl": 0.02294921875,
1501 "learning_rate": 8.770114942528735e-07,
1502 "loss": 0.001,
1503 "reward": 1.8627097606658936,
1504 "reward_std": 0.029694318771362305,
1505 "rewards/accuracy_reward": 0.8633607625961304,
1506 "rewards/format_reward": 0.9993489980697632,
1507 "step": 107
1508 },
1509 {
1510 "clip_ratio": 0.0,
1511 "completion_length": 109.16732025146484,
1512 "epoch": 1.2413793103448276,
1513 "grad_norm": 1.3637947595566382,
1514 "kl": 0.023193359375,
1515 "learning_rate": 8.758620689655172e-07,
1516 "loss": 0.001,
1517 "reward": 1.8729504346847534,
1518 "reward_std": 0.02589436247944832,
1519 "rewards/accuracy_reward": 0.8729504942893982,
1520 "rewards/format_reward": 1.0,
1521 "step": 108
1522 },
1523 {
1524 "clip_ratio": 0.0,
1525 "completion_length": 111.79232025146484,
1526 "epoch": 1.2528735632183907,
1527 "grad_norm": 1.8473222859814702,
1528 "kl": 0.02294921875,
1529 "learning_rate": 8.747126436781609e-07,
1530 "loss": 0.001,
1531 "reward": 1.8625543117523193,
1532 "reward_std": 0.026772310957312584,
1533 "rewards/accuracy_reward": 0.8625543117523193,
1534 "rewards/format_reward": 1.0,
1535 "step": 109
1536 },
1537 {
1538 "clip_ratio": 0.0,
1539 "completion_length": 111.390625,
1540 "epoch": 1.264367816091954,
1541 "grad_norm": 1.6150275987926945,
1542 "kl": 0.024658203125,
1543 "learning_rate": 8.735632183908046e-07,
1544 "loss": 0.001,
1545 "reward": 1.8422784805297852,
1546 "reward_std": 0.030604541301727295,
1547 "rewards/accuracy_reward": 0.8435806632041931,
1548 "rewards/format_reward": 0.9986979365348816,
1549 "step": 110
1550 },
1551 {
1552 "clip_ratio": 0.0,
1553 "completion_length": 112.96354675292969,
1554 "epoch": 1.2758620689655173,
1555 "grad_norm": 1.7104109971917698,
1556 "kl": 0.03125,
1557 "learning_rate": 8.724137931034482e-07,
1558 "loss": 0.0013,
1559 "reward": 1.8350896835327148,
1560 "reward_std": 0.028940599411725998,
1561 "rewards/accuracy_reward": 0.8357405662536621,
1562 "rewards/format_reward": 0.9993489980697632,
1563 "step": 111
1564 },
1565 {
1566 "clip_ratio": 0.0,
1567 "completion_length": 113.47135925292969,
1568 "epoch": 1.2873563218390804,
1569 "grad_norm": 1.48493737972391,
1570 "kl": 0.02490234375,
1571 "learning_rate": 8.712643678160919e-07,
1572 "loss": 0.0011,
1573 "reward": 1.8481062650680542,
1574 "reward_std": 0.029237091541290283,
1575 "rewards/accuracy_reward": 0.8481062650680542,
1576 "rewards/format_reward": 1.0,
1577 "step": 112
1578 },
1579 {
1580 "clip_ratio": 0.0,
1581 "completion_length": 112.853515625,
1582 "epoch": 1.2988505747126438,
1583 "grad_norm": 1.8812767455198556,
1584 "kl": 0.024169921875,
1585 "learning_rate": 8.701149425287357e-07,
1586 "loss": 0.001,
1587 "reward": 1.8743027448654175,
1588 "reward_std": 0.028526946902275085,
1589 "rewards/accuracy_reward": 0.8743027448654175,
1590 "rewards/format_reward": 1.0,
1591 "step": 113
1592 },
1593 {
1594 "clip_ratio": 0.0,
1595 "completion_length": 112.11653900146484,
1596 "epoch": 1.3103448275862069,
1597 "grad_norm": 1.4552951626639703,
1598 "kl": 0.0244140625,
1599 "learning_rate": 8.689655172413792e-07,
1600 "loss": 0.001,
1601 "reward": 1.8737125396728516,
1602 "reward_std": 0.02662818133831024,
1603 "rewards/accuracy_reward": 0.8737127184867859,
1604 "rewards/format_reward": 1.0,
1605 "step": 114
1606 },
1607 {
1608 "clip_ratio": 0.0,
1609 "completion_length": 109.68620300292969,
1610 "epoch": 1.3218390804597702,
1611 "grad_norm": 1.2099323721036837,
1612 "kl": 0.0242919921875,
1613 "learning_rate": 8.67816091954023e-07,
1614 "loss": 0.001,
1615 "reward": 1.8564056158065796,
1616 "reward_std": 0.03203584998846054,
1617 "rewards/accuracy_reward": 0.8570567965507507,
1618 "rewards/format_reward": 0.9993489980697632,
1619 "step": 115
1620 },
1621 {
1622 "clip_ratio": 0.0,
1623 "completion_length": 110.44140625,
1624 "epoch": 1.3333333333333333,
1625 "grad_norm": 1.512025900233657,
1626 "kl": 0.0262451171875,
1627 "learning_rate": 8.666666666666667e-07,
1628 "loss": 0.0011,
1629 "reward": 1.8566768169403076,
1630 "reward_std": 0.028156783431768417,
1631 "rewards/accuracy_reward": 0.857327938079834,
1632 "rewards/format_reward": 0.9993489980697632,
1633 "step": 116
1634 },
1635 {
1636 "clip_ratio": 0.0,
1637 "completion_length": 107.33073425292969,
1638 "epoch": 1.3448275862068966,
1639 "grad_norm": 2.0358354675170904,
1640 "kl": 0.02392578125,
1641 "learning_rate": 8.655172413793102e-07,
1642 "loss": 0.001,
1643 "reward": 1.8653233051300049,
1644 "reward_std": 0.03316108509898186,
1645 "rewards/accuracy_reward": 0.8666254281997681,
1646 "rewards/format_reward": 0.9986979365348816,
1647 "step": 117
1648 },
1649 {
1650 "clip_ratio": 0.0,
1651 "completion_length": 106.40234375,
1652 "epoch": 1.3563218390804597,
1653 "grad_norm": 4.5664679538349136,
1654 "kl": 0.0260009765625,
1655 "learning_rate": 8.64367816091954e-07,
1656 "loss": 0.0011,
1657 "reward": 1.8475828170776367,
1658 "reward_std": 0.028350701555609703,
1659 "rewards/accuracy_reward": 0.8475827574729919,
1660 "rewards/format_reward": 1.0,
1661 "step": 118
1662 },
1663 {
1664 "clip_ratio": 0.0,
1665 "completion_length": 103.509765625,
1666 "epoch": 1.367816091954023,
1667 "grad_norm": 3.3498974515563305,
1668 "kl": 0.02392578125,
1669 "learning_rate": 8.632183908045977e-07,
1670 "loss": 0.001,
1671 "reward": 1.8256216049194336,
1672 "reward_std": 0.03040180169045925,
1673 "rewards/accuracy_reward": 0.8256216049194336,
1674 "rewards/format_reward": 1.0,
1675 "step": 119
1676 },
1677 {
1678 "clip_ratio": 0.0,
1679 "completion_length": 104.119140625,
1680 "epoch": 1.3793103448275863,
1681 "grad_norm": 2.0523739279685618,
1682 "kl": 0.03564453125,
1683 "learning_rate": 8.620689655172412e-07,
1684 "loss": 0.0015,
1685 "reward": 1.8660404682159424,
1686 "reward_std": 0.02924610674381256,
1687 "rewards/accuracy_reward": 0.8660405874252319,
1688 "rewards/format_reward": 1.0,
1689 "step": 120
1690 },
1691 {
1692 "clip_ratio": 0.0,
1693 "completion_length": 101.34440612792969,
1694 "epoch": 1.3908045977011494,
1695 "grad_norm": 4.105539506193381,
1696 "kl": 0.0361328125,
1697 "learning_rate": 8.60919540229885e-07,
1698 "loss": 0.0015,
1699 "reward": 1.8672317266464233,
1700 "reward_std": 0.0260650422424078,
1701 "rewards/accuracy_reward": 0.8672318458557129,
1702 "rewards/format_reward": 1.0,
1703 "step": 121
1704 },
1705 {
1706 "clip_ratio": 0.0,
1707 "completion_length": 101.32878112792969,
1708 "epoch": 1.4022988505747127,
1709 "grad_norm": 1.7057843588411452,
1710 "kl": 0.0303955078125,
1711 "learning_rate": 8.597701149425287e-07,
1712 "loss": 0.0012,
1713 "reward": 1.8730971813201904,
1714 "reward_std": 0.032900359481573105,
1715 "rewards/accuracy_reward": 0.8737481832504272,
1716 "rewards/format_reward": 0.9993489980697632,
1717 "step": 122
1718 },
1719 {
1720 "clip_ratio": 0.0,
1721 "completion_length": 100.82421875,
1722 "epoch": 1.4137931034482758,
1723 "grad_norm": 2.6319293895634828,
1724 "kl": 0.024658203125,
1725 "learning_rate": 8.586206896551725e-07,
1726 "loss": 0.0011,
1727 "reward": 1.8619554042816162,
1728 "reward_std": 0.025756051763892174,
1729 "rewards/accuracy_reward": 0.8619554042816162,
1730 "rewards/format_reward": 1.0,
1731 "step": 123
1732 },
1733 {
1734 "clip_ratio": 0.0,
1735 "completion_length": 101.46810150146484,
1736 "epoch": 1.4252873563218391,
1737 "grad_norm": 1.8929420183153929,
1738 "kl": 0.0264892578125,
1739 "learning_rate": 8.57471264367816e-07,
1740 "loss": 0.0011,
1741 "reward": 1.8555949926376343,
1742 "reward_std": 0.027035661041736603,
1743 "rewards/accuracy_reward": 0.855595052242279,
1744 "rewards/format_reward": 1.0,
1745 "step": 124
1746 },
1747 {
1748 "clip_ratio": 0.0,
1749 "completion_length": 100.79362487792969,
1750 "epoch": 1.4367816091954024,
1751 "grad_norm": 4.4900781638741645,
1752 "kl": 0.0264892578125,
1753 "learning_rate": 8.563218390804597e-07,
1754 "loss": 0.0011,
1755 "reward": 1.8362255096435547,
1756 "reward_std": 0.0311919953674078,
1757 "rewards/accuracy_reward": 0.836225688457489,
1758 "rewards/format_reward": 1.0,
1759 "step": 125
1760 },
1761 {
1762 "clip_ratio": 0.0,
1763 "completion_length": 102.28190612792969,
1764 "epoch": 1.4482758620689655,
1765 "grad_norm": 2.686615982073592,
1766 "kl": 0.0294189453125,
1767 "learning_rate": 8.551724137931035e-07,
1768 "loss": 0.0012,
1769 "reward": 1.8675731420516968,
1770 "reward_std": 0.02920936420559883,
1771 "rewards/accuracy_reward": 0.8682241439819336,
1772 "rewards/format_reward": 0.9993489980697632,
1773 "step": 126
1774 },
1775 {
1776 "clip_ratio": 0.0,
1777 "completion_length": 101.333984375,
1778 "epoch": 1.4597701149425286,
1779 "grad_norm": 1.4366785062548348,
1780 "kl": 0.0245361328125,
1781 "learning_rate": 8.54022988505747e-07,
1782 "loss": 0.001,
1783 "reward": 1.8725385665893555,
1784 "reward_std": 0.028989439830183983,
1785 "rewards/accuracy_reward": 0.8725385665893555,
1786 "rewards/format_reward": 1.0,
1787 "step": 127
1788 },
1789 {
1790 "clip_ratio": 0.0,
1791 "completion_length": 102.35286712646484,
1792 "epoch": 1.471264367816092,
1793 "grad_norm": 3.153210014316318,
1794 "kl": 0.025146484375,
1795 "learning_rate": 8.528735632183908e-07,
1796 "loss": 0.0011,
1797 "reward": 1.8572996854782104,
1798 "reward_std": 0.02787836454808712,
1799 "rewards/accuracy_reward": 0.8579506874084473,
1800 "rewards/format_reward": 0.9993489980697632,
1801 "step": 128
1802 },
1803 {
1804 "clip_ratio": 0.0,
1805 "completion_length": 101.70768737792969,
1806 "epoch": 1.4827586206896552,
1807 "grad_norm": 1.312276972160894,
1808 "kl": 0.023681640625,
1809 "learning_rate": 8.517241379310345e-07,
1810 "loss": 0.001,
1811 "reward": 1.8692939281463623,
1812 "reward_std": 0.028887853026390076,
1813 "rewards/accuracy_reward": 0.8692940473556519,
1814 "rewards/format_reward": 1.0,
1815 "step": 129
1816 },
1817 {
1818 "clip_ratio": 0.0,
1819 "completion_length": 102.83333587646484,
1820 "epoch": 1.4942528735632183,
1821 "grad_norm": 1.7208602570293514,
1822 "kl": 0.0279541015625,
1823 "learning_rate": 8.50574712643678e-07,
1824 "loss": 0.0012,
1825 "reward": 1.86876380443573,
1826 "reward_std": 0.026345182210206985,
1827 "rewards/accuracy_reward": 0.86876380443573,
1828 "rewards/format_reward": 1.0,
1829 "step": 130
1830 },
1831 {
1832 "clip_ratio": 0.0,
1833 "completion_length": 102.58919525146484,
1834 "epoch": 1.5057471264367817,
1835 "grad_norm": 1.5360698582173107,
1836 "kl": 0.02490234375,
1837 "learning_rate": 8.494252873563218e-07,
1838 "loss": 0.0011,
1839 "reward": 1.8543965816497803,
1840 "reward_std": 0.025941472500562668,
1841 "rewards/accuracy_reward": 0.8543965816497803,
1842 "rewards/format_reward": 1.0,
1843 "step": 131
1844 },
1845 {
1846 "clip_ratio": 0.0,
1847 "completion_length": 104.38607025146484,
1848 "epoch": 1.5172413793103448,
1849 "grad_norm": 1.5741802825985347,
1850 "kl": 0.0244140625,
1851 "learning_rate": 8.482758620689655e-07,
1852 "loss": 0.0011,
1853 "reward": 1.8572925329208374,
1854 "reward_std": 0.02648422122001648,
1855 "rewards/accuracy_reward": 0.8572925329208374,
1856 "rewards/format_reward": 1.0,
1857 "step": 132
1858 },
1859 {
1860 "clip_ratio": 0.0,
1861 "completion_length": 102.529296875,
1862 "epoch": 1.528735632183908,
1863 "grad_norm": 2.5696729376896577,
1864 "kl": 0.02685546875,
1865 "learning_rate": 8.471264367816092e-07,
1866 "loss": 0.0012,
1867 "reward": 1.8575305938720703,
1868 "reward_std": 0.023815011605620384,
1869 "rewards/accuracy_reward": 0.8575305938720703,
1870 "rewards/format_reward": 1.0,
1871 "step": 133
1872 },
1873 {
1874 "clip_ratio": 0.0,
1875 "completion_length": 105.123046875,
1876 "epoch": 1.5402298850574714,
1877 "grad_norm": 1.6144061698095113,
1878 "kl": 0.029052734375,
1879 "learning_rate": 8.459770114942528e-07,
1880 "loss": 0.0013,
1881 "reward": 1.8380780220031738,
1882 "reward_std": 0.02718261629343033,
1883 "rewards/accuracy_reward": 0.8380780220031738,
1884 "rewards/format_reward": 1.0,
1885 "step": 134
1886 },
1887 {
1888 "clip_ratio": 0.0,
1889 "completion_length": 106.32552337646484,
1890 "epoch": 1.5517241379310345,
1891 "grad_norm": 1.4219475281295146,
1892 "kl": 0.0281982421875,
1893 "learning_rate": 8.448275862068965e-07,
1894 "loss": 0.0012,
1895 "reward": 1.8486030101776123,
1896 "reward_std": 0.02719944715499878,
1897 "rewards/accuracy_reward": 0.8492540121078491,
1898 "rewards/format_reward": 0.9993489980697632,
1899 "step": 135
1900 },
1901 {
1902 "clip_ratio": 0.0,
1903 "completion_length": 104.57747650146484,
1904 "epoch": 1.5632183908045976,
1905 "grad_norm": 1.2505748762461004,
1906 "kl": 0.0284423828125,
1907 "learning_rate": 8.436781609195402e-07,
1908 "loss": 0.0012,
1909 "reward": 1.8550881147384644,
1910 "reward_std": 0.024348240345716476,
1911 "rewards/accuracy_reward": 0.8550881147384644,
1912 "rewards/format_reward": 1.0,
1913 "step": 136
1914 },
1915 {
1916 "clip_ratio": 0.0,
1917 "completion_length": 105.931640625,
1918 "epoch": 1.5747126436781609,
1919 "grad_norm": 1.4653956935392716,
1920 "kl": 0.030517578125,
1921 "learning_rate": 8.425287356321838e-07,
1922 "loss": 0.0013,
1923 "reward": 1.8590949773788452,
1924 "reward_std": 0.024702435359358788,
1925 "rewards/accuracy_reward": 0.8597459197044373,
1926 "rewards/format_reward": 0.9993489980697632,
1927 "step": 137
1928 },
1929 {
1930 "clip_ratio": 0.0,
1931 "completion_length": 104.66796875,
1932 "epoch": 1.5862068965517242,
1933 "grad_norm": 1.5116616200702753,
1934 "kl": 0.0257568359375,
1935 "learning_rate": 8.413793103448276e-07,
1936 "loss": 0.0011,
1937 "reward": 1.865983247756958,
1938 "reward_std": 0.02651612088084221,
1939 "rewards/accuracy_reward": 0.8659831881523132,
1940 "rewards/format_reward": 1.0,
1941 "step": 138
1942 },
1943 {
1944 "clip_ratio": 0.0,
1945 "completion_length": 105.427734375,
1946 "epoch": 1.5977011494252875,
1947 "grad_norm": 2.539622540808917,
1948 "kl": 0.026123046875,
1949 "learning_rate": 8.402298850574713e-07,
1950 "loss": 0.0011,
1951 "reward": 1.8506149053573608,
1952 "reward_std": 0.024511417374014854,
1953 "rewards/accuracy_reward": 0.8506147861480713,
1954 "rewards/format_reward": 1.0,
1955 "step": 139
1956 },
1957 {
1958 "clip_ratio": 0.0,
1959 "completion_length": 104.677734375,
1960 "epoch": 1.6091954022988506,
1961 "grad_norm": 2.7029566355060575,
1962 "kl": 0.030029296875,
1963 "learning_rate": 8.390804597701148e-07,
1964 "loss": 0.0013,
1965 "reward": 1.8749054670333862,
1966 "reward_std": 0.022635221481323242,
1967 "rewards/accuracy_reward": 0.8749054670333862,
1968 "rewards/format_reward": 1.0,
1969 "step": 140
1970 },
1971 {
1972 "clip_ratio": 0.0,
1973 "completion_length": 105.28190612792969,
1974 "epoch": 1.6206896551724137,
1975 "grad_norm": 1.6125529702224055,
1976 "kl": 0.0306396484375,
1977 "learning_rate": 8.379310344827586e-07,
1978 "loss": 0.0013,
1979 "reward": 1.8687496185302734,
1980 "reward_std": 0.02769436687231064,
1981 "rewards/accuracy_reward": 0.8687496185302734,
1982 "rewards/format_reward": 1.0,
1983 "step": 141
1984 },
1985 {
1986 "clip_ratio": 0.0,
1987 "completion_length": 105.73177337646484,
1988 "epoch": 1.632183908045977,
1989 "grad_norm": 1.6175458343103775,
1990 "kl": 0.03076171875,
1991 "learning_rate": 8.367816091954023e-07,
1992 "loss": 0.0013,
1993 "reward": 1.8663349151611328,
1994 "reward_std": 0.028380822390317917,
1995 "rewards/accuracy_reward": 0.8663349151611328,
1996 "rewards/format_reward": 1.0,
1997 "step": 142
1998 },
1999 {
2000 "clip_ratio": 0.0,
2001 "completion_length": 106.64128112792969,
2002 "epoch": 1.6436781609195403,
2003 "grad_norm": 1.5215374212801362,
2004 "kl": 0.03515625,
2005 "learning_rate": 8.35632183908046e-07,
2006 "loss": 0.0015,
2007 "reward": 1.850799560546875,
2008 "reward_std": 0.026175182312726974,
2009 "rewards/accuracy_reward": 0.8507994413375854,
2010 "rewards/format_reward": 1.0,
2011 "step": 143
2012 },
2013 {
2014 "clip_ratio": 0.0,
2015 "completion_length": 104.19792175292969,
2016 "epoch": 1.6551724137931034,
2017 "grad_norm": 1.3135082334407917,
2018 "kl": 0.035400390625,
2019 "learning_rate": 8.344827586206896e-07,
2020 "loss": 0.0015,
2021 "reward": 1.8789458274841309,
2022 "reward_std": 0.023477703332901,
2023 "rewards/accuracy_reward": 0.8789458274841309,
2024 "rewards/format_reward": 1.0,
2025 "step": 144
2026 },
2027 {
2028 "clip_ratio": 0.0,
2029 "completion_length": 105.71940612792969,
2030 "epoch": 1.6666666666666665,
2031 "grad_norm": 1.7597191781848593,
2032 "kl": 0.0400390625,
2033 "learning_rate": 8.333333333333333e-07,
2034 "loss": 0.0017,
2035 "reward": 1.862714171409607,
2036 "reward_std": 0.02495882660150528,
2037 "rewards/accuracy_reward": 0.8627142906188965,
2038 "rewards/format_reward": 1.0,
2039 "step": 145
2040 },
2041 {
2042 "clip_ratio": 0.0,
2043 "completion_length": 105.86653900146484,
2044 "epoch": 1.6781609195402298,
2045 "grad_norm": 1.1271572137611656,
2046 "kl": 0.041015625,
2047 "learning_rate": 8.32183908045977e-07,
2048 "loss": 0.0017,
2049 "reward": 1.863875389099121,
2050 "reward_std": 0.022430753335356712,
2051 "rewards/accuracy_reward": 0.8638753890991211,
2052 "rewards/format_reward": 1.0,
2053 "step": 146
2054 },
2055 {
2056 "clip_ratio": 0.0,
2057 "completion_length": 105.78971862792969,
2058 "epoch": 1.6896551724137931,
2059 "grad_norm": 1.7900229849601172,
2060 "kl": 0.0361328125,
2061 "learning_rate": 8.310344827586206e-07,
2062 "loss": 0.0015,
2063 "reward": 1.8638134002685547,
2064 "reward_std": 0.024309411644935608,
2065 "rewards/accuracy_reward": 0.8638134002685547,
2066 "rewards/format_reward": 1.0,
2067 "step": 147
2068 },
2069 {
2070 "clip_ratio": 0.0,
2071 "completion_length": 107.46159362792969,
2072 "epoch": 1.7011494252873565,
2073 "grad_norm": 1.4211028736618636,
2074 "kl": 0.03466796875,
2075 "learning_rate": 8.298850574712643e-07,
2076 "loss": 0.0015,
2077 "reward": 1.8384380340576172,
2078 "reward_std": 0.027933349832892418,
2079 "rewards/accuracy_reward": 0.8390890955924988,
2080 "rewards/format_reward": 0.9993489980697632,
2081 "step": 148
2082 },
2083 {
2084 "clip_ratio": 0.0,
2085 "completion_length": 107.04817962646484,
2086 "epoch": 1.7126436781609196,
2087 "grad_norm": 1.6133215111280255,
2088 "kl": 0.033935546875,
2089 "learning_rate": 8.28735632183908e-07,
2090 "loss": 0.0014,
2091 "reward": 1.8644790649414062,
2092 "reward_std": 0.023334093391895294,
2093 "rewards/accuracy_reward": 0.8644790649414062,
2094 "rewards/format_reward": 1.0,
2095 "step": 149
2096 },
2097 {
2098 "clip_ratio": 0.0,
2099 "completion_length": 104.50130462646484,
2100 "epoch": 1.7241379310344827,
2101 "grad_norm": 1.71716415208747,
2102 "kl": 0.0380859375,
2103 "learning_rate": 8.275862068965517e-07,
2104 "loss": 0.0016,
2105 "reward": 1.863791823387146,
2106 "reward_std": 0.025837119668722153,
2107 "rewards/accuracy_reward": 0.8644427061080933,
2108 "rewards/format_reward": 0.9993489980697632,
2109 "step": 150
2110 },
2111 {
2112 "clip_ratio": 0.0,
2113 "completion_length": 106.55339050292969,
2114 "epoch": 1.735632183908046,
2115 "grad_norm": 1.7799429692812343,
2116 "kl": 0.032470703125,
2117 "learning_rate": 8.264367816091954e-07,
2118 "loss": 0.0014,
2119 "reward": 1.8520851135253906,
2120 "reward_std": 0.02567559853196144,
2121 "rewards/accuracy_reward": 0.8520849943161011,
2122 "rewards/format_reward": 1.0,
2123 "step": 151
2124 },
2125 {
2126 "clip_ratio": 0.0,
2127 "completion_length": 106.16015625,
2128 "epoch": 1.7471264367816093,
2129 "grad_norm": 6.845861661718817,
2130 "kl": 0.033203125,
2131 "learning_rate": 8.25287356321839e-07,
2132 "loss": 0.0014,
2133 "reward": 1.8801301717758179,
2134 "reward_std": 0.02385757677257061,
2135 "rewards/accuracy_reward": 0.8801302909851074,
2136 "rewards/format_reward": 1.0,
2137 "step": 152
2138 },
2139 {
2140 "clip_ratio": 0.0,
2141 "completion_length": 106.34635925292969,
2142 "epoch": 1.7586206896551724,
2143 "grad_norm": 2.31131545485098,
2144 "kl": 0.030029296875,
2145 "learning_rate": 8.241379310344827e-07,
2146 "loss": 0.0013,
2147 "reward": 1.8744109869003296,
2148 "reward_std": 0.02264220081269741,
2149 "rewards/accuracy_reward": 0.8744109869003296,
2150 "rewards/format_reward": 1.0,
2151 "step": 153
2152 },
2153 {
2154 "clip_ratio": 0.0,
2155 "completion_length": 104.56901550292969,
2156 "epoch": 1.7701149425287355,
2157 "grad_norm": 1.773846804268088,
2158 "kl": 0.0302734375,
2159 "learning_rate": 8.229885057471264e-07,
2160 "loss": 0.0013,
2161 "reward": 1.864478349685669,
2162 "reward_std": 0.022709330543875694,
2163 "rewards/accuracy_reward": 0.864478349685669,
2164 "rewards/format_reward": 1.0,
2165 "step": 154
2166 },
2167 {
2168 "clip_ratio": 0.0,
2169 "completion_length": 106.39583587646484,
2170 "epoch": 1.7816091954022988,
2171 "grad_norm": 3.139231735878295,
2172 "kl": 0.03662109375,
2173 "learning_rate": 8.218390804597701e-07,
2174 "loss": 0.0016,
2175 "reward": 1.8694690465927124,
2176 "reward_std": 0.023755362257361412,
2177 "rewards/accuracy_reward": 0.8701201677322388,
2178 "rewards/format_reward": 0.9993489980697632,
2179 "step": 155
2180 },
2181 {
2182 "clip_ratio": 0.0,
2183 "completion_length": 105.47786712646484,
2184 "epoch": 1.793103448275862,
2185 "grad_norm": 1.978341379497861,
2186 "kl": 0.033203125,
2187 "learning_rate": 8.206896551724138e-07,
2188 "loss": 0.0014,
2189 "reward": 1.8873121738433838,
2190 "reward_std": 0.02343663200736046,
2191 "rewards/accuracy_reward": 0.8873120546340942,
2192 "rewards/format_reward": 1.0,
2193 "step": 156
2194 },
2195 {
2196 "clip_ratio": 0.0,
2197 "completion_length": 107.451171875,
2198 "epoch": 1.8045977011494254,
2199 "grad_norm": 1.9961053634302923,
2200 "kl": 0.031494140625,
2201 "learning_rate": 8.195402298850574e-07,
2202 "loss": 0.0013,
2203 "reward": 1.8565791845321655,
2204 "reward_std": 0.02433241717517376,
2205 "rewards/accuracy_reward": 0.8565791845321655,
2206 "rewards/format_reward": 1.0,
2207 "step": 157
2208 },
2209 {
2210 "clip_ratio": 0.0,
2211 "completion_length": 108.66015625,
2212 "epoch": 1.8160919540229885,
2213 "grad_norm": 1.4619162689448402,
2214 "kl": 0.0279541015625,
2215 "learning_rate": 8.183908045977011e-07,
2216 "loss": 0.0012,
2217 "reward": 1.893612265586853,
2218 "reward_std": 0.020092740654945374,
2219 "rewards/accuracy_reward": 0.893612265586853,
2220 "rewards/format_reward": 1.0,
2221 "step": 158
2222 },
2223 {
2224 "clip_ratio": 0.0,
2225 "completion_length": 108.15299987792969,
2226 "epoch": 1.8275862068965516,
2227 "grad_norm": 1.3651687933062773,
2228 "kl": 0.0283203125,
2229 "learning_rate": 8.172413793103448e-07,
2230 "loss": 0.0012,
2231 "reward": 1.8662446737289429,
2232 "reward_std": 0.02392234466969967,
2233 "rewards/accuracy_reward": 0.8662446737289429,
2234 "rewards/format_reward": 1.0,
2235 "step": 159
2236 },
2237 {
2238 "clip_ratio": 0.0,
2239 "completion_length": 111.61458587646484,
2240 "epoch": 1.839080459770115,
2241 "grad_norm": 1.2135275799512748,
2242 "kl": 0.0262451171875,
2243 "learning_rate": 8.160919540229885e-07,
2244 "loss": 0.0011,
2245 "reward": 1.8714094161987305,
2246 "reward_std": 0.024804111570119858,
2247 "rewards/accuracy_reward": 0.8714094161987305,
2248 "rewards/format_reward": 1.0,
2249 "step": 160
2250 },
2251 {
2252 "clip_ratio": 0.0,
2253 "completion_length": 110.740234375,
2254 "epoch": 1.8505747126436782,
2255 "grad_norm": 1.8554584998042072,
2256 "kl": 0.03125,
2257 "learning_rate": 8.149425287356322e-07,
2258 "loss": 0.0013,
2259 "reward": 1.885859489440918,
2260 "reward_std": 0.0239472147077322,
2261 "rewards/accuracy_reward": 0.8858596682548523,
2262 "rewards/format_reward": 1.0,
2263 "step": 161
2264 },
2265 {
2266 "clip_ratio": 0.0,
2267 "completion_length": 112.990234375,
2268 "epoch": 1.8620689655172413,
2269 "grad_norm": 2.7992855237483445,
2270 "kl": 0.027587890625,
2271 "learning_rate": 8.137931034482758e-07,
2272 "loss": 0.0012,
2273 "reward": 1.8619565963745117,
2274 "reward_std": 0.025602132081985474,
2275 "rewards/accuracy_reward": 0.8619565963745117,
2276 "rewards/format_reward": 1.0,
2277 "step": 162
2278 },
2279 {
2280 "clip_ratio": 0.0,
2281 "completion_length": 114.46810150146484,
2282 "epoch": 1.8735632183908046,
2283 "grad_norm": 1.153159461213296,
2284 "kl": 0.0291748046875,
2285 "learning_rate": 8.126436781609195e-07,
2286 "loss": 0.0012,
2287 "reward": 1.8758127689361572,
2288 "reward_std": 0.02637392282485962,
2289 "rewards/accuracy_reward": 0.876463770866394,
2290 "rewards/format_reward": 0.9993489980697632,
2291 "step": 163
2292 },
2293 {
2294 "clip_ratio": 0.0,
2295 "completion_length": 115.09440612792969,
2296 "epoch": 1.8850574712643677,
2297 "grad_norm": 1.8638466478476596,
2298 "kl": 0.031005859375,
2299 "learning_rate": 8.114942528735632e-07,
2300 "loss": 0.0013,
2301 "reward": 1.8715770244598389,
2302 "reward_std": 0.02399243414402008,
2303 "rewards/accuracy_reward": 0.8715770244598389,
2304 "rewards/format_reward": 1.0,
2305 "step": 164
2306 },
2307 {
2308 "clip_ratio": 0.0,
2309 "completion_length": 116.47265625,
2310 "epoch": 1.896551724137931,
2311 "grad_norm": 1.1827867594913961,
2312 "kl": 0.0291748046875,
2313 "learning_rate": 8.103448275862068e-07,
2314 "loss": 0.0012,
2315 "reward": 1.8500595092773438,
2316 "reward_std": 0.030623754486441612,
2317 "rewards/accuracy_reward": 0.8520126342773438,
2318 "rewards/format_reward": 0.998046875,
2319 "step": 165
2320 },
2321 {
2322 "clip_ratio": 0.0,
2323 "completion_length": 116.31510925292969,
2324 "epoch": 1.9080459770114944,
2325 "grad_norm": 3.8800308289928274,
2326 "kl": 0.037353515625,
2327 "learning_rate": 8.091954022988506e-07,
2328 "loss": 0.0016,
2329 "reward": 1.8412752151489258,
2330 "reward_std": 0.02564075216650963,
2331 "rewards/accuracy_reward": 0.8412752151489258,
2332 "rewards/format_reward": 1.0,
2333 "step": 166
2334 },
2335 {
2336 "clip_ratio": 0.0,
2337 "completion_length": 113.9921875,
2338 "epoch": 1.9195402298850575,
2339 "grad_norm": 1.7793621376166886,
2340 "kl": 0.0322265625,
2341 "learning_rate": 8.080459770114942e-07,
2342 "loss": 0.0014,
2343 "reward": 1.8666815757751465,
2344 "reward_std": 0.01957223378121853,
2345 "rewards/accuracy_reward": 0.8666815757751465,
2346 "rewards/format_reward": 1.0,
2347 "step": 167
2348 },
2349 {
2350 "clip_ratio": 0.0,
2351 "completion_length": 116.001953125,
2352 "epoch": 1.9310344827586206,
2353 "grad_norm": 3.366934016860489,
2354 "kl": 0.0306396484375,
2355 "learning_rate": 8.068965517241378e-07,
2356 "loss": 0.0013,
2357 "reward": 1.8858134746551514,
2358 "reward_std": 0.021044649183750153,
2359 "rewards/accuracy_reward": 0.8858134150505066,
2360 "rewards/format_reward": 1.0,
2361 "step": 168
2362 },
2363 {
2364 "clip_ratio": 0.0,
2365 "completion_length": 114.40495300292969,
2366 "epoch": 1.9425287356321839,
2367 "grad_norm": 1.4480350054291504,
2368 "kl": 0.06689453125,
2369 "learning_rate": 8.057471264367816e-07,
2370 "loss": 0.0028,
2371 "reward": 1.888282299041748,
2372 "reward_std": 0.025501668453216553,
2373 "rewards/accuracy_reward": 0.8889333605766296,
2374 "rewards/format_reward": 0.9993489980697632,
2375 "step": 169
2376 },
2377 {
2378 "clip_ratio": 0.0,
2379 "completion_length": 112.05339050292969,
2380 "epoch": 1.9540229885057472,
2381 "grad_norm": 1.9158676691938177,
2382 "kl": 0.0322265625,
2383 "learning_rate": 8.045977011494253e-07,
2384 "loss": 0.0014,
2385 "reward": 1.8491878509521484,
2386 "reward_std": 0.023884495720267296,
2387 "rewards/accuracy_reward": 0.8491878509521484,
2388 "rewards/format_reward": 1.0,
2389 "step": 170
2390 },
2391 {
2392 "clip_ratio": 0.0,
2393 "completion_length": 111.62760925292969,
2394 "epoch": 1.9655172413793105,
2395 "grad_norm": 2.780653624143398,
2396 "kl": 0.033203125,
2397 "learning_rate": 8.03448275862069e-07,
2398 "loss": 0.0014,
2399 "reward": 1.887178897857666,
2400 "reward_std": 0.02224629744887352,
2401 "rewards/accuracy_reward": 0.887178897857666,
2402 "rewards/format_reward": 1.0,
2403 "step": 171
2404 },
2405 {
2406 "clip_ratio": 0.0,
2407 "completion_length": 110.57096862792969,
2408 "epoch": 1.9770114942528736,
2409 "grad_norm": 1.9318099805591311,
2410 "kl": 0.031494140625,
2411 "learning_rate": 8.022988505747126e-07,
2412 "loss": 0.0013,
2413 "reward": 1.8778488636016846,
2414 "reward_std": 0.024461418390274048,
2415 "rewards/accuracy_reward": 0.8784998655319214,
2416 "rewards/format_reward": 0.9993489980697632,
2417 "step": 172
2418 },
2419 {
2420 "clip_ratio": 0.0,
2421 "completion_length": 108.85612487792969,
2422 "epoch": 1.9885057471264367,
2423 "grad_norm": 1.8812732346859014,
2424 "kl": 0.031982421875,
2425 "learning_rate": 8.011494252873563e-07,
2426 "loss": 0.0013,
2427 "reward": 1.896362543106079,
2428 "reward_std": 0.021257508546113968,
2429 "rewards/accuracy_reward": 0.8963624238967896,
2430 "rewards/format_reward": 1.0,
2431 "step": 173
2432 },
2433 {
2434 "clip_ratio": 0.0,
2435 "completion_length": 103.70365142822266,
2436 "epoch": 2.0,
2437 "grad_norm": 4.371328763954834,
2438 "kl": 0.033935546875,
2439 "learning_rate": 8e-07,
2440 "loss": 0.0014,
2441 "reward": 1.8666094541549683,
2442 "reward_std": 0.023954574018716812,
2443 "rewards/accuracy_reward": 0.8680139780044556,
2444 "rewards/format_reward": 0.9985955357551575,
2445 "step": 174
2446 },
2447 {
2448 "clip_ratio": 0.0,
2449 "completion_length": 109.845703125,
2450 "epoch": 2.0114942528735633,
2451 "grad_norm": 3.642725753887041,
2452 "kl": 0.028564453125,
2453 "learning_rate": 7.988505747126436e-07,
2454 "loss": 0.0012,
2455 "reward": 1.8493850231170654,
2456 "reward_std": 0.021547168493270874,
2457 "rewards/accuracy_reward": 0.8493850231170654,
2458 "rewards/format_reward": 1.0,
2459 "step": 175
2460 },
2461 {
2462 "clip_ratio": 0.0,
2463 "completion_length": 107.64518737792969,
2464 "epoch": 2.0229885057471266,
2465 "grad_norm": 1.813754894659516,
2466 "kl": 0.0308837890625,
2467 "learning_rate": 7.977011494252873e-07,
2468 "loss": 0.0013,
2469 "reward": 1.8406703472137451,
2470 "reward_std": 0.022422365844249725,
2471 "rewards/accuracy_reward": 0.8406702876091003,
2472 "rewards/format_reward": 1.0,
2473 "step": 176
2474 },
2475 {
2476 "clip_ratio": 0.0,
2477 "completion_length": 106.38021087646484,
2478 "epoch": 2.0344827586206895,
2479 "grad_norm": 1.1813724927973488,
2480 "kl": 0.030029296875,
2481 "learning_rate": 7.965517241379311e-07,
2482 "loss": 0.0013,
2483 "reward": 1.8357830047607422,
2484 "reward_std": 0.02784503623843193,
2485 "rewards/accuracy_reward": 0.8370852470397949,
2486 "rewards/format_reward": 0.9986979365348816,
2487 "step": 177
2488 },
2489 {
2490 "clip_ratio": 0.0,
2491 "completion_length": 104.87890625,
2492 "epoch": 2.045977011494253,
2493 "grad_norm": 1.8350693828851994,
2494 "kl": 0.032470703125,
2495 "learning_rate": 7.954022988505746e-07,
2496 "loss": 0.0014,
2497 "reward": 1.8329381942749023,
2498 "reward_std": 0.02206650748848915,
2499 "rewards/accuracy_reward": 0.8329381942749023,
2500 "rewards/format_reward": 1.0,
2501 "step": 178
2502 },
2503 {
2504 "clip_ratio": 0.0,
2505 "completion_length": 102.98112487792969,
2506 "epoch": 2.057471264367816,
2507 "grad_norm": 6.041427121899008,
2508 "kl": 0.03662109375,
2509 "learning_rate": 7.942528735632184e-07,
2510 "loss": 0.0016,
2511 "reward": 1.8575410842895508,
2512 "reward_std": 0.022278612479567528,
2513 "rewards/accuracy_reward": 0.8575412034988403,
2514 "rewards/format_reward": 1.0,
2515 "step": 179
2516 },
2517 {
2518 "clip_ratio": 0.0,
2519 "completion_length": 104.130859375,
2520 "epoch": 2.0689655172413794,
2521 "grad_norm": 1.3127737451001564,
2522 "kl": 0.037109375,
2523 "learning_rate": 7.931034482758621e-07,
2524 "loss": 0.0016,
2525 "reward": 1.8538506031036377,
2526 "reward_std": 0.026692640036344528,
2527 "rewards/accuracy_reward": 0.8551527261734009,
2528 "rewards/format_reward": 0.9986979365348816,
2529 "step": 180
2530 },
2531 {
2532 "clip_ratio": 0.0,
2533 "completion_length": 102.36849212646484,
2534 "epoch": 2.0804597701149423,
2535 "grad_norm": 2.4756435889269186,
2536 "kl": 0.03759765625,
2537 "learning_rate": 7.919540229885056e-07,
2538 "loss": 0.0016,
2539 "reward": 1.867905616760254,
2540 "reward_std": 0.023630604147911072,
2541 "rewards/accuracy_reward": 0.8685566782951355,
2542 "rewards/format_reward": 0.9993489980697632,
2543 "step": 181
2544 },
2545 {
2546 "clip_ratio": 0.0,
2547 "completion_length": 101.97005462646484,
2548 "epoch": 2.0919540229885056,
2549 "grad_norm": 2.0378313336318823,
2550 "kl": 0.039794921875,
2551 "learning_rate": 7.908045977011494e-07,
2552 "loss": 0.0017,
2553 "reward": 1.8557872772216797,
2554 "reward_std": 0.024489006027579308,
2555 "rewards/accuracy_reward": 0.856438159942627,
2556 "rewards/format_reward": 0.9993489980697632,
2557 "step": 182
2558 },
2559 {
2560 "clip_ratio": 0.0,
2561 "completion_length": 102.88802337646484,
2562 "epoch": 2.103448275862069,
2563 "grad_norm": 1.9136054549569206,
2564 "kl": 0.042724609375,
2565 "learning_rate": 7.896551724137931e-07,
2566 "loss": 0.0018,
2567 "reward": 1.8639075756072998,
2568 "reward_std": 0.025589074939489365,
2569 "rewards/accuracy_reward": 0.865860641002655,
2570 "rewards/format_reward": 0.998046875,
2571 "step": 183
2572 },
2573 {
2574 "clip_ratio": 0.0,
2575 "completion_length": 102.36653900146484,
2576 "epoch": 2.1149425287356323,
2577 "grad_norm": 2.633698077075855,
2578 "kl": 0.038330078125,
2579 "learning_rate": 7.885057471264366e-07,
2580 "loss": 0.0016,
2581 "reward": 1.8632612228393555,
2582 "reward_std": 0.030802268534898758,
2583 "rewards/accuracy_reward": 0.8671674728393555,
2584 "rewards/format_reward": 0.99609375,
2585 "step": 184
2586 },
2587 {
2588 "clip_ratio": 0.0,
2589 "completion_length": 103.41796875,
2590 "epoch": 2.1264367816091956,
2591 "grad_norm": 2.407318367905216,
2592 "kl": 0.03662109375,
2593 "learning_rate": 7.873563218390804e-07,
2594 "loss": 0.0015,
2595 "reward": 1.8410747051239014,
2596 "reward_std": 0.02962721884250641,
2597 "rewards/accuracy_reward": 0.844329833984375,
2598 "rewards/format_reward": 0.9967448115348816,
2599 "step": 185
2600 },
2601 {
2602 "clip_ratio": 0.0,
2603 "completion_length": 105.22591400146484,
2604 "epoch": 2.1379310344827585,
2605 "grad_norm": 1.506301090880529,
2606 "kl": 0.033935546875,
2607 "learning_rate": 7.862068965517241e-07,
2608 "loss": 0.0014,
2609 "reward": 1.8682615756988525,
2610 "reward_std": 0.03243795782327652,
2611 "rewards/accuracy_reward": 0.8734698295593262,
2612 "rewards/format_reward": 0.9947916865348816,
2613 "step": 186
2614 },
2615 {
2616 "clip_ratio": 0.0,
2617 "completion_length": 105.826171875,
2618 "epoch": 2.1494252873563218,
2619 "grad_norm": 3.409263140415921,
2620 "kl": 0.03662109375,
2621 "learning_rate": 7.850574712643679e-07,
2622 "loss": 0.0015,
2623 "reward": 1.875942587852478,
2624 "reward_std": 0.03060537576675415,
2625 "rewards/accuracy_reward": 0.8791979551315308,
2626 "rewards/format_reward": 0.9967448115348816,
2627 "step": 187
2628 },
2629 {
2630 "clip_ratio": 0.0,
2631 "completion_length": 104.181640625,
2632 "epoch": 2.160919540229885,
2633 "grad_norm": 3.163868782667335,
2634 "kl": 0.031005859375,
2635 "learning_rate": 7.839080459770114e-07,
2636 "loss": 0.0013,
2637 "reward": 1.8670661449432373,
2638 "reward_std": 0.03131501376628876,
2639 "rewards/accuracy_reward": 0.8703212738037109,
2640 "rewards/format_reward": 0.9967448115348816,
2641 "step": 188
2642 },
2643 {
2644 "clip_ratio": 0.0,
2645 "completion_length": 104.701171875,
2646 "epoch": 2.1724137931034484,
2647 "grad_norm": 2.26227947471076,
2648 "kl": 0.0299072265625,
2649 "learning_rate": 7.827586206896552e-07,
2650 "loss": 0.0012,
2651 "reward": 1.8607233762741089,
2652 "reward_std": 0.030661556869745255,
2653 "rewards/accuracy_reward": 0.8639785647392273,
2654 "rewards/format_reward": 0.9967448115348816,
2655 "step": 189
2656 },
2657 {
2658 "clip_ratio": 0.0,
2659 "completion_length": 105.59505462646484,
2660 "epoch": 2.1839080459770113,
2661 "grad_norm": 2.1434523868036885,
2662 "kl": 0.0308837890625,
2663 "learning_rate": 7.816091954022989e-07,
2664 "loss": 0.0013,
2665 "reward": 1.8534635305404663,
2666 "reward_std": 0.03587997704744339,
2667 "rewards/accuracy_reward": 0.8580207824707031,
2668 "rewards/format_reward": 0.9954427480697632,
2669 "step": 190
2670 },
2671 {
2672 "clip_ratio": 0.0,
2673 "completion_length": 106.111328125,
2674 "epoch": 2.1954022988505746,
2675 "grad_norm": 1.8017139066303491,
2676 "kl": 0.034423828125,
2677 "learning_rate": 7.804597701149424e-07,
2678 "loss": 0.0015,
2679 "reward": 1.8646326065063477,
2680 "reward_std": 0.031285952776670456,
2681 "rewards/accuracy_reward": 0.8672367930412292,
2682 "rewards/format_reward": 0.9973958730697632,
2683 "step": 191
2684 },
2685 {
2686 "clip_ratio": 0.0,
2687 "completion_length": 103.44921875,
2688 "epoch": 2.206896551724138,
2689 "grad_norm": 1.4767880401798514,
2690 "kl": 0.0302734375,
2691 "learning_rate": 7.793103448275862e-07,
2692 "loss": 0.0013,
2693 "reward": 1.8693158626556396,
2694 "reward_std": 0.03322295472025871,
2695 "rewards/accuracy_reward": 0.8725711107254028,
2696 "rewards/format_reward": 0.9967448115348816,
2697 "step": 192
2698 },
2699 {
2700 "clip_ratio": 0.0,
2701 "completion_length": 104.24674987792969,
2702 "epoch": 2.218390804597701,
2703 "grad_norm": 1.1854249053762298,
2704 "kl": 0.0294189453125,
2705 "learning_rate": 7.781609195402299e-07,
2706 "loss": 0.0013,
2707 "reward": 1.8650622367858887,
2708 "reward_std": 0.03180694580078125,
2709 "rewards/accuracy_reward": 0.867666482925415,
2710 "rewards/format_reward": 0.9973958730697632,
2711 "step": 193
2712 },
2713 {
2714 "clip_ratio": 0.0,
2715 "completion_length": 104.73372650146484,
2716 "epoch": 2.2298850574712645,
2717 "grad_norm": 1.1139391781844512,
2718 "kl": 0.033447265625,
2719 "learning_rate": 7.770114942528734e-07,
2720 "loss": 0.0014,
2721 "reward": 1.87180757522583,
2722 "reward_std": 0.03504088148474693,
2723 "rewards/accuracy_reward": 0.8757138252258301,
2724 "rewards/format_reward": 0.99609375,
2725 "step": 194
2726 },
2727 {
2728 "clip_ratio": 0.0,
2729 "completion_length": 104.34245300292969,
2730 "epoch": 2.2413793103448274,
2731 "grad_norm": 1.6586210582781218,
2732 "kl": 0.032470703125,
2733 "learning_rate": 7.758620689655172e-07,
2734 "loss": 0.0014,
2735 "reward": 1.8676183223724365,
2736 "reward_std": 0.028974760323762894,
2737 "rewards/accuracy_reward": 0.8689204454421997,
2738 "rewards/format_reward": 0.9986979365348816,
2739 "step": 195
2740 },
2741 {
2742 "clip_ratio": 0.0,
2743 "completion_length": 104.36653900146484,
2744 "epoch": 2.2528735632183907,
2745 "grad_norm": 1.1693377970360763,
2746 "kl": 0.0361328125,
2747 "learning_rate": 7.747126436781609e-07,
2748 "loss": 0.0015,
2749 "reward": 1.8650755882263184,
2750 "reward_std": 0.03180007264018059,
2751 "rewards/accuracy_reward": 0.8676798343658447,
2752 "rewards/format_reward": 0.9973958730697632,
2753 "step": 196
2754 },
2755 {
2756 "clip_ratio": 0.0,
2757 "completion_length": 104.658203125,
2758 "epoch": 2.264367816091954,
2759 "grad_norm": 5.251694511136823,
2760 "kl": 0.032470703125,
2761 "learning_rate": 7.735632183908046e-07,
2762 "loss": 0.0014,
2763 "reward": 1.8851394653320312,
2764 "reward_std": 0.028069045394659042,
2765 "rewards/accuracy_reward": 0.8864415884017944,
2766 "rewards/format_reward": 0.9986979365348816,
2767 "step": 197
2768 },
2769 {
2770 "clip_ratio": 0.0,
2771 "completion_length": 105.24674987792969,
2772 "epoch": 2.2758620689655173,
2773 "grad_norm": 1.3160075358259606,
2774 "kl": 0.0306396484375,
2775 "learning_rate": 7.724137931034482e-07,
2776 "loss": 0.0013,
2777 "reward": 1.871217966079712,
2778 "reward_std": 0.02678101509809494,
2779 "rewards/accuracy_reward": 0.8718689680099487,
2780 "rewards/format_reward": 0.9993489980697632,
2781 "step": 198
2782 },
2783 {
2784 "clip_ratio": 0.0,
2785 "completion_length": 107.22396087646484,
2786 "epoch": 2.2873563218390807,
2787 "grad_norm": 1.7363926773399032,
2788 "kl": 0.0322265625,
2789 "learning_rate": 7.712643678160919e-07,
2790 "loss": 0.0014,
2791 "reward": 1.8681142330169678,
2792 "reward_std": 0.025049438700079918,
2793 "rewards/accuracy_reward": 0.8687652349472046,
2794 "rewards/format_reward": 0.9993489980697632,
2795 "step": 199
2796 },
2797 {
2798 "clip_ratio": 0.0,
2799 "completion_length": 106.36328125,
2800 "epoch": 2.2988505747126435,
2801 "grad_norm": 1.6358927960776593,
2802 "kl": 0.032470703125,
2803 "learning_rate": 7.701149425287356e-07,
2804 "loss": 0.0014,
2805 "reward": 1.8802045583724976,
2806 "reward_std": 0.025107329711318016,
2807 "rewards/accuracy_reward": 0.8808557391166687,
2808 "rewards/format_reward": 0.9993489980697632,
2809 "step": 200
2810 },
2811 {
2812 "clip_ratio": 0.0,
2813 "completion_length": 107.34049987792969,
2814 "epoch": 2.310344827586207,
2815 "grad_norm": 1.9769472874812075,
2816 "kl": 0.034912109375,
2817 "learning_rate": 7.689655172413792e-07,
2818 "loss": 0.0015,
2819 "reward": 1.8836421966552734,
2820 "reward_std": 0.023718908429145813,
2821 "rewards/accuracy_reward": 0.8842934370040894,
2822 "rewards/format_reward": 0.9993489980697632,
2823 "step": 201
2824 },
2825 {
2826 "clip_ratio": 0.0,
2827 "completion_length": 105.58203125,
2828 "epoch": 2.32183908045977,
2829 "grad_norm": 1.5235580660765125,
2830 "kl": 0.04150390625,
2831 "learning_rate": 7.67816091954023e-07,
2832 "loss": 0.0017,
2833 "reward": 1.872357726097107,
2834 "reward_std": 0.027882186695933342,
2835 "rewards/accuracy_reward": 0.8736597895622253,
2836 "rewards/format_reward": 0.9986979365348816,
2837 "step": 202
2838 },
2839 {
2840 "clip_ratio": 0.0,
2841 "completion_length": 107.212890625,
2842 "epoch": 2.3333333333333335,
2843 "grad_norm": 1.6195272681997732,
2844 "kl": 0.03515625,
2845 "learning_rate": 7.666666666666667e-07,
2846 "loss": 0.0015,
2847 "reward": 1.864328145980835,
2848 "reward_std": 0.023437950760126114,
2849 "rewards/accuracy_reward": 0.8649791479110718,
2850 "rewards/format_reward": 0.9993489980697632,
2851 "step": 203
2852 },
2853 {
2854 "clip_ratio": 0.0,
2855 "completion_length": 107.525390625,
2856 "epoch": 2.344827586206897,
2857 "grad_norm": 1.8341465818857292,
2858 "kl": 0.038818359375,
2859 "learning_rate": 7.655172413793102e-07,
2860 "loss": 0.0016,
2861 "reward": 1.8627145290374756,
2862 "reward_std": 0.022373493760824203,
2863 "rewards/accuracy_reward": 0.8627144694328308,
2864 "rewards/format_reward": 1.0,
2865 "step": 204
2866 },
2867 {
2868 "clip_ratio": 0.0,
2869 "completion_length": 106.689453125,
2870 "epoch": 2.3563218390804597,
2871 "grad_norm": 1.1804395489400663,
2872 "kl": 0.0390625,
2873 "learning_rate": 7.64367816091954e-07,
2874 "loss": 0.0016,
2875 "reward": 1.8903453350067139,
2876 "reward_std": 0.02181018702685833,
2877 "rewards/accuracy_reward": 0.8903453350067139,
2878 "rewards/format_reward": 1.0,
2879 "step": 205
2880 },
2881 {
2882 "clip_ratio": 0.0,
2883 "completion_length": 106.21875,
2884 "epoch": 2.367816091954023,
2885 "grad_norm": 1.54557267471986,
2886 "kl": 0.03955078125,
2887 "learning_rate": 7.632183908045977e-07,
2888 "loss": 0.0016,
2889 "reward": 1.8669939041137695,
2890 "reward_std": 0.02308422140777111,
2891 "rewards/accuracy_reward": 0.8669940829277039,
2892 "rewards/format_reward": 1.0,
2893 "step": 206
2894 },
2895 {
2896 "clip_ratio": 0.0,
2897 "completion_length": 108.130859375,
2898 "epoch": 2.3793103448275863,
2899 "grad_norm": 1.6770832687532953,
2900 "kl": 0.041259765625,
2901 "learning_rate": 7.620689655172414e-07,
2902 "loss": 0.0017,
2903 "reward": 1.8921034336090088,
2904 "reward_std": 0.024557670578360558,
2905 "rewards/accuracy_reward": 0.8934054374694824,
2906 "rewards/format_reward": 0.9986979365348816,
2907 "step": 207
2908 },
2909 {
2910 "clip_ratio": 0.0,
2911 "completion_length": 107.56510925292969,
2912 "epoch": 2.3908045977011496,
2913 "grad_norm": 1.53505457334918,
2914 "kl": 0.04150390625,
2915 "learning_rate": 7.60919540229885e-07,
2916 "loss": 0.0017,
2917 "reward": 1.8635462522506714,
2918 "reward_std": 0.028757482767105103,
2919 "rewards/accuracy_reward": 0.8635462522506714,
2920 "rewards/format_reward": 1.0,
2921 "step": 208
2922 },
2923 {
2924 "clip_ratio": 0.0,
2925 "completion_length": 108.521484375,
2926 "epoch": 2.4022988505747125,
2927 "grad_norm": 1.5279469918158477,
2928 "kl": 0.0458984375,
2929 "learning_rate": 7.597701149425287e-07,
2930 "loss": 0.0019,
2931 "reward": 1.8711223602294922,
2932 "reward_std": 0.022866621613502502,
2933 "rewards/accuracy_reward": 0.8711225390434265,
2934 "rewards/format_reward": 1.0,
2935 "step": 209
2936 },
2937 {
2938 "clip_ratio": 0.0,
2939 "completion_length": 108.22135925292969,
2940 "epoch": 2.413793103448276,
2941 "grad_norm": 1.9431939650471932,
2942 "kl": 0.04296875,
2943 "learning_rate": 7.586206896551724e-07,
2944 "loss": 0.0018,
2945 "reward": 1.8868978023529053,
2946 "reward_std": 0.02192886732518673,
2947 "rewards/accuracy_reward": 0.8868978023529053,
2948 "rewards/format_reward": 1.0,
2949 "step": 210
2950 },
2951 {
2952 "clip_ratio": 0.0,
2953 "completion_length": 107.751953125,
2954 "epoch": 2.425287356321839,
2955 "grad_norm": 1.499449740827804,
2956 "kl": 0.042236328125,
2957 "learning_rate": 7.57471264367816e-07,
2958 "loss": 0.0017,
2959 "reward": 1.865365743637085,
2960 "reward_std": 0.025590822100639343,
2961 "rewards/accuracy_reward": 0.8660167455673218,
2962 "rewards/format_reward": 0.9993489980697632,
2963 "step": 211
2964 },
2965 {
2966 "clip_ratio": 0.0,
2967 "completion_length": 105.19075775146484,
2968 "epoch": 2.4367816091954024,
2969 "grad_norm": 2.0546219049276684,
2970 "kl": 0.042724609375,
2971 "learning_rate": 7.563218390804598e-07,
2972 "loss": 0.0017,
2973 "reward": 1.8754686117172241,
2974 "reward_std": 0.02179308794438839,
2975 "rewards/accuracy_reward": 0.8754686713218689,
2976 "rewards/format_reward": 1.0,
2977 "step": 212
2978 },
2979 {
2980 "clip_ratio": 0.0,
2981 "completion_length": 105.09700775146484,
2982 "epoch": 2.4482758620689653,
2983 "grad_norm": 1.7497621009886284,
2984 "kl": 0.0361328125,
2985 "learning_rate": 7.551724137931034e-07,
2986 "loss": 0.0015,
2987 "reward": 1.8816776275634766,
2988 "reward_std": 0.023087939247488976,
2989 "rewards/accuracy_reward": 0.8816776275634766,
2990 "rewards/format_reward": 1.0,
2991 "step": 213
2992 },
2993 {
2994 "clip_ratio": 0.0,
2995 "completion_length": 103.85612487792969,
2996 "epoch": 2.4597701149425286,
2997 "grad_norm": 1.6521998481125082,
2998 "kl": 0.037109375,
2999 "learning_rate": 7.540229885057471e-07,
3000 "loss": 0.0016,
3001 "reward": 1.8775031566619873,
3002 "reward_std": 0.022142350673675537,
3003 "rewards/accuracy_reward": 0.8781541585922241,
3004 "rewards/format_reward": 0.9993489980697632,
3005 "step": 214
3006 },
3007 {
3008 "clip_ratio": 0.0,
3009 "completion_length": 101.697265625,
3010 "epoch": 2.471264367816092,
3011 "grad_norm": 1.4603832529701488,
3012 "kl": 0.037353515625,
3013 "learning_rate": 7.528735632183908e-07,
3014 "loss": 0.0015,
3015 "reward": 1.8763501644134521,
3016 "reward_std": 0.02745872735977173,
3017 "rewards/accuracy_reward": 0.8770012259483337,
3018 "rewards/format_reward": 0.9993489980697632,
3019 "step": 215
3020 },
3021 {
3022 "clip_ratio": 0.0,
3023 "completion_length": 100.99284362792969,
3024 "epoch": 2.4827586206896552,
3025 "grad_norm": 1.4199974173722163,
3026 "kl": 0.033447265625,
3027 "learning_rate": 7.517241379310344e-07,
3028 "loss": 0.0014,
3029 "reward": 1.879393458366394,
3030 "reward_std": 0.023880278691649437,
3031 "rewards/accuracy_reward": 0.8800445795059204,
3032 "rewards/format_reward": 0.9993489980697632,
3033 "step": 216
3034 },
3035 {
3036 "clip_ratio": 0.0,
3037 "completion_length": 99.63216400146484,
3038 "epoch": 2.4942528735632186,
3039 "grad_norm": 1.4805312918485005,
3040 "kl": 0.034423828125,
3041 "learning_rate": 7.505747126436781e-07,
3042 "loss": 0.0014,
3043 "reward": 1.8871581554412842,
3044 "reward_std": 0.023221522569656372,
3045 "rewards/accuracy_reward": 0.8871581554412842,
3046 "rewards/format_reward": 1.0,
3047 "step": 217
3048 },
3049 {
3050 "clip_ratio": 0.0,
3051 "completion_length": 101.69010925292969,
3052 "epoch": 2.5057471264367814,
3053 "grad_norm": 1.5882030215776515,
3054 "kl": 0.031982421875,
3055 "learning_rate": 7.494252873563218e-07,
3056 "loss": 0.0014,
3057 "reward": 1.8449172973632812,
3058 "reward_std": 0.0257129929959774,
3059 "rewards/accuracy_reward": 0.8449174761772156,
3060 "rewards/format_reward": 1.0,
3061 "step": 218
3062 },
3063 {
3064 "clip_ratio": 0.0,
3065 "completion_length": 102.49089050292969,
3066 "epoch": 2.5172413793103448,
3067 "grad_norm": 3.140150954079358,
3068 "kl": 0.0322265625,
3069 "learning_rate": 7.482758620689655e-07,
3070 "loss": 0.0014,
3071 "reward": 1.8943170309066772,
3072 "reward_std": 0.023106535896658897,
3073 "rewards/accuracy_reward": 0.8949680328369141,
3074 "rewards/format_reward": 0.9993489980697632,
3075 "step": 219
3076 },
3077 {
3078 "clip_ratio": 0.0,
3079 "completion_length": 99.779296875,
3080 "epoch": 2.528735632183908,
3081 "grad_norm": 1.8149238607460914,
3082 "kl": 0.032958984375,
3083 "learning_rate": 7.471264367816092e-07,
3084 "loss": 0.0014,
3085 "reward": 1.8684821128845215,
3086 "reward_std": 0.023636629804968834,
3087 "rewards/accuracy_reward": 0.8684821128845215,
3088 "rewards/format_reward": 1.0,
3089 "step": 220
3090 },
3091 {
3092 "clip_ratio": 0.0,
3093 "completion_length": 102.23503112792969,
3094 "epoch": 2.5402298850574714,
3095 "grad_norm": 2.0933238300012715,
3096 "kl": 0.048095703125,
3097 "learning_rate": 7.459770114942528e-07,
3098 "loss": 0.002,
3099 "reward": 1.8713550567626953,
3100 "reward_std": 0.023653965443372726,
3101 "rewards/accuracy_reward": 0.8713551759719849,
3102 "rewards/format_reward": 1.0,
3103 "step": 221
3104 },
3105 {
3106 "clip_ratio": 0.0,
3107 "completion_length": 102.32292175292969,
3108 "epoch": 2.5517241379310347,
3109 "grad_norm": 2.40931495027469,
3110 "kl": 0.0311279296875,
3111 "learning_rate": 7.448275862068965e-07,
3112 "loss": 0.0013,
3113 "reward": 1.8844565153121948,
3114 "reward_std": 0.025886138901114464,
3115 "rewards/accuracy_reward": 0.8844565153121948,
3116 "rewards/format_reward": 1.0,
3117 "step": 222
3118 },
3119 {
3120 "clip_ratio": 0.0,
3121 "completion_length": 102.625,
3122 "epoch": 2.5632183908045976,
3123 "grad_norm": 2.3506903297912234,
3124 "kl": 0.035400390625,
3125 "learning_rate": 7.436781609195402e-07,
3126 "loss": 0.0015,
3127 "reward": 1.8940942287445068,
3128 "reward_std": 0.021925464272499084,
3129 "rewards/accuracy_reward": 0.8940942287445068,
3130 "rewards/format_reward": 1.0,
3131 "step": 223
3132 },
3133 {
3134 "clip_ratio": 0.0,
3135 "completion_length": 104.16796875,
3136 "epoch": 2.574712643678161,
3137 "grad_norm": 4.228763111501542,
3138 "kl": 0.0302734375,
3139 "learning_rate": 7.425287356321839e-07,
3140 "loss": 0.0013,
3141 "reward": 1.8942646980285645,
3142 "reward_std": 0.02186274155974388,
3143 "rewards/accuracy_reward": 0.8942646980285645,
3144 "rewards/format_reward": 1.0,
3145 "step": 224
3146 },
3147 {
3148 "clip_ratio": 0.0,
3149 "completion_length": 106.87760925292969,
3150 "epoch": 2.586206896551724,
3151 "grad_norm": 1.339092605600081,
3152 "kl": 0.0286865234375,
3153 "learning_rate": 7.413793103448276e-07,
3154 "loss": 0.0012,
3155 "reward": 1.881058692932129,
3156 "reward_std": 0.025702446699142456,
3157 "rewards/accuracy_reward": 0.8817097544670105,
3158 "rewards/format_reward": 0.9993489980697632,
3159 "step": 225
3160 },
3161 {
3162 "clip_ratio": 0.0,
3163 "completion_length": 104.62435150146484,
3164 "epoch": 2.5977011494252875,
3165 "grad_norm": 1.770916862954278,
3166 "kl": 0.03125,
3167 "learning_rate": 7.402298850574712e-07,
3168 "loss": 0.0013,
3169 "reward": 1.8780031204223633,
3170 "reward_std": 0.02341182343661785,
3171 "rewards/accuracy_reward": 0.8780032396316528,
3172 "rewards/format_reward": 1.0,
3173 "step": 226
3174 },
3175 {
3176 "clip_ratio": 0.0,
3177 "completion_length": 105.64453125,
3178 "epoch": 2.609195402298851,
3179 "grad_norm": 1.929796218203904,
3180 "kl": 0.0289306640625,
3181 "learning_rate": 7.390804597701149e-07,
3182 "loss": 0.0012,
3183 "reward": 1.8760128021240234,
3184 "reward_std": 0.025408655405044556,
3185 "rewards/accuracy_reward": 0.8760129809379578,
3186 "rewards/format_reward": 1.0,
3187 "step": 227
3188 },
3189 {
3190 "clip_ratio": 0.0,
3191 "completion_length": 107.86393737792969,
3192 "epoch": 2.6206896551724137,
3193 "grad_norm": 1.2206148979521507,
3194 "kl": 0.0294189453125,
3195 "learning_rate": 7.379310344827586e-07,
3196 "loss": 0.0012,
3197 "reward": 1.8749685287475586,
3198 "reward_std": 0.02704436704516411,
3199 "rewards/accuracy_reward": 0.8749687075614929,
3200 "rewards/format_reward": 1.0,
3201 "step": 228
3202 },
3203 {
3204 "clip_ratio": 0.0,
3205 "completion_length": 107.52474212646484,
3206 "epoch": 2.632183908045977,
3207 "grad_norm": 1.1892012690175122,
3208 "kl": 0.03076171875,
3209 "learning_rate": 7.367816091954022e-07,
3210 "loss": 0.0013,
3211 "reward": 1.885100245475769,
3212 "reward_std": 0.02430718205869198,
3213 "rewards/accuracy_reward": 0.8851003646850586,
3214 "rewards/format_reward": 1.0,
3215 "step": 229
3216 },
3217 {
3218 "clip_ratio": 0.0,
3219 "completion_length": 108.72786712646484,
3220 "epoch": 2.6436781609195403,
3221 "grad_norm": 1.474891820836295,
3222 "kl": 0.03076171875,
3223 "learning_rate": 7.35632183908046e-07,
3224 "loss": 0.0013,
3225 "reward": 1.8827028274536133,
3226 "reward_std": 0.026975825428962708,
3227 "rewards/accuracy_reward": 0.8827028274536133,
3228 "rewards/format_reward": 1.0,
3229 "step": 230
3230 },
3231 {
3232 "clip_ratio": 0.0,
3233 "completion_length": 108.77083587646484,
3234 "epoch": 2.655172413793103,
3235 "grad_norm": 1.7269926903360489,
3236 "kl": 0.0380859375,
3237 "learning_rate": 7.344827586206897e-07,
3238 "loss": 0.0016,
3239 "reward": 1.8721890449523926,
3240 "reward_std": 0.02634214609861374,
3241 "rewards/accuracy_reward": 0.8721892237663269,
3242 "rewards/format_reward": 1.0,
3243 "step": 231
3244 },
3245 {
3246 "clip_ratio": 0.0,
3247 "completion_length": 108.22526550292969,
3248 "epoch": 2.6666666666666665,
3249 "grad_norm": 1.9439461898517982,
3250 "kl": 0.0322265625,
3251 "learning_rate": 7.333333333333332e-07,
3252 "loss": 0.0014,
3253 "reward": 1.8815977573394775,
3254 "reward_std": 0.027597349137067795,
3255 "rewards/accuracy_reward": 0.8822487592697144,
3256 "rewards/format_reward": 0.9993489980697632,
3257 "step": 232
3258 },
3259 {
3260 "clip_ratio": 0.0,
3261 "completion_length": 108.474609375,
3262 "epoch": 2.67816091954023,
3263 "grad_norm": 1.4550598569752655,
3264 "kl": 0.03173828125,
3265 "learning_rate": 7.32183908045977e-07,
3266 "loss": 0.0013,
3267 "reward": 1.8949522972106934,
3268 "reward_std": 0.023627810180187225,
3269 "rewards/accuracy_reward": 0.8949524164199829,
3270 "rewards/format_reward": 1.0,
3271 "step": 233
3272 },
3273 {
3274 "clip_ratio": 0.0,
3275 "completion_length": 109.77214050292969,
3276 "epoch": 2.689655172413793,
3277 "grad_norm": 1.231833875421197,
3278 "kl": 0.033447265625,
3279 "learning_rate": 7.310344827586207e-07,
3280 "loss": 0.0014,
3281 "reward": 1.8845272064208984,
3282 "reward_std": 0.02502075955271721,
3283 "rewards/accuracy_reward": 0.8845272064208984,
3284 "rewards/format_reward": 1.0,
3285 "step": 234
3286 },
3287 {
3288 "clip_ratio": 0.0,
3289 "completion_length": 111.39974212646484,
3290 "epoch": 2.7011494252873565,
3291 "grad_norm": 1.3652986563130975,
3292 "kl": 0.035400390625,
3293 "learning_rate": 7.298850574712644e-07,
3294 "loss": 0.0015,
3295 "reward": 1.8603239059448242,
3296 "reward_std": 0.0263795405626297,
3297 "rewards/accuracy_reward": 0.8603239059448242,
3298 "rewards/format_reward": 1.0,
3299 "step": 235
3300 },
3301 {
3302 "clip_ratio": 0.0,
3303 "completion_length": 112.92057800292969,
3304 "epoch": 2.7126436781609193,
3305 "grad_norm": 1.6201244118842026,
3306 "kl": 0.038818359375,
3307 "learning_rate": 7.28735632183908e-07,
3308 "loss": 0.0016,
3309 "reward": 1.8918843269348145,
3310 "reward_std": 0.0242290198802948,
3311 "rewards/accuracy_reward": 0.891884446144104,
3312 "rewards/format_reward": 1.0,
3313 "step": 236
3314 },
3315 {
3316 "clip_ratio": 0.0,
3317 "completion_length": 109.63346862792969,
3318 "epoch": 2.7241379310344827,
3319 "grad_norm": 1.6072202456697025,
3320 "kl": 0.037841796875,
3321 "learning_rate": 7.275862068965517e-07,
3322 "loss": 0.0016,
3323 "reward": 1.891068458557129,
3324 "reward_std": 0.025224387645721436,
3325 "rewards/accuracy_reward": 0.8917193412780762,
3326 "rewards/format_reward": 0.9993489980697632,
3327 "step": 237
3328 },
3329 {
3330 "clip_ratio": 0.0,
3331 "completion_length": 109.85807800292969,
3332 "epoch": 2.735632183908046,
3333 "grad_norm": 3.1474234625781308,
3334 "kl": 0.0390625,
3335 "learning_rate": 7.264367816091954e-07,
3336 "loss": 0.0016,
3337 "reward": 1.886623740196228,
3338 "reward_std": 0.024830807000398636,
3339 "rewards/accuracy_reward": 0.8866235613822937,
3340 "rewards/format_reward": 1.0,
3341 "step": 238
3342 },
3343 {
3344 "clip_ratio": 0.0,
3345 "completion_length": 111.75521087646484,
3346 "epoch": 2.7471264367816093,
3347 "grad_norm": 1.6342951016344707,
3348 "kl": 0.0419921875,
3349 "learning_rate": 7.25287356321839e-07,
3350 "loss": 0.0017,
3351 "reward": 1.8781700134277344,
3352 "reward_std": 0.02809235453605652,
3353 "rewards/accuracy_reward": 0.8794721364974976,
3354 "rewards/format_reward": 0.9986979365348816,
3355 "step": 239
3356 },
3357 {
3358 "clip_ratio": 0.0,
3359 "completion_length": 113.28255462646484,
3360 "epoch": 2.7586206896551726,
3361 "grad_norm": 1.1515713105906038,
3362 "kl": 0.036865234375,
3363 "learning_rate": 7.241379310344827e-07,
3364 "loss": 0.0015,
3365 "reward": 1.89352548122406,
3366 "reward_std": 0.021531865000724792,
3367 "rewards/accuracy_reward": 0.8935256004333496,
3368 "rewards/format_reward": 1.0,
3369 "step": 240
3370 },
3371 {
3372 "clip_ratio": 0.0,
3373 "completion_length": 110.35482025146484,
3374 "epoch": 2.7701149425287355,
3375 "grad_norm": 2.045990886967949,
3376 "kl": 0.045166015625,
3377 "learning_rate": 7.229885057471265e-07,
3378 "loss": 0.0018,
3379 "reward": 1.8543777465820312,
3380 "reward_std": 0.028905829414725304,
3381 "rewards/accuracy_reward": 0.8556797504425049,
3382 "rewards/format_reward": 0.9986979365348816,
3383 "step": 241
3384 },
3385 {
3386 "clip_ratio": 0.0,
3387 "completion_length": 111.45573425292969,
3388 "epoch": 2.781609195402299,
3389 "grad_norm": 2.388872547648165,
3390 "kl": 0.03955078125,
3391 "learning_rate": 7.2183908045977e-07,
3392 "loss": 0.0016,
3393 "reward": 1.8795366287231445,
3394 "reward_std": 0.025653596967458725,
3395 "rewards/accuracy_reward": 0.8801875114440918,
3396 "rewards/format_reward": 0.9993489980697632,
3397 "step": 242
3398 },
3399 {
3400 "clip_ratio": 0.0,
3401 "completion_length": 108.02799987792969,
3402 "epoch": 2.793103448275862,
3403 "grad_norm": 1.313601894213018,
3404 "kl": 0.041015625,
3405 "learning_rate": 7.206896551724138e-07,
3406 "loss": 0.0017,
3407 "reward": 1.9008476734161377,
3408 "reward_std": 0.02234504744410515,
3409 "rewards/accuracy_reward": 0.9008476734161377,
3410 "rewards/format_reward": 1.0,
3411 "step": 243
3412 },
3413 {
3414 "clip_ratio": 0.0,
3415 "completion_length": 108.38802337646484,
3416 "epoch": 2.8045977011494254,
3417 "grad_norm": 1.690388570174564,
3418 "kl": 0.039794921875,
3419 "learning_rate": 7.195402298850575e-07,
3420 "loss": 0.0017,
3421 "reward": 1.8841087818145752,
3422 "reward_std": 0.02542022429406643,
3423 "rewards/accuracy_reward": 0.8854107856750488,
3424 "rewards/format_reward": 0.9986979365348816,
3425 "step": 244
3426 },
3427 {
3428 "clip_ratio": 0.0,
3429 "completion_length": 106.02409362792969,
3430 "epoch": 2.8160919540229887,
3431 "grad_norm": 1.4752619364083788,
3432 "kl": 0.038330078125,
3433 "learning_rate": 7.18390804597701e-07,
3434 "loss": 0.0016,
3435 "reward": 1.872473120689392,
3436 "reward_std": 0.026318645104765892,
3437 "rewards/accuracy_reward": 0.8724731206893921,
3438 "rewards/format_reward": 1.0,
3439 "step": 245
3440 },
3441 {
3442 "clip_ratio": 0.0,
3443 "completion_length": 105.24674987792969,
3444 "epoch": 2.8275862068965516,
3445 "grad_norm": 13.670541177628872,
3446 "kl": 0.036376953125,
3447 "learning_rate": 7.172413793103448e-07,
3448 "loss": 0.0015,
3449 "reward": 1.8723421096801758,
3450 "reward_std": 0.025184884667396545,
3451 "rewards/accuracy_reward": 0.8723421096801758,
3452 "rewards/format_reward": 1.0,
3453 "step": 246
3454 },
3455 {
3456 "clip_ratio": 0.0,
3457 "completion_length": 107.20638275146484,
3458 "epoch": 2.839080459770115,
3459 "grad_norm": 1.0940987098177108,
3460 "kl": 0.03515625,
3461 "learning_rate": 7.160919540229885e-07,
3462 "loss": 0.0015,
3463 "reward": 1.875187873840332,
3464 "reward_std": 0.02431398257613182,
3465 "rewards/accuracy_reward": 0.8751880526542664,
3466 "rewards/format_reward": 1.0,
3467 "step": 247
3468 },
3469 {
3470 "clip_ratio": 0.0,
3471 "completion_length": 106.302734375,
3472 "epoch": 2.8505747126436782,
3473 "grad_norm": 2.079946913833384,
3474 "kl": 0.052490234375,
3475 "learning_rate": 7.149425287356321e-07,
3476 "loss": 0.0022,
3477 "reward": 1.887062430381775,
3478 "reward_std": 0.024545643478631973,
3479 "rewards/accuracy_reward": 0.8870624303817749,
3480 "rewards/format_reward": 1.0,
3481 "step": 248
3482 },
3483 {
3484 "clip_ratio": 0.0,
3485 "completion_length": 106.73503112792969,
3486 "epoch": 2.862068965517241,
3487 "grad_norm": 1.9116339472111843,
3488 "kl": 0.033935546875,
3489 "learning_rate": 7.137931034482758e-07,
3490 "loss": 0.0014,
3491 "reward": 1.8704020977020264,
3492 "reward_std": 0.02490667998790741,
3493 "rewards/accuracy_reward": 0.8710530996322632,
3494 "rewards/format_reward": 0.9993489980697632,
3495 "step": 249
3496 },
3497 {
3498 "clip_ratio": 0.0,
3499 "completion_length": 105.78255462646484,
3500 "epoch": 2.873563218390805,
3501 "grad_norm": 1.198578783738147,
3502 "kl": 0.035400390625,
3503 "learning_rate": 7.126436781609195e-07,
3504 "loss": 0.0015,
3505 "reward": 1.8799991607666016,
3506 "reward_std": 0.022041937336325645,
3507 "rewards/accuracy_reward": 0.8799993395805359,
3508 "rewards/format_reward": 1.0,
3509 "step": 250
3510 },
3511 {
3512 "clip_ratio": 0.0,
3513 "completion_length": 104.40755462646484,
3514 "epoch": 2.8850574712643677,
3515 "grad_norm": 1.8122950855537878,
3516 "kl": 0.037353515625,
3517 "learning_rate": 7.114942528735633e-07,
3518 "loss": 0.0015,
3519 "reward": 1.8816900253295898,
3520 "reward_std": 0.02312072180211544,
3521 "rewards/accuracy_reward": 0.8816901445388794,
3522 "rewards/format_reward": 1.0,
3523 "step": 251
3524 },
3525 {
3526 "clip_ratio": 0.0,
3527 "completion_length": 104.2578125,
3528 "epoch": 2.896551724137931,
3529 "grad_norm": 1.8584979960733188,
3530 "kl": 0.03369140625,
3531 "learning_rate": 7.103448275862068e-07,
3532 "loss": 0.0014,
3533 "reward": 1.8846673965454102,
3534 "reward_std": 0.025826361030340195,
3535 "rewards/accuracy_reward": 0.8846673965454102,
3536 "rewards/format_reward": 1.0,
3537 "step": 252
3538 },
3539 {
3540 "clip_ratio": 0.0,
3541 "completion_length": 103.16796875,
3542 "epoch": 2.9080459770114944,
3543 "grad_norm": 3.6080553707797423,
3544 "kl": 0.033447265625,
3545 "learning_rate": 7.091954022988506e-07,
3546 "loss": 0.0014,
3547 "reward": 1.8778409957885742,
3548 "reward_std": 0.02433183044195175,
3549 "rewards/accuracy_reward": 0.8778411746025085,
3550 "rewards/format_reward": 1.0,
3551 "step": 253
3552 },
3553 {
3554 "clip_ratio": 0.0,
3555 "completion_length": 101.78776550292969,
3556 "epoch": 2.9195402298850572,
3557 "grad_norm": 1.4145733069699882,
3558 "kl": 0.037109375,
3559 "learning_rate": 7.080459770114943e-07,
3560 "loss": 0.0015,
3561 "reward": 1.881535291671753,
3562 "reward_std": 0.023436537012457848,
3563 "rewards/accuracy_reward": 0.8815354108810425,
3564 "rewards/format_reward": 1.0,
3565 "step": 254
3566 },
3567 {
3568 "clip_ratio": 0.0,
3569 "completion_length": 102.01237487792969,
3570 "epoch": 2.9310344827586206,
3571 "grad_norm": 1.3798889630920503,
3572 "kl": 0.037109375,
3573 "learning_rate": 7.068965517241378e-07,
3574 "loss": 0.0016,
3575 "reward": 1.8669624328613281,
3576 "reward_std": 0.024586662650108337,
3577 "rewards/accuracy_reward": 0.8669624328613281,
3578 "rewards/format_reward": 1.0,
3579 "step": 255
3580 },
3581 {
3582 "clip_ratio": 0.0,
3583 "completion_length": 101.84049987792969,
3584 "epoch": 2.942528735632184,
3585 "grad_norm": 1.501972491549555,
3586 "kl": 0.033935546875,
3587 "learning_rate": 7.057471264367816e-07,
3588 "loss": 0.0014,
3589 "reward": 1.8706506490707397,
3590 "reward_std": 0.02558681182563305,
3591 "rewards/accuracy_reward": 0.8706506490707397,
3592 "rewards/format_reward": 1.0,
3593 "step": 256
3594 },
3595 {
3596 "clip_ratio": 0.0,
3597 "completion_length": 101.26692962646484,
3598 "epoch": 2.954022988505747,
3599 "grad_norm": 1.4790939200318616,
3600 "kl": 0.037841796875,
3601 "learning_rate": 7.045977011494253e-07,
3602 "loss": 0.0016,
3603 "reward": 1.8889020681381226,
3604 "reward_std": 0.02365148440003395,
3605 "rewards/accuracy_reward": 0.888901948928833,
3606 "rewards/format_reward": 1.0,
3607 "step": 257
3608 },
3609 {
3610 "clip_ratio": 0.0,
3611 "completion_length": 99.91146087646484,
3612 "epoch": 2.9655172413793105,
3613 "grad_norm": 3.7173905554951854,
3614 "kl": 0.0380859375,
3615 "learning_rate": 7.034482758620688e-07,
3616 "loss": 0.0016,
3617 "reward": 1.8991615772247314,
3618 "reward_std": 0.01971290074288845,
3619 "rewards/accuracy_reward": 0.8991615772247314,
3620 "rewards/format_reward": 1.0,
3621 "step": 258
3622 },
3623 {
3624 "clip_ratio": 0.0,
3625 "completion_length": 100.97917175292969,
3626 "epoch": 2.9770114942528734,
3627 "grad_norm": 1.2829606539449347,
3628 "kl": 0.03564453125,
3629 "learning_rate": 7.022988505747126e-07,
3630 "loss": 0.0015,
3631 "reward": 1.88856840133667,
3632 "reward_std": 0.02619217149913311,
3633 "rewards/accuracy_reward": 0.8898705840110779,
3634 "rewards/format_reward": 0.9986979365348816,
3635 "step": 259
3636 },
3637 {
3638 "clip_ratio": 0.0,
3639 "completion_length": 99.66471862792969,
3640 "epoch": 2.9885057471264367,
3641 "grad_norm": 1.470317712158937,
3642 "kl": 0.03759765625,
3643 "learning_rate": 7.011494252873563e-07,
3644 "loss": 0.0016,
3645 "reward": 1.8722280263900757,
3646 "reward_std": 0.022509008646011353,
3647 "rewards/accuracy_reward": 0.8722281455993652,
3648 "rewards/format_reward": 1.0,
3649 "step": 260
3650 },
3651 {
3652 "clip_ratio": 0.0,
3653 "completion_length": 96.02809143066406,
3654 "epoch": 3.0,
3655 "grad_norm": 3.0919770267204147,
3656 "kl": 0.03466796875,
3657 "learning_rate": 7e-07,
3658 "loss": 0.0014,
3659 "reward": 1.8606176376342773,
3660 "reward_std": 0.022723043337464333,
3661 "rewards/accuracy_reward": 0.8606176376342773,
3662 "rewards/format_reward": 1.0,
3663 "step": 261
3664 },
3665 {
3666 "clip_ratio": 0.0,
3667 "completion_length": 103.25260925292969,
3668 "epoch": 3.0114942528735633,
3669 "grad_norm": 2.7419615747949386,
3670 "kl": 0.034912109375,
3671 "learning_rate": 6.988505747126436e-07,
3672 "loss": 0.0014,
3673 "reward": 1.869284987449646,
3674 "reward_std": 0.024234648793935776,
3675 "rewards/accuracy_reward": 0.8692850470542908,
3676 "rewards/format_reward": 1.0,
3677 "step": 262
3678 },
3679 {
3680 "clip_ratio": 0.0,
3681 "completion_length": 102.68359375,
3682 "epoch": 3.0229885057471266,
3683 "grad_norm": 1.559372722176547,
3684 "kl": 0.03271484375,
3685 "learning_rate": 6.977011494252873e-07,
3686 "loss": 0.0013,
3687 "reward": 1.8653929233551025,
3688 "reward_std": 0.02791503071784973,
3689 "rewards/accuracy_reward": 0.8660439252853394,
3690 "rewards/format_reward": 0.9993489980697632,
3691 "step": 263
3692 },
3693 {
3694 "clip_ratio": 0.0,
3695 "completion_length": 103.7109375,
3696 "epoch": 3.0344827586206895,
3697 "grad_norm": 2.1221245906508464,
3698 "kl": 0.033203125,
3699 "learning_rate": 6.96551724137931e-07,
3700 "loss": 0.0014,
3701 "reward": 1.8776719570159912,
3702 "reward_std": 0.023673653602600098,
3703 "rewards/accuracy_reward": 0.8776720762252808,
3704 "rewards/format_reward": 1.0,
3705 "step": 264
3706 },
3707 {
3708 "clip_ratio": 0.0,
3709 "completion_length": 104.60872650146484,
3710 "epoch": 3.045977011494253,
3711 "grad_norm": 1.6002601391217546,
3712 "kl": 0.03662109375,
3713 "learning_rate": 6.954022988505746e-07,
3714 "loss": 0.0015,
3715 "reward": 1.830482840538025,
3716 "reward_std": 0.02900915965437889,
3717 "rewards/accuracy_reward": 0.8304829597473145,
3718 "rewards/format_reward": 1.0,
3719 "step": 265
3720 },
3721 {
3722 "clip_ratio": 0.0,
3723 "completion_length": 103.30794525146484,
3724 "epoch": 3.057471264367816,
3725 "grad_norm": 1.6434905190427025,
3726 "kl": 0.035888671875,
3727 "learning_rate": 6.942528735632184e-07,
3728 "loss": 0.0015,
3729 "reward": 1.8308700323104858,
3730 "reward_std": 0.02639869973063469,
3731 "rewards/accuracy_reward": 0.8315210342407227,
3732 "rewards/format_reward": 0.9993489980697632,
3733 "step": 266
3734 },
3735 {
3736 "clip_ratio": 0.0,
3737 "completion_length": 105.29167175292969,
3738 "epoch": 3.0689655172413794,
3739 "grad_norm": 1.027886396009367,
3740 "kl": 0.03466796875,
3741 "learning_rate": 6.931034482758621e-07,
3742 "loss": 0.0014,
3743 "reward": 1.8851213455200195,
3744 "reward_std": 0.021387819200754166,
3745 "rewards/accuracy_reward": 0.8851213455200195,
3746 "rewards/format_reward": 1.0,
3747 "step": 267
3748 },
3749 {
3750 "clip_ratio": 0.0,
3751 "completion_length": 105.84700775146484,
3752 "epoch": 3.0804597701149423,
3753 "grad_norm": 1.5206328556554167,
3754 "kl": 0.038818359375,
3755 "learning_rate": 6.919540229885057e-07,
3756 "loss": 0.0016,
3757 "reward": 1.8900096416473389,
3758 "reward_std": 0.02050493285059929,
3759 "rewards/accuracy_reward": 0.8900095820426941,
3760 "rewards/format_reward": 1.0,
3761 "step": 268
3762 },
3763 {
3764 "clip_ratio": 0.0,
3765 "completion_length": 107.73177337646484,
3766 "epoch": 3.0919540229885056,
3767 "grad_norm": 1.1641802469606335,
3768 "kl": 0.034423828125,
3769 "learning_rate": 6.908045977011494e-07,
3770 "loss": 0.0015,
3771 "reward": 1.858663558959961,
3772 "reward_std": 0.025883881375193596,
3773 "rewards/accuracy_reward": 0.8593146800994873,
3774 "rewards/format_reward": 0.9993489980697632,
3775 "step": 269
3776 },
3777 {
3778 "clip_ratio": 0.0,
3779 "completion_length": 109.19140625,
3780 "epoch": 3.103448275862069,
3781 "grad_norm": 1.332286209838367,
3782 "kl": 0.03271484375,
3783 "learning_rate": 6.896551724137931e-07,
3784 "loss": 0.0014,
3785 "reward": 1.8819489479064941,
3786 "reward_std": 0.023023171350359917,
3787 "rewards/accuracy_reward": 0.8819491267204285,
3788 "rewards/format_reward": 1.0,
3789 "step": 270
3790 },
3791 {
3792 "clip_ratio": 0.0,
3793 "completion_length": 108.453125,
3794 "epoch": 3.1149425287356323,
3795 "grad_norm": 1.3705118964619707,
3796 "kl": 0.0341796875,
3797 "learning_rate": 6.885057471264368e-07,
3798 "loss": 0.0015,
3799 "reward": 1.858891248703003,
3800 "reward_std": 0.022298548370599747,
3801 "rewards/accuracy_reward": 0.8595423698425293,
3802 "rewards/format_reward": 0.9993489980697632,
3803 "step": 271
3804 },
3805 {
3806 "clip_ratio": 0.0,
3807 "completion_length": 110.52278900146484,
3808 "epoch": 3.1264367816091956,
3809 "grad_norm": 1.5377438314997252,
3810 "kl": 0.041015625,
3811 "learning_rate": 6.873563218390804e-07,
3812 "loss": 0.0017,
3813 "reward": 1.884607195854187,
3814 "reward_std": 0.024785785004496574,
3815 "rewards/accuracy_reward": 0.8852583169937134,
3816 "rewards/format_reward": 0.9993489980697632,
3817 "step": 272
3818 },
3819 {
3820 "clip_ratio": 0.0,
3821 "completion_length": 110.56380462646484,
3822 "epoch": 3.1379310344827585,
3823 "grad_norm": 1.9722987101278098,
3824 "kl": 0.03515625,
3825 "learning_rate": 6.862068965517241e-07,
3826 "loss": 0.0015,
3827 "reward": 1.8918275833129883,
3828 "reward_std": 0.021715257316827774,
3829 "rewards/accuracy_reward": 0.8918277621269226,
3830 "rewards/format_reward": 1.0,
3831 "step": 273
3832 },
3833 {
3834 "clip_ratio": 0.0,
3835 "completion_length": 114.19792175292969,
3836 "epoch": 3.1494252873563218,
3837 "grad_norm": 1.5898347428804778,
3838 "kl": 0.03125,
3839 "learning_rate": 6.850574712643678e-07,
3840 "loss": 0.0013,
3841 "reward": 1.8759284019470215,
3842 "reward_std": 0.025104787200689316,
3843 "rewards/accuracy_reward": 0.8765794634819031,
3844 "rewards/format_reward": 0.9993489980697632,
3845 "step": 274
3846 },
3847 {
3848 "clip_ratio": 0.0,
3849 "completion_length": 114.07747650146484,
3850 "epoch": 3.160919540229885,
3851 "grad_norm": 1.6076437000195318,
3852 "kl": 0.0380859375,
3853 "learning_rate": 6.839080459770114e-07,
3854 "loss": 0.0016,
3855 "reward": 1.8818080425262451,
3856 "reward_std": 0.02497541531920433,
3857 "rewards/accuracy_reward": 0.8824591040611267,
3858 "rewards/format_reward": 0.9993489980697632,
3859 "step": 275
3860 },
3861 {
3862 "clip_ratio": 0.0,
3863 "completion_length": 115.35546875,
3864 "epoch": 3.1724137931034484,
3865 "grad_norm": 1.7685732057058994,
3866 "kl": 0.039306640625,
3867 "learning_rate": 6.827586206896552e-07,
3868 "loss": 0.0016,
3869 "reward": 1.8782709836959839,
3870 "reward_std": 0.025368181988596916,
3871 "rewards/accuracy_reward": 0.8789221048355103,
3872 "rewards/format_reward": 0.9993489980697632,
3873 "step": 276
3874 },
3875 {
3876 "clip_ratio": 0.0,
3877 "completion_length": 115.39128112792969,
3878 "epoch": 3.1839080459770113,
3879 "grad_norm": 4.02053655146932,
3880 "kl": 0.04150390625,
3881 "learning_rate": 6.816091954022988e-07,
3882 "loss": 0.0017,
3883 "reward": 1.8910026550292969,
3884 "reward_std": 0.022438380867242813,
3885 "rewards/accuracy_reward": 0.8910026550292969,
3886 "rewards/format_reward": 1.0,
3887 "step": 277
3888 },
3889 {
3890 "clip_ratio": 0.0,
3891 "completion_length": 118.00130462646484,
3892 "epoch": 3.1954022988505746,
3893 "grad_norm": 1.2219545665192806,
3894 "kl": 0.033935546875,
3895 "learning_rate": 6.804597701149425e-07,
3896 "loss": 0.0014,
3897 "reward": 1.8810728788375854,
3898 "reward_std": 0.027784962207078934,
3899 "rewards/accuracy_reward": 0.8830260038375854,
3900 "rewards/format_reward": 0.998046875,
3901 "step": 278
3902 },
3903 {
3904 "clip_ratio": 0.0,
3905 "completion_length": 113.451171875,
3906 "epoch": 3.206896551724138,
3907 "grad_norm": 1.7436462195514546,
3908 "kl": 0.038818359375,
3909 "learning_rate": 6.793103448275862e-07,
3910 "loss": 0.0016,
3911 "reward": 1.8771369457244873,
3912 "reward_std": 0.023900484666228294,
3913 "rewards/accuracy_reward": 0.8771368861198425,
3914 "rewards/format_reward": 1.0,
3915 "step": 279
3916 },
3917 {
3918 "clip_ratio": 0.0,
3919 "completion_length": 114.857421875,
3920 "epoch": 3.218390804597701,
3921 "grad_norm": 1.564180567510325,
3922 "kl": 0.0390625,
3923 "learning_rate": 6.781609195402298e-07,
3924 "loss": 0.0016,
3925 "reward": 1.8888477087020874,
3926 "reward_std": 0.02220313809812069,
3927 "rewards/accuracy_reward": 0.8888477683067322,
3928 "rewards/format_reward": 1.0,
3929 "step": 280
3930 },
3931 {
3932 "clip_ratio": 0.0,
3933 "completion_length": 114.80924987792969,
3934 "epoch": 3.2298850574712645,
3935 "grad_norm": 4.146116713241933,
3936 "kl": 0.040771484375,
3937 "learning_rate": 6.770114942528736e-07,
3938 "loss": 0.0017,
3939 "reward": 1.8889455795288086,
3940 "reward_std": 0.021025802940130234,
3941 "rewards/accuracy_reward": 0.8889455795288086,
3942 "rewards/format_reward": 1.0,
3943 "step": 281
3944 },
3945 {
3946 "clip_ratio": 0.0,
3947 "completion_length": 114.03450775146484,
3948 "epoch": 3.2413793103448274,
3949 "grad_norm": 2.0093993818021167,
3950 "kl": 0.037841796875,
3951 "learning_rate": 6.758620689655172e-07,
3952 "loss": 0.0016,
3953 "reward": 1.8997611999511719,
3954 "reward_std": 0.022101037204265594,
3955 "rewards/accuracy_reward": 0.8997613191604614,
3956 "rewards/format_reward": 1.0,
3957 "step": 282
3958 },
3959 {
3960 "clip_ratio": 0.0,
3961 "completion_length": 113.27734375,
3962 "epoch": 3.2528735632183907,
3963 "grad_norm": 1.7252038523721087,
3964 "kl": 0.043212890625,
3965 "learning_rate": 6.747126436781609e-07,
3966 "loss": 0.0018,
3967 "reward": 1.893257975578308,
3968 "reward_std": 0.024223104119300842,
3969 "rewards/accuracy_reward": 0.8945600986480713,
3970 "rewards/format_reward": 0.9986979365348816,
3971 "step": 283
3972 },
3973 {
3974 "clip_ratio": 0.0,
3975 "completion_length": 112.23177337646484,
3976 "epoch": 3.264367816091954,
3977 "grad_norm": 1.3254878153740217,
3978 "kl": 0.03955078125,
3979 "learning_rate": 6.735632183908046e-07,
3980 "loss": 0.0017,
3981 "reward": 1.897439956665039,
3982 "reward_std": 0.021015014499425888,
3983 "rewards/accuracy_reward": 0.8974398374557495,
3984 "rewards/format_reward": 1.0,
3985 "step": 284
3986 },
3987 {
3988 "clip_ratio": 0.0,
3989 "completion_length": 110.99609375,
3990 "epoch": 3.2758620689655173,
3991 "grad_norm": 1.5719828379459555,
3992 "kl": 0.03955078125,
3993 "learning_rate": 6.724137931034482e-07,
3994 "loss": 0.0017,
3995 "reward": 1.8719691038131714,
3996 "reward_std": 0.022564705461263657,
3997 "rewards/accuracy_reward": 0.8719691038131714,
3998 "rewards/format_reward": 1.0,
3999 "step": 285
4000 },
4001 {
4002 "clip_ratio": 0.0,
4003 "completion_length": 109.23372650146484,
4004 "epoch": 3.2873563218390807,
4005 "grad_norm": 1.1121764344089415,
4006 "kl": 0.04052734375,
4007 "learning_rate": 6.71264367816092e-07,
4008 "loss": 0.0017,
4009 "reward": 1.88375985622406,
4010 "reward_std": 0.022753890603780746,
4011 "rewards/accuracy_reward": 0.8837599158287048,
4012 "rewards/format_reward": 1.0,
4013 "step": 286
4014 },
4015 {
4016 "clip_ratio": 0.0,
4017 "completion_length": 107.83724212646484,
4018 "epoch": 3.2988505747126435,
4019 "grad_norm": 4.799405333071082,
4020 "kl": 0.04296875,
4021 "learning_rate": 6.701149425287356e-07,
4022 "loss": 0.0018,
4023 "reward": 1.8786205053329468,
4024 "reward_std": 0.025325840339064598,
4025 "rewards/accuracy_reward": 0.8799225687980652,
4026 "rewards/format_reward": 0.9986979365348816,
4027 "step": 287
4028 },
4029 {
4030 "clip_ratio": 0.0,
4031 "completion_length": 105.91536712646484,
4032 "epoch": 3.310344827586207,
4033 "grad_norm": 2.021051884647491,
4034 "kl": 0.04345703125,
4035 "learning_rate": 6.689655172413793e-07,
4036 "loss": 0.0018,
4037 "reward": 1.8967899084091187,
4038 "reward_std": 0.019771821796894073,
4039 "rewards/accuracy_reward": 0.8967899084091187,
4040 "rewards/format_reward": 1.0,
4041 "step": 288
4042 },
4043 {
4044 "clip_ratio": 0.0,
4045 "completion_length": 105.49544525146484,
4046 "epoch": 3.32183908045977,
4047 "grad_norm": 1.9020287418215316,
4048 "kl": 0.04052734375,
4049 "learning_rate": 6.67816091954023e-07,
4050 "loss": 0.0017,
4051 "reward": 1.8606189489364624,
4052 "reward_std": 0.025568749755620956,
4053 "rewards/accuracy_reward": 0.8619210720062256,
4054 "rewards/format_reward": 0.9986979365348816,
4055 "step": 289
4056 },
4057 {
4058 "clip_ratio": 0.0,
4059 "completion_length": 105.46745300292969,
4060 "epoch": 3.3333333333333335,
4061 "grad_norm": 1.2275084362530455,
4062 "kl": 0.041015625,
4063 "learning_rate": 6.666666666666666e-07,
4064 "loss": 0.0017,
4065 "reward": 1.8807505369186401,
4066 "reward_std": 0.02470255456864834,
4067 "rewards/accuracy_reward": 0.8807506561279297,
4068 "rewards/format_reward": 1.0,
4069 "step": 290
4070 },
4071 {
4072 "clip_ratio": 0.0,
4073 "completion_length": 105.484375,
4074 "epoch": 3.344827586206897,
4075 "grad_norm": 1.5137610627678855,
4076 "kl": 0.036376953125,
4077 "learning_rate": 6.655172413793103e-07,
4078 "loss": 0.0015,
4079 "reward": 1.8718066215515137,
4080 "reward_std": 0.025618024170398712,
4081 "rewards/accuracy_reward": 0.8718067407608032,
4082 "rewards/format_reward": 1.0,
4083 "step": 291
4084 },
4085 {
4086 "clip_ratio": 0.0,
4087 "completion_length": 105.90234375,
4088 "epoch": 3.3563218390804597,
4089 "grad_norm": 3.2723373427710034,
4090 "kl": 0.037109375,
4091 "learning_rate": 6.64367816091954e-07,
4092 "loss": 0.0016,
4093 "reward": 1.8626410961151123,
4094 "reward_std": 0.02414146065711975,
4095 "rewards/accuracy_reward": 0.8632920980453491,
4096 "rewards/format_reward": 0.9993489980697632,
4097 "step": 292
4098 },
4099 {
4100 "clip_ratio": 0.0,
4101 "completion_length": 104.763671875,
4102 "epoch": 3.367816091954023,
4103 "grad_norm": 1.4200196313502904,
4104 "kl": 0.042724609375,
4105 "learning_rate": 6.632183908045976e-07,
4106 "loss": 0.0018,
4107 "reward": 1.8814753293991089,
4108 "reward_std": 0.02261793240904808,
4109 "rewards/accuracy_reward": 0.8814753293991089,
4110 "rewards/format_reward": 1.0,
4111 "step": 293
4112 },
4113 {
4114 "clip_ratio": 0.0,
4115 "completion_length": 105.11849212646484,
4116 "epoch": 3.3793103448275863,
4117 "grad_norm": 2.2801476286690554,
4118 "kl": 0.038330078125,
4119 "learning_rate": 6.620689655172414e-07,
4120 "loss": 0.0016,
4121 "reward": 1.8842586278915405,
4122 "reward_std": 0.026064470410346985,
4123 "rewards/accuracy_reward": 0.8842586874961853,
4124 "rewards/format_reward": 1.0,
4125 "step": 294
4126 },
4127 {
4128 "clip_ratio": 0.0,
4129 "completion_length": 103.70052337646484,
4130 "epoch": 3.3908045977011496,
4131 "grad_norm": 1.1324912574969757,
4132 "kl": 0.0390625,
4133 "learning_rate": 6.609195402298851e-07,
4134 "loss": 0.0016,
4135 "reward": 1.8832792043685913,
4136 "reward_std": 0.02494307979941368,
4137 "rewards/accuracy_reward": 0.8832792043685913,
4138 "rewards/format_reward": 1.0,
4139 "step": 295
4140 },
4141 {
4142 "clip_ratio": 0.0,
4143 "completion_length": 103.57682800292969,
4144 "epoch": 3.4022988505747125,
4145 "grad_norm": 1.8918486699769739,
4146 "kl": 0.037841796875,
4147 "learning_rate": 6.597701149425286e-07,
4148 "loss": 0.0016,
4149 "reward": 1.8709214925765991,
4150 "reward_std": 0.02543533965945244,
4151 "rewards/accuracy_reward": 0.8709214925765991,
4152 "rewards/format_reward": 1.0,
4153 "step": 296
4154 },
4155 {
4156 "clip_ratio": 0.0,
4157 "completion_length": 105.50911712646484,
4158 "epoch": 3.413793103448276,
4159 "grad_norm": 3.3366246111039506,
4160 "kl": 0.036865234375,
4161 "learning_rate": 6.586206896551724e-07,
4162 "loss": 0.0015,
4163 "reward": 1.9022067785263062,
4164 "reward_std": 0.02121621184051037,
4165 "rewards/accuracy_reward": 0.9022065997123718,
4166 "rewards/format_reward": 1.0,
4167 "step": 297
4168 },
4169 {
4170 "clip_ratio": 0.0,
4171 "completion_length": 105.03060150146484,
4172 "epoch": 3.425287356321839,
4173 "grad_norm": 1.784585488176211,
4174 "kl": 0.0380859375,
4175 "learning_rate": 6.574712643678161e-07,
4176 "loss": 0.0016,
4177 "reward": 1.8803653717041016,
4178 "reward_std": 0.02356140874326229,
4179 "rewards/accuracy_reward": 0.8810164332389832,
4180 "rewards/format_reward": 0.9993489980697632,
4181 "step": 298
4182 },
4183 {
4184 "clip_ratio": 0.0,
4185 "completion_length": 104.619140625,
4186 "epoch": 3.4367816091954024,
4187 "grad_norm": 1.799334133513421,
4188 "kl": 0.039306640625,
4189 "learning_rate": 6.563218390804598e-07,
4190 "loss": 0.0016,
4191 "reward": 1.8940705060958862,
4192 "reward_std": 0.023154854774475098,
4193 "rewards/accuracy_reward": 0.8940703272819519,
4194 "rewards/format_reward": 1.0,
4195 "step": 299
4196 },
4197 {
4198 "clip_ratio": 0.0,
4199 "completion_length": 105.240234375,
4200 "epoch": 3.4482758620689653,
4201 "grad_norm": 1.5600328539462516,
4202 "kl": 0.037841796875,
4203 "learning_rate": 6.551724137931034e-07,
4204 "loss": 0.0016,
4205 "reward": 1.8870646953582764,
4206 "reward_std": 0.021850477904081345,
4207 "rewards/accuracy_reward": 0.8870646357536316,
4208 "rewards/format_reward": 1.0,
4209 "step": 300
4210 },
4211 {
4212 "clip_ratio": 0.0,
4213 "completion_length": 106.82682800292969,
4214 "epoch": 3.4597701149425286,
4215 "grad_norm": 1.3183868361300648,
4216 "kl": 0.041259765625,
4217 "learning_rate": 6.540229885057471e-07,
4218 "loss": 0.0017,
4219 "reward": 1.9015138149261475,
4220 "reward_std": 0.02374776266515255,
4221 "rewards/accuracy_reward": 0.9015137553215027,
4222 "rewards/format_reward": 1.0,
4223 "step": 301
4224 },
4225 {
4226 "clip_ratio": 0.0,
4227 "completion_length": 105.62109375,
4228 "epoch": 3.471264367816092,
4229 "grad_norm": 1.7939569716400035,
4230 "kl": 0.04052734375,
4231 "learning_rate": 6.528735632183908e-07,
4232 "loss": 0.0017,
4233 "reward": 1.8765311241149902,
4234 "reward_std": 0.023926347494125366,
4235 "rewards/accuracy_reward": 0.8765311241149902,
4236 "rewards/format_reward": 1.0,
4237 "step": 302
4238 },
4239 {
4240 "clip_ratio": 0.0,
4241 "completion_length": 105.55143737792969,
4242 "epoch": 3.4827586206896552,
4243 "grad_norm": 1.4131833430934648,
4244 "kl": 0.04150390625,
4245 "learning_rate": 6.517241379310344e-07,
4246 "loss": 0.0017,
4247 "reward": 1.8952958583831787,
4248 "reward_std": 0.021189194172620773,
4249 "rewards/accuracy_reward": 0.8952958583831787,
4250 "rewards/format_reward": 1.0,
4251 "step": 303
4252 },
4253 {
4254 "clip_ratio": 0.0,
4255 "completion_length": 103.07421875,
4256 "epoch": 3.4942528735632186,
4257 "grad_norm": 2.76037518653071,
4258 "kl": 0.039794921875,
4259 "learning_rate": 6.505747126436782e-07,
4260 "loss": 0.0016,
4261 "reward": 1.8874156475067139,
4262 "reward_std": 0.024307802319526672,
4263 "rewards/accuracy_reward": 0.8874154090881348,
4264 "rewards/format_reward": 1.0,
4265 "step": 304
4266 },
4267 {
4268 "clip_ratio": 0.0,
4269 "completion_length": 103.955078125,
4270 "epoch": 3.5057471264367814,
4271 "grad_norm": 2.1138139550456376,
4272 "kl": 0.04150390625,
4273 "learning_rate": 6.494252873563219e-07,
4274 "loss": 0.0017,
4275 "reward": 1.8822648525238037,
4276 "reward_std": 0.023932967334985733,
4277 "rewards/accuracy_reward": 0.8822647929191589,
4278 "rewards/format_reward": 1.0,
4279 "step": 305
4280 },
4281 {
4282 "clip_ratio": 0.0,
4283 "completion_length": 104.31575775146484,
4284 "epoch": 3.5172413793103448,
4285 "grad_norm": 1.2027549538193985,
4286 "kl": 0.044921875,
4287 "learning_rate": 6.482758620689654e-07,
4288 "loss": 0.0019,
4289 "reward": 1.8690357208251953,
4290 "reward_std": 0.02493324503302574,
4291 "rewards/accuracy_reward": 0.8696866035461426,
4292 "rewards/format_reward": 0.9993489980697632,
4293 "step": 306
4294 },
4295 {
4296 "clip_ratio": 0.0,
4297 "completion_length": 104.220703125,
4298 "epoch": 3.528735632183908,
4299 "grad_norm": 3.667217381252123,
4300 "kl": 0.042236328125,
4301 "learning_rate": 6.471264367816092e-07,
4302 "loss": 0.0018,
4303 "reward": 1.9012823104858398,
4304 "reward_std": 0.021280398592352867,
4305 "rewards/accuracy_reward": 0.9012823104858398,
4306 "rewards/format_reward": 1.0,
4307 "step": 307
4308 },
4309 {
4310 "clip_ratio": 0.0,
4311 "completion_length": 102.03646087646484,
4312 "epoch": 3.5402298850574714,
4313 "grad_norm": 1.3771503535932048,
4314 "kl": 0.044189453125,
4315 "learning_rate": 6.459770114942529e-07,
4316 "loss": 0.0018,
4317 "reward": 1.8968042135238647,
4318 "reward_std": 0.0209085401147604,
4319 "rewards/accuracy_reward": 0.8968044519424438,
4320 "rewards/format_reward": 1.0,
4321 "step": 308
4322 },
4323 {
4324 "clip_ratio": 0.0,
4325 "completion_length": 105.03841400146484,
4326 "epoch": 3.5517241379310347,
4327 "grad_norm": 2.601418631517434,
4328 "kl": 0.041748046875,
4329 "learning_rate": 6.448275862068964e-07,
4330 "loss": 0.0018,
4331 "reward": 1.8753538131713867,
4332 "reward_std": 0.024225711822509766,
4333 "rewards/accuracy_reward": 0.8760050535202026,
4334 "rewards/format_reward": 0.9993489980697632,
4335 "step": 309
4336 },
4337 {
4338 "clip_ratio": 0.0,
4339 "completion_length": 102.80403900146484,
4340 "epoch": 3.5632183908045976,
4341 "grad_norm": 1.5004741067737122,
4342 "kl": 0.0419921875,
4343 "learning_rate": 6.436781609195402e-07,
4344 "loss": 0.0018,
4345 "reward": 1.8968862295150757,
4346 "reward_std": 0.019103027880191803,
4347 "rewards/accuracy_reward": 0.8968861103057861,
4348 "rewards/format_reward": 1.0,
4349 "step": 310
4350 },
4351 {
4352 "clip_ratio": 0.0,
4353 "completion_length": 102.68685150146484,
4354 "epoch": 3.574712643678161,
4355 "grad_norm": 1.788362217489612,
4356 "kl": 0.040771484375,
4357 "learning_rate": 6.425287356321839e-07,
4358 "loss": 0.0017,
4359 "reward": 1.8962510824203491,
4360 "reward_std": 0.02111215889453888,
4361 "rewards/accuracy_reward": 0.8962510824203491,
4362 "rewards/format_reward": 1.0,
4363 "step": 311
4364 },
4365 {
4366 "clip_ratio": 0.0,
4367 "completion_length": 102.00260925292969,
4368 "epoch": 3.586206896551724,
4369 "grad_norm": 1.789359418846744,
4370 "kl": 0.042724609375,
4371 "learning_rate": 6.413793103448275e-07,
4372 "loss": 0.0018,
4373 "reward": 1.8833904266357422,
4374 "reward_std": 0.019987134262919426,
4375 "rewards/accuracy_reward": 0.8833904266357422,
4376 "rewards/format_reward": 1.0,
4377 "step": 312
4378 },
4379 {
4380 "clip_ratio": 0.0,
4381 "completion_length": 101.673828125,
4382 "epoch": 3.5977011494252875,
4383 "grad_norm": 4.249206728495501,
4384 "kl": 0.03857421875,
4385 "learning_rate": 6.402298850574712e-07,
4386 "loss": 0.0016,
4387 "reward": 1.891036868095398,
4388 "reward_std": 0.020312292501330376,
4389 "rewards/accuracy_reward": 0.8916880488395691,
4390 "rewards/format_reward": 0.9993489980697632,
4391 "step": 313
4392 },
4393 {
4394 "clip_ratio": 0.0,
4395 "completion_length": 101.59375,
4396 "epoch": 3.609195402298851,
4397 "grad_norm": 4.978124356714563,
4398 "kl": 0.03955078125,
4399 "learning_rate": 6.390804597701149e-07,
4400 "loss": 0.0017,
4401 "reward": 1.8778576850891113,
4402 "reward_std": 0.021446645259857178,
4403 "rewards/accuracy_reward": 0.8785087466239929,
4404 "rewards/format_reward": 0.9993489980697632,
4405 "step": 314
4406 },
4407 {
4408 "clip_ratio": 0.0,
4409 "completion_length": 101.49349212646484,
4410 "epoch": 3.6206896551724137,
4411 "grad_norm": 1.5867170646844477,
4412 "kl": 0.037353515625,
4413 "learning_rate": 6.379310344827587e-07,
4414 "loss": 0.0016,
4415 "reward": 1.898721694946289,
4416 "reward_std": 0.020030900835990906,
4417 "rewards/accuracy_reward": 0.8987216949462891,
4418 "rewards/format_reward": 1.0,
4419 "step": 315
4420 },
4421 {
4422 "clip_ratio": 0.0,
4423 "completion_length": 101.83659362792969,
4424 "epoch": 3.632183908045977,
4425 "grad_norm": 1.873279997651989,
4426 "kl": 0.041259765625,
4427 "learning_rate": 6.367816091954022e-07,
4428 "loss": 0.0017,
4429 "reward": 1.8622578382492065,
4430 "reward_std": 0.022315729409456253,
4431 "rewards/accuracy_reward": 0.8629088401794434,
4432 "rewards/format_reward": 0.9993489980697632,
4433 "step": 316
4434 },
4435 {
4436 "clip_ratio": 0.0,
4437 "completion_length": 103.33464050292969,
4438 "epoch": 3.6436781609195403,
4439 "grad_norm": 3.2650242705040964,
4440 "kl": 0.041748046875,
4441 "learning_rate": 6.35632183908046e-07,
4442 "loss": 0.0017,
4443 "reward": 1.886378288269043,
4444 "reward_std": 0.019871540367603302,
4445 "rewards/accuracy_reward": 0.8863782286643982,
4446 "rewards/format_reward": 1.0,
4447 "step": 317
4448 },
4449 {
4450 "clip_ratio": 0.0,
4451 "completion_length": 102.236328125,
4452 "epoch": 3.655172413793103,
4453 "grad_norm": 2.4548899206996175,
4454 "kl": 0.038818359375,
4455 "learning_rate": 6.344827586206897e-07,
4456 "loss": 0.0016,
4457 "reward": 1.8966128826141357,
4458 "reward_std": 0.023452280089259148,
4459 "rewards/accuracy_reward": 0.896612823009491,
4460 "rewards/format_reward": 1.0,
4461 "step": 318
4462 },
4463 {
4464 "clip_ratio": 0.0,
4465 "completion_length": 104.52409362792969,
4466 "epoch": 3.6666666666666665,
4467 "grad_norm": 1.753082960789312,
4468 "kl": 0.042236328125,
4469 "learning_rate": 6.333333333333332e-07,
4470 "loss": 0.0018,
4471 "reward": 1.8872454166412354,
4472 "reward_std": 0.02262219414114952,
4473 "rewards/accuracy_reward": 0.8878964185714722,
4474 "rewards/format_reward": 0.9993489980697632,
4475 "step": 319
4476 },
4477 {
4478 "clip_ratio": 0.0,
4479 "completion_length": 105.630859375,
4480 "epoch": 3.67816091954023,
4481 "grad_norm": 2.176188734436804,
4482 "kl": 0.038818359375,
4483 "learning_rate": 6.32183908045977e-07,
4484 "loss": 0.0016,
4485 "reward": 1.8845866918563843,
4486 "reward_std": 0.02126806601881981,
4487 "rewards/accuracy_reward": 0.8845868110656738,
4488 "rewards/format_reward": 1.0,
4489 "step": 320
4490 },
4491 {
4492 "clip_ratio": 0.0,
4493 "completion_length": 105.69140625,
4494 "epoch": 3.689655172413793,
4495 "grad_norm": 1.5506807446234716,
4496 "kl": 0.0390625,
4497 "learning_rate": 6.310344827586207e-07,
4498 "loss": 0.0016,
4499 "reward": 1.8919544219970703,
4500 "reward_std": 0.02176390215754509,
4501 "rewards/accuracy_reward": 0.8926056623458862,
4502 "rewards/format_reward": 0.9993489980697632,
4503 "step": 321
4504 },
4505 {
4506 "clip_ratio": 0.0,
4507 "completion_length": 108.40234375,
4508 "epoch": 3.7011494252873565,
4509 "grad_norm": 7.014431541777074,
4510 "kl": 0.03662109375,
4511 "learning_rate": 6.298850574712643e-07,
4512 "loss": 0.0015,
4513 "reward": 1.895882487297058,
4514 "reward_std": 0.020401567220687866,
4515 "rewards/accuracy_reward": 0.8958825469017029,
4516 "rewards/format_reward": 1.0,
4517 "step": 322
4518 },
4519 {
4520 "clip_ratio": 0.0,
4521 "completion_length": 108.291015625,
4522 "epoch": 3.7126436781609193,
4523 "grad_norm": 2.3982992480163206,
4524 "kl": 0.041259765625,
4525 "learning_rate": 6.28735632183908e-07,
4526 "loss": 0.0017,
4527 "reward": 1.8751071691513062,
4528 "reward_std": 0.02190619520843029,
4529 "rewards/accuracy_reward": 0.8751071691513062,
4530 "rewards/format_reward": 1.0,
4531 "step": 323
4532 },
4533 {
4534 "clip_ratio": 0.0,
4535 "completion_length": 106.45638275146484,
4536 "epoch": 3.7241379310344827,
4537 "grad_norm": 1.903714474914862,
4538 "kl": 0.043212890625,
4539 "learning_rate": 6.275862068965517e-07,
4540 "loss": 0.0018,
4541 "reward": 1.883380651473999,
4542 "reward_std": 0.02156060002744198,
4543 "rewards/accuracy_reward": 0.8840314745903015,
4544 "rewards/format_reward": 0.9993489980697632,
4545 "step": 324
4546 },
4547 {
4548 "clip_ratio": 0.0,
4549 "completion_length": 107.33724212646484,
4550 "epoch": 3.735632183908046,
4551 "grad_norm": 3.392518970641651,
4552 "kl": 0.03955078125,
4553 "learning_rate": 6.264367816091954e-07,
4554 "loss": 0.0017,
4555 "reward": 1.8731813430786133,
4556 "reward_std": 0.022301897406578064,
4557 "rewards/accuracy_reward": 0.8738325834274292,
4558 "rewards/format_reward": 0.9993489980697632,
4559 "step": 325
4560 },
4561 {
4562 "clip_ratio": 0.0,
4563 "completion_length": 109.10221862792969,
4564 "epoch": 3.7471264367816093,
4565 "grad_norm": 2.4944444297866886,
4566 "kl": 0.0400390625,
4567 "learning_rate": 6.25287356321839e-07,
4568 "loss": 0.0017,
4569 "reward": 1.8849986791610718,
4570 "reward_std": 0.025180388242006302,
4571 "rewards/accuracy_reward": 0.8863007426261902,
4572 "rewards/format_reward": 0.9986979365348816,
4573 "step": 326
4574 },
4575 {
4576 "clip_ratio": 0.0,
4577 "completion_length": 110.51692962646484,
4578 "epoch": 3.7586206896551726,
4579 "grad_norm": 1.5524974855638687,
4580 "kl": 0.048095703125,
4581 "learning_rate": 6.241379310344828e-07,
4582 "loss": 0.002,
4583 "reward": 1.8775360584259033,
4584 "reward_std": 0.020194731652736664,
4585 "rewards/accuracy_reward": 0.8781870603561401,
4586 "rewards/format_reward": 0.9993489980697632,
4587 "step": 327
4588 },
4589 {
4590 "clip_ratio": 0.0,
4591 "completion_length": 109.32487487792969,
4592 "epoch": 3.7701149425287355,
4593 "grad_norm": 1.4924502520471505,
4594 "kl": 0.044189453125,
4595 "learning_rate": 6.229885057471264e-07,
4596 "loss": 0.0018,
4597 "reward": 1.8867871761322021,
4598 "reward_std": 0.02044479362666607,
4599 "rewards/accuracy_reward": 0.8874382972717285,
4600 "rewards/format_reward": 0.9993489980697632,
4601 "step": 328
4602 },
4603 {
4604 "clip_ratio": 0.0,
4605 "completion_length": 110.1875,
4606 "epoch": 3.781609195402299,
4607 "grad_norm": 2.570142758125839,
4608 "kl": 0.04248046875,
4609 "learning_rate": 6.2183908045977e-07,
4610 "loss": 0.0017,
4611 "reward": 1.8741064071655273,
4612 "reward_std": 0.019452206790447235,
4613 "rewards/accuracy_reward": 0.8741063475608826,
4614 "rewards/format_reward": 1.0,
4615 "step": 329
4616 },
4617 {
4618 "clip_ratio": 0.0,
4619 "completion_length": 108.177734375,
4620 "epoch": 3.793103448275862,
4621 "grad_norm": 1.638699330431456,
4622 "kl": 0.048583984375,
4623 "learning_rate": 6.206896551724138e-07,
4624 "loss": 0.002,
4625 "reward": 1.883441686630249,
4626 "reward_std": 0.021484442055225372,
4627 "rewards/accuracy_reward": 0.8847436904907227,
4628 "rewards/format_reward": 0.9986979365348816,
4629 "step": 330
4630 },
4631 {
4632 "clip_ratio": 0.0,
4633 "completion_length": 109.916015625,
4634 "epoch": 3.8045977011494254,
4635 "grad_norm": 1.4709865788564034,
4636 "kl": 0.041015625,
4637 "learning_rate": 6.195402298850575e-07,
4638 "loss": 0.0017,
4639 "reward": 1.8892269134521484,
4640 "reward_std": 0.019907724112272263,
4641 "rewards/accuracy_reward": 0.8892270922660828,
4642 "rewards/format_reward": 1.0,
4643 "step": 331
4644 },
4645 {
4646 "clip_ratio": 0.0,
4647 "completion_length": 110.96224212646484,
4648 "epoch": 3.8160919540229887,
4649 "grad_norm": 3.682456578081511,
4650 "kl": 0.044189453125,
4651 "learning_rate": 6.183908045977011e-07,
4652 "loss": 0.0018,
4653 "reward": 1.8904244899749756,
4654 "reward_std": 0.017542677000164986,
4655 "rewards/accuracy_reward": 0.8904244303703308,
4656 "rewards/format_reward": 1.0,
4657 "step": 332
4658 },
4659 {
4660 "clip_ratio": 0.0,
4661 "completion_length": 110.03385925292969,
4662 "epoch": 3.8275862068965516,
4663 "grad_norm": 1.4011409988409274,
4664 "kl": 0.042724609375,
4665 "learning_rate": 6.172413793103448e-07,
4666 "loss": 0.0018,
4667 "reward": 1.8981231451034546,
4668 "reward_std": 0.019530044868588448,
4669 "rewards/accuracy_reward": 0.8981232047080994,
4670 "rewards/format_reward": 1.0,
4671 "step": 333
4672 },
4673 {
4674 "clip_ratio": 0.0,
4675 "completion_length": 110.95052337646484,
4676 "epoch": 3.839080459770115,
4677 "grad_norm": 2.57360289310368,
4678 "kl": 0.04296875,
4679 "learning_rate": 6.160919540229885e-07,
4680 "loss": 0.0018,
4681 "reward": 1.8822907209396362,
4682 "reward_std": 0.020302042365074158,
4683 "rewards/accuracy_reward": 0.882290780544281,
4684 "rewards/format_reward": 1.0,
4685 "step": 334
4686 },
4687 {
4688 "clip_ratio": 0.0,
4689 "completion_length": 109.95964050292969,
4690 "epoch": 3.8505747126436782,
4691 "grad_norm": 3.9665298261234136,
4692 "kl": 0.04833984375,
4693 "learning_rate": 6.149425287356322e-07,
4694 "loss": 0.002,
4695 "reward": 1.8941223621368408,
4696 "reward_std": 0.02138252556324005,
4697 "rewards/accuracy_reward": 0.8947733640670776,
4698 "rewards/format_reward": 0.9993489980697632,
4699 "step": 335
4700 },
4701 {
4702 "clip_ratio": 0.0,
4703 "completion_length": 111.89974212646484,
4704 "epoch": 3.862068965517241,
4705 "grad_norm": 2.706717328267551,
4706 "kl": 0.046875,
4707 "learning_rate": 6.137931034482758e-07,
4708 "loss": 0.0019,
4709 "reward": 1.8608148097991943,
4710 "reward_std": 0.02197238616645336,
4711 "rewards/accuracy_reward": 0.8608149886131287,
4712 "rewards/format_reward": 1.0,
4713 "step": 336
4714 },
4715 {
4716 "clip_ratio": 0.0,
4717 "completion_length": 111.17839050292969,
4718 "epoch": 3.873563218390805,
4719 "grad_norm": 2.9331158810141704,
4720 "kl": 0.043212890625,
4721 "learning_rate": 6.126436781609195e-07,
4722 "loss": 0.0018,
4723 "reward": 1.877394676208496,
4724 "reward_std": 0.022260598838329315,
4725 "rewards/accuracy_reward": 0.8773946762084961,
4726 "rewards/format_reward": 1.0,
4727 "step": 337
4728 },
4729 {
4730 "clip_ratio": 0.0,
4731 "completion_length": 109.91732025146484,
4732 "epoch": 3.8850574712643677,
4733 "grad_norm": 1.7505237848600723,
4734 "kl": 0.046875,
4735 "learning_rate": 6.114942528735632e-07,
4736 "loss": 0.002,
4737 "reward": 1.8993761539459229,
4738 "reward_std": 0.01834731549024582,
4739 "rewards/accuracy_reward": 0.8993761539459229,
4740 "rewards/format_reward": 1.0,
4741 "step": 338
4742 },
4743 {
4744 "clip_ratio": 0.0,
4745 "completion_length": 110.50456237792969,
4746 "epoch": 3.896551724137931,
4747 "grad_norm": 1.8091603678916988,
4748 "kl": 0.04541015625,
4749 "learning_rate": 6.103448275862068e-07,
4750 "loss": 0.0019,
4751 "reward": 1.8772138357162476,
4752 "reward_std": 0.02404957078397274,
4753 "rewards/accuracy_reward": 0.8778649568557739,
4754 "rewards/format_reward": 0.9993489980697632,
4755 "step": 339
4756 },
4757 {
4758 "clip_ratio": 0.0,
4759 "completion_length": 109.89453125,
4760 "epoch": 3.9080459770114944,
4761 "grad_norm": 2.7420144960936064,
4762 "kl": 0.0439453125,
4763 "learning_rate": 6.091954022988506e-07,
4764 "loss": 0.0018,
4765 "reward": 1.8750686645507812,
4766 "reward_std": 0.02069086581468582,
4767 "rewards/accuracy_reward": 0.8750687837600708,
4768 "rewards/format_reward": 1.0,
4769 "step": 340
4770 },
4771 {
4772 "clip_ratio": 0.0,
4773 "completion_length": 108.76692962646484,
4774 "epoch": 3.9195402298850572,
4775 "grad_norm": 1.3320968254982721,
4776 "kl": 0.042724609375,
4777 "learning_rate": 6.080459770114942e-07,
4778 "loss": 0.0017,
4779 "reward": 1.8823789358139038,
4780 "reward_std": 0.020603572949767113,
4781 "rewards/accuracy_reward": 0.8823789358139038,
4782 "rewards/format_reward": 1.0,
4783 "step": 341
4784 },
4785 {
4786 "clip_ratio": 0.0,
4787 "completion_length": 108.79232025146484,
4788 "epoch": 3.9310344827586206,
4789 "grad_norm": 2.6292451489584363,
4790 "kl": 0.042236328125,
4791 "learning_rate": 6.068965517241379e-07,
4792 "loss": 0.0017,
4793 "reward": 1.8959369659423828,
4794 "reward_std": 0.02122236043214798,
4795 "rewards/accuracy_reward": 0.8965880274772644,
4796 "rewards/format_reward": 0.9993489980697632,
4797 "step": 342
4798 },
4799 {
4800 "clip_ratio": 0.0,
4801 "completion_length": 110.26237487792969,
4802 "epoch": 3.942528735632184,
4803 "grad_norm": 1.1786665172082536,
4804 "kl": 0.041748046875,
4805 "learning_rate": 6.057471264367816e-07,
4806 "loss": 0.0017,
4807 "reward": 1.8724334239959717,
4808 "reward_std": 0.02036610245704651,
4809 "rewards/accuracy_reward": 0.8724333643913269,
4810 "rewards/format_reward": 1.0,
4811 "step": 343
4812 },
4813 {
4814 "clip_ratio": 0.0,
4815 "completion_length": 108.0703125,
4816 "epoch": 3.954022988505747,
4817 "grad_norm": 1.8258343549622353,
4818 "kl": 0.041015625,
4819 "learning_rate": 6.045977011494252e-07,
4820 "loss": 0.0017,
4821 "reward": 1.8895347118377686,
4822 "reward_std": 0.021330809220671654,
4823 "rewards/accuracy_reward": 0.8901857137680054,
4824 "rewards/format_reward": 0.9993489980697632,
4825 "step": 344
4826 },
4827 {
4828 "clip_ratio": 0.0,
4829 "completion_length": 107.08984375,
4830 "epoch": 3.9655172413793105,
4831 "grad_norm": 1.2316728903522134,
4832 "kl": 0.04150390625,
4833 "learning_rate": 6.03448275862069e-07,
4834 "loss": 0.0017,
4835 "reward": 1.8853051662445068,
4836 "reward_std": 0.020341580733656883,
4837 "rewards/accuracy_reward": 0.8853051662445068,
4838 "rewards/format_reward": 1.0,
4839 "step": 345
4840 },
4841 {
4842 "clip_ratio": 0.0,
4843 "completion_length": 108.59765625,
4844 "epoch": 3.9770114942528734,
4845 "grad_norm": 3.7347087879528753,
4846 "kl": 0.0498046875,
4847 "learning_rate": 6.022988505747126e-07,
4848 "loss": 0.0021,
4849 "reward": 1.8948701620101929,
4850 "reward_std": 0.020036697387695312,
4851 "rewards/accuracy_reward": 0.8948701620101929,
4852 "rewards/format_reward": 1.0,
4853 "step": 346
4854 },
4855 {
4856 "clip_ratio": 0.0,
4857 "completion_length": 105.96745300292969,
4858 "epoch": 3.9885057471264367,
4859 "grad_norm": 1.374220142289457,
4860 "kl": 0.047119140625,
4861 "learning_rate": 6.011494252873563e-07,
4862 "loss": 0.002,
4863 "reward": 1.878260612487793,
4864 "reward_std": 0.020665448158979416,
4865 "rewards/accuracy_reward": 0.8782605528831482,
4866 "rewards/format_reward": 1.0,
4867 "step": 347
4868 },
4869 {
4870 "clip_ratio": 0.0,
4871 "completion_length": 99.40168762207031,
4872 "epoch": 4.0,
4873 "grad_norm": 1.3796700468562093,
4874 "kl": 0.0361328125,
4875 "learning_rate": 6e-07,
4876 "loss": 0.0015,
4877 "reward": 1.8900009393692017,
4878 "reward_std": 0.0169126745313406,
4879 "rewards/accuracy_reward": 0.890001118183136,
4880 "rewards/format_reward": 1.0,
4881 "step": 348
4882 },
4883 {
4884 "clip_ratio": 0.0,
4885 "completion_length": 108.21484375,
4886 "epoch": 4.011494252873563,
4887 "grad_norm": 1.4078584124283187,
4888 "kl": 0.036376953125,
4889 "learning_rate": 5.988505747126437e-07,
4890 "loss": 0.0015,
4891 "reward": 1.861340880393982,
4892 "reward_std": 0.024126818403601646,
4893 "rewards/accuracy_reward": 0.8613407611846924,
4894 "rewards/format_reward": 1.0,
4895 "step": 349
4896 },
4897 {
4898 "clip_ratio": 0.0,
4899 "completion_length": 107.14388275146484,
4900 "epoch": 4.022988505747127,
4901 "grad_norm": 1.9884071163588437,
4902 "kl": 0.044189453125,
4903 "learning_rate": 5.977011494252874e-07,
4904 "loss": 0.0019,
4905 "reward": 1.8884938955307007,
4906 "reward_std": 0.02110714465379715,
4907 "rewards/accuracy_reward": 0.8884940147399902,
4908 "rewards/format_reward": 1.0,
4909 "step": 350
4910 },
4911 {
4912 "clip_ratio": 0.0,
4913 "completion_length": 107.16341400146484,
4914 "epoch": 4.0344827586206895,
4915 "grad_norm": 1.4221350160981188,
4916 "kl": 0.036376953125,
4917 "learning_rate": 5.96551724137931e-07,
4918 "loss": 0.0015,
4919 "reward": 1.8791723251342773,
4920 "reward_std": 0.022562285885214806,
4921 "rewards/accuracy_reward": 0.8791724443435669,
4922 "rewards/format_reward": 1.0,
4923 "step": 351
4924 },
4925 {
4926 "clip_ratio": 0.0,
4927 "completion_length": 106.138671875,
4928 "epoch": 4.045977011494253,
4929 "grad_norm": 1.654901894003328,
4930 "kl": 0.038818359375,
4931 "learning_rate": 5.954022988505747e-07,
4932 "loss": 0.0016,
4933 "reward": 1.8821697235107422,
4934 "reward_std": 0.02151145413517952,
4935 "rewards/accuracy_reward": 0.8821697235107422,
4936 "rewards/format_reward": 1.0,
4937 "step": 352
4938 },
4939 {
4940 "clip_ratio": 0.0,
4941 "completion_length": 105.92839050292969,
4942 "epoch": 4.057471264367816,
4943 "grad_norm": 1.5963537501741232,
4944 "kl": 0.039306640625,
4945 "learning_rate": 5.942528735632184e-07,
4946 "loss": 0.0016,
4947 "reward": 1.8790719509124756,
4948 "reward_std": 0.021063879132270813,
4949 "rewards/accuracy_reward": 0.8797230124473572,
4950 "rewards/format_reward": 0.9993489980697632,
4951 "step": 353
4952 },
4953 {
4954 "clip_ratio": 0.0,
4955 "completion_length": 107.041015625,
4956 "epoch": 4.068965517241379,
4957 "grad_norm": 1.8114870192410322,
4958 "kl": 0.03759765625,
4959 "learning_rate": 5.93103448275862e-07,
4960 "loss": 0.0016,
4961 "reward": 1.865633487701416,
4962 "reward_std": 0.0278116874396801,
4963 "rewards/accuracy_reward": 0.8669356107711792,
4964 "rewards/format_reward": 0.9986979365348816,
4965 "step": 354
4966 },
4967 {
4968 "clip_ratio": 0.0,
4969 "completion_length": 106.82357025146484,
4970 "epoch": 4.080459770114943,
4971 "grad_norm": 2.8490723955207993,
4972 "kl": 0.037841796875,
4973 "learning_rate": 5.919540229885057e-07,
4974 "loss": 0.0016,
4975 "reward": 1.8946452140808105,
4976 "reward_std": 0.021742399781942368,
4977 "rewards/accuracy_reward": 0.8946452140808105,
4978 "rewards/format_reward": 1.0,
4979 "step": 355
4980 },
4981 {
4982 "clip_ratio": 0.0,
4983 "completion_length": 107.27474212646484,
4984 "epoch": 4.091954022988506,
4985 "grad_norm": 1.5566047273728554,
4986 "kl": 0.037353515625,
4987 "learning_rate": 5.908045977011494e-07,
4988 "loss": 0.0016,
4989 "reward": 1.875108242034912,
4990 "reward_std": 0.021865393966436386,
4991 "rewards/accuracy_reward": 0.8751082420349121,
4992 "rewards/format_reward": 1.0,
4993 "step": 356
4994 },
4995 {
4996 "clip_ratio": 0.0,
4997 "completion_length": 107.48763275146484,
4998 "epoch": 4.103448275862069,
4999 "grad_norm": 1.9776553067288296,
5000 "kl": 0.03564453125,
5001 "learning_rate": 5.89655172413793e-07,
5002 "loss": 0.0015,
5003 "reward": 1.881319284439087,
5004 "reward_std": 0.024803385138511658,
5005 "rewards/accuracy_reward": 0.8813192248344421,
5006 "rewards/format_reward": 1.0,
5007 "step": 357
5008 },
5009 {
5010 "clip_ratio": 0.0,
5011 "completion_length": 109.21940612792969,
5012 "epoch": 4.114942528735632,
5013 "grad_norm": 1.6859022821903558,
5014 "kl": 0.0361328125,
5015 "learning_rate": 5.885057471264368e-07,
5016 "loss": 0.0015,
5017 "reward": 1.86405348777771,
5018 "reward_std": 0.023439563810825348,
5019 "rewards/accuracy_reward": 0.8653554916381836,
5020 "rewards/format_reward": 0.9986979365348816,
5021 "step": 358
5022 },
5023 {
5024 "clip_ratio": 0.0,
5025 "completion_length": 106.50521087646484,
5026 "epoch": 4.126436781609195,
5027 "grad_norm": 2.4612604405299474,
5028 "kl": 0.03955078125,
5029 "learning_rate": 5.873563218390805e-07,
5030 "loss": 0.0017,
5031 "reward": 1.8807547092437744,
5032 "reward_std": 0.02118275687098503,
5033 "rewards/accuracy_reward": 0.880754828453064,
5034 "rewards/format_reward": 1.0,
5035 "step": 359
5036 },
5037 {
5038 "clip_ratio": 0.0,
5039 "completion_length": 108.39714050292969,
5040 "epoch": 4.137931034482759,
5041 "grad_norm": 1.6609596832825777,
5042 "kl": 0.035888671875,
5043 "learning_rate": 5.86206896551724e-07,
5044 "loss": 0.0015,
5045 "reward": 1.8441890478134155,
5046 "reward_std": 0.023664182052016258,
5047 "rewards/accuracy_reward": 0.8441890478134155,
5048 "rewards/format_reward": 1.0,
5049 "step": 360
5050 },
5051 {
5052 "clip_ratio": 0.0,
5053 "completion_length": 109.435546875,
5054 "epoch": 4.149425287356322,
5055 "grad_norm": 2.5111143452832625,
5056 "kl": 0.034912109375,
5057 "learning_rate": 5.850574712643678e-07,
5058 "loss": 0.0015,
5059 "reward": 1.8899774551391602,
5060 "reward_std": 0.026515571400523186,
5061 "rewards/accuracy_reward": 0.8906285762786865,
5062 "rewards/format_reward": 0.9993489980697632,
5063 "step": 361
5064 },
5065 {
5066 "clip_ratio": 0.0,
5067 "completion_length": 108.841796875,
5068 "epoch": 4.160919540229885,
5069 "grad_norm": 3.9427924775939784,
5070 "kl": 0.037353515625,
5071 "learning_rate": 5.839080459770115e-07,
5072 "loss": 0.0016,
5073 "reward": 1.8844678401947021,
5074 "reward_std": 0.02146860770881176,
5075 "rewards/accuracy_reward": 0.8844677805900574,
5076 "rewards/format_reward": 1.0,
5077 "step": 362
5078 },
5079 {
5080 "clip_ratio": 0.0,
5081 "completion_length": 106.84765625,
5082 "epoch": 4.172413793103448,
5083 "grad_norm": 1.6920489480855494,
5084 "kl": 0.037109375,
5085 "learning_rate": 5.827586206896552e-07,
5086 "loss": 0.0016,
5087 "reward": 1.8880507946014404,
5088 "reward_std": 0.022687291726469994,
5089 "rewards/accuracy_reward": 0.8880506753921509,
5090 "rewards/format_reward": 1.0,
5091 "step": 363
5092 },
5093 {
5094 "clip_ratio": 0.0,
5095 "completion_length": 109.06315612792969,
5096 "epoch": 4.183908045977011,
5097 "grad_norm": 2.152778241207781,
5098 "kl": 0.037353515625,
5099 "learning_rate": 5.816091954022988e-07,
5100 "loss": 0.0016,
5101 "reward": 1.8860154151916504,
5102 "reward_std": 0.02288450300693512,
5103 "rewards/accuracy_reward": 0.8860155940055847,
5104 "rewards/format_reward": 1.0,
5105 "step": 364
5106 },
5107 {
5108 "clip_ratio": 0.0,
5109 "completion_length": 109.43034362792969,
5110 "epoch": 4.195402298850575,
5111 "grad_norm": 1.3564134180585476,
5112 "kl": 0.037353515625,
5113 "learning_rate": 5.804597701149425e-07,
5114 "loss": 0.0015,
5115 "reward": 1.8712117671966553,
5116 "reward_std": 0.02276797592639923,
5117 "rewards/accuracy_reward": 0.8712117671966553,
5118 "rewards/format_reward": 1.0,
5119 "step": 365
5120 },
5121 {
5122 "clip_ratio": 0.0,
5123 "completion_length": 105.294921875,
5124 "epoch": 4.206896551724138,
5125 "grad_norm": 1.1730079534850056e+29,
5126 "kl": 2.937689741663549e+26,
5127 "learning_rate": 5.793103448275862e-07,
5128 "loss": 1.1743651528396316e+25,
5129 "reward": 1.8752689361572266,
5130 "reward_std": 0.025293130427598953,
5131 "rewards/accuracy_reward": 0.8765711784362793,
5132 "rewards/format_reward": 0.9986979365348816,
5133 "step": 366
5134 },
5135 {
5136 "clip_ratio": 0.0,
5137 "completion_length": 106.33203125,
5138 "epoch": 4.218390804597701,
5139 "grad_norm": 1.492960779675916,
5140 "kl": 0.038818359375,
5141 "learning_rate": 5.781609195402298e-07,
5142 "loss": 0.0016,
5143 "reward": 1.8979356288909912,
5144 "reward_std": 0.023821339011192322,
5145 "rewards/accuracy_reward": 0.8985867500305176,
5146 "rewards/format_reward": 0.9993489980697632,
5147 "step": 367
5148 },
5149 {
5150 "clip_ratio": 0.0,
5151 "completion_length": 105.62630462646484,
5152 "epoch": 4.2298850574712645,
5153 "grad_norm": 1.4743009917575458,
5154 "kl": 0.040283203125,
5155 "learning_rate": 5.770114942528736e-07,
5156 "loss": 0.0017,
5157 "reward": 1.9055975675582886,
5158 "reward_std": 0.019971728324890137,
5159 "rewards/accuracy_reward": 0.9055975675582886,
5160 "rewards/format_reward": 1.0,
5161 "step": 368
5162 },
5163 {
5164 "clip_ratio": 0.0,
5165 "completion_length": 107.99674987792969,
5166 "epoch": 4.241379310344827,
5167 "grad_norm": 3.9302261665663982,
5168 "kl": 0.0400390625,
5169 "learning_rate": 5.758620689655173e-07,
5170 "loss": 0.0016,
5171 "reward": 1.8823680877685547,
5172 "reward_std": 0.022455014288425446,
5173 "rewards/accuracy_reward": 0.8823680877685547,
5174 "rewards/format_reward": 1.0,
5175 "step": 369
5176 },
5177 {
5178 "clip_ratio": 0.0,
5179 "completion_length": 106.39583587646484,
5180 "epoch": 4.252873563218391,
5181 "grad_norm": 2.0938326905522238,
5182 "kl": 0.0400390625,
5183 "learning_rate": 5.747126436781608e-07,
5184 "loss": 0.0016,
5185 "reward": 1.8727083206176758,
5186 "reward_std": 0.02592053823173046,
5187 "rewards/accuracy_reward": 0.8733593821525574,
5188 "rewards/format_reward": 0.9993489980697632,
5189 "step": 370
5190 },
5191 {
5192 "clip_ratio": 0.0,
5193 "completion_length": 107.81706237792969,
5194 "epoch": 4.264367816091954,
5195 "grad_norm": 2.888553640412068,
5196 "kl": 0.040771484375,
5197 "learning_rate": 5.735632183908046e-07,
5198 "loss": 0.0017,
5199 "reward": 1.882491111755371,
5200 "reward_std": 0.020577851682901382,
5201 "rewards/accuracy_reward": 0.8824911117553711,
5202 "rewards/format_reward": 1.0,
5203 "step": 371
5204 },
5205 {
5206 "clip_ratio": 0.0,
5207 "completion_length": 105.640625,
5208 "epoch": 4.275862068965517,
5209 "grad_norm": 1.4686082294156166,
5210 "kl": 0.051513671875,
5211 "learning_rate": 5.724137931034483e-07,
5212 "loss": 0.0021,
5213 "reward": 1.882289171218872,
5214 "reward_std": 0.023612579330801964,
5215 "rewards/accuracy_reward": 0.8822891712188721,
5216 "rewards/format_reward": 1.0,
5217 "step": 372
5218 },
5219 {
5220 "clip_ratio": 0.0,
5221 "completion_length": 107.6875,
5222 "epoch": 4.287356321839081,
5223 "grad_norm": 2.244464531990695,
5224 "kl": 0.040283203125,
5225 "learning_rate": 5.712643678160918e-07,
5226 "loss": 0.0017,
5227 "reward": 1.886061429977417,
5228 "reward_std": 0.023760950192809105,
5229 "rewards/accuracy_reward": 0.8867123126983643,
5230 "rewards/format_reward": 0.9993489980697632,
5231 "step": 373
5232 },
5233 {
5234 "clip_ratio": 0.0,
5235 "completion_length": 108.7734375,
5236 "epoch": 4.2988505747126435,
5237 "grad_norm": 1.5023210240131932,
5238 "kl": 0.040771484375,
5239 "learning_rate": 5.701149425287356e-07,
5240 "loss": 0.0017,
5241 "reward": 1.8881802558898926,
5242 "reward_std": 0.019625011831521988,
5243 "rewards/accuracy_reward": 0.8881802558898926,
5244 "rewards/format_reward": 1.0,
5245 "step": 374
5246 },
5247 {
5248 "clip_ratio": 0.0,
5249 "completion_length": 108.12044525146484,
5250 "epoch": 4.310344827586207,
5251 "grad_norm": 1.757713278971671,
5252 "kl": 0.038330078125,
5253 "learning_rate": 5.689655172413793e-07,
5254 "loss": 0.0016,
5255 "reward": 1.8772163391113281,
5256 "reward_std": 0.022540029138326645,
5257 "rewards/accuracy_reward": 0.8772165179252625,
5258 "rewards/format_reward": 1.0,
5259 "step": 375
5260 },
5261 {
5262 "clip_ratio": 0.0,
5263 "completion_length": 107.61458587646484,
5264 "epoch": 4.32183908045977,
5265 "grad_norm": 1.6452684635685983,
5266 "kl": 0.038818359375,
5267 "learning_rate": 5.678160919540229e-07,
5268 "loss": 0.0016,
5269 "reward": 1.8954042196273804,
5270 "reward_std": 0.02163463830947876,
5271 "rewards/accuracy_reward": 0.8967063426971436,
5272 "rewards/format_reward": 0.9986979365348816,
5273 "step": 376
5274 },
5275 {
5276 "clip_ratio": 0.0,
5277 "completion_length": 107.05208587646484,
5278 "epoch": 4.333333333333333,
5279 "grad_norm": 2.2555305608969154,
5280 "kl": 0.03955078125,
5281 "learning_rate": 5.666666666666666e-07,
5282 "loss": 0.0016,
5283 "reward": 1.8833587169647217,
5284 "reward_std": 0.021704208105802536,
5285 "rewards/accuracy_reward": 0.8833586573600769,
5286 "rewards/format_reward": 1.0,
5287 "step": 377
5288 },
5289 {
5290 "clip_ratio": 0.0,
5291 "completion_length": 107.78450775146484,
5292 "epoch": 4.344827586206897,
5293 "grad_norm": 2.212131167395634,
5294 "kl": 0.04541015625,
5295 "learning_rate": 5.655172413793103e-07,
5296 "loss": 0.0019,
5297 "reward": 1.8870363235473633,
5298 "reward_std": 0.020737424492836,
5299 "rewards/accuracy_reward": 0.8870364427566528,
5300 "rewards/format_reward": 1.0,
5301 "step": 378
5302 },
5303 {
5304 "clip_ratio": 0.0,
5305 "completion_length": 107.40104675292969,
5306 "epoch": 4.35632183908046,
5307 "grad_norm": 1.529590167352119,
5308 "kl": 0.045654296875,
5309 "learning_rate": 5.643678160919541e-07,
5310 "loss": 0.0019,
5311 "reward": 1.8749973773956299,
5312 "reward_std": 0.018837254494428635,
5313 "rewards/accuracy_reward": 0.8749972581863403,
5314 "rewards/format_reward": 1.0,
5315 "step": 379
5316 },
5317 {
5318 "clip_ratio": 0.0,
5319 "completion_length": 106.90495300292969,
5320 "epoch": 4.3678160919540225,
5321 "grad_norm": 2.1938200030551385,
5322 "kl": 0.04248046875,
5323 "learning_rate": 5.632183908045976e-07,
5324 "loss": 0.0018,
5325 "reward": 1.8977670669555664,
5326 "reward_std": 0.02074752375483513,
5327 "rewards/accuracy_reward": 0.8977672457695007,
5328 "rewards/format_reward": 1.0,
5329 "step": 380
5330 },
5331 {
5332 "clip_ratio": 0.0,
5333 "completion_length": 107.07096862792969,
5334 "epoch": 4.379310344827586,
5335 "grad_norm": 3.9533429205437813,
5336 "kl": 0.047119140625,
5337 "learning_rate": 5.620689655172414e-07,
5338 "loss": 0.0019,
5339 "reward": 1.9117012023925781,
5340 "reward_std": 0.02050834149122238,
5341 "rewards/accuracy_reward": 0.9117013216018677,
5342 "rewards/format_reward": 1.0,
5343 "step": 381
5344 },
5345 {
5346 "clip_ratio": 0.0,
5347 "completion_length": 105.44140625,
5348 "epoch": 4.390804597701149,
5349 "grad_norm": 2.4576931014163828,
5350 "kl": 0.04150390625,
5351 "learning_rate": 5.609195402298851e-07,
5352 "loss": 0.0017,
5353 "reward": 1.8892667293548584,
5354 "reward_std": 0.024918202310800552,
5355 "rewards/accuracy_reward": 0.8899178504943848,
5356 "rewards/format_reward": 0.9993489980697632,
5357 "step": 382
5358 },
5359 {
5360 "clip_ratio": 0.0,
5361 "completion_length": 105.494140625,
5362 "epoch": 4.402298850574713,
5363 "grad_norm": 2.1902999394095324,
5364 "kl": 0.04150390625,
5365 "learning_rate": 5.597701149425286e-07,
5366 "loss": 0.0017,
5367 "reward": 1.9016425609588623,
5368 "reward_std": 0.018868938088417053,
5369 "rewards/accuracy_reward": 0.9016425013542175,
5370 "rewards/format_reward": 1.0,
5371 "step": 383
5372 },
5373 {
5374 "clip_ratio": 0.0,
5375 "completion_length": 105.78515625,
5376 "epoch": 4.413793103448276,
5377 "grad_norm": 1.883452563889067,
5378 "kl": 0.041259765625,
5379 "learning_rate": 5.586206896551724e-07,
5380 "loss": 0.0017,
5381 "reward": 1.8980283737182617,
5382 "reward_std": 0.021328266710042953,
5383 "rewards/accuracy_reward": 0.8986794352531433,
5384 "rewards/format_reward": 0.9993489980697632,
5385 "step": 384
5386 },
5387 {
5388 "clip_ratio": 0.0,
5389 "completion_length": 104.46745300292969,
5390 "epoch": 4.425287356321839,
5391 "grad_norm": 2.3714294651121084,
5392 "kl": 0.042724609375,
5393 "learning_rate": 5.574712643678161e-07,
5394 "loss": 0.0017,
5395 "reward": 1.8801054954528809,
5396 "reward_std": 0.02180486172437668,
5397 "rewards/accuracy_reward": 0.8801054954528809,
5398 "rewards/format_reward": 1.0,
5399 "step": 385
5400 },
5401 {
5402 "clip_ratio": 0.0,
5403 "completion_length": 102.17057800292969,
5404 "epoch": 4.436781609195402,
5405 "grad_norm": 1.3480188133058681,
5406 "kl": 0.044189453125,
5407 "learning_rate": 5.563218390804598e-07,
5408 "loss": 0.0018,
5409 "reward": 1.9038050174713135,
5410 "reward_std": 0.023260444402694702,
5411 "rewards/accuracy_reward": 0.9044560194015503,
5412 "rewards/format_reward": 0.9993489980697632,
5413 "step": 386
5414 },
5415 {
5416 "clip_ratio": 0.0,
5417 "completion_length": 103.60807800292969,
5418 "epoch": 4.448275862068965,
5419 "grad_norm": 1.558932018548512,
5420 "kl": 0.040283203125,
5421 "learning_rate": 5.551724137931034e-07,
5422 "loss": 0.0016,
5423 "reward": 1.883833646774292,
5424 "reward_std": 0.021813951432704926,
5425 "rewards/accuracy_reward": 0.883833646774292,
5426 "rewards/format_reward": 1.0,
5427 "step": 387
5428 },
5429 {
5430 "clip_ratio": 0.0,
5431 "completion_length": 104.23372650146484,
5432 "epoch": 4.459770114942529,
5433 "grad_norm": 3.4341748113797994,
5434 "kl": 0.042724609375,
5435 "learning_rate": 5.540229885057471e-07,
5436 "loss": 0.0018,
5437 "reward": 1.8973731994628906,
5438 "reward_std": 0.021094098687171936,
5439 "rewards/accuracy_reward": 0.8973731398582458,
5440 "rewards/format_reward": 1.0,
5441 "step": 388
5442 },
5443 {
5444 "clip_ratio": 0.0,
5445 "completion_length": 102.3828125,
5446 "epoch": 4.471264367816092,
5447 "grad_norm": 2.3537727105401713,
5448 "kl": 0.045654296875,
5449 "learning_rate": 5.528735632183908e-07,
5450 "loss": 0.0019,
5451 "reward": 1.9044450521469116,
5452 "reward_std": 0.02275899611413479,
5453 "rewards/accuracy_reward": 0.9050959944725037,
5454 "rewards/format_reward": 0.9993489980697632,
5455 "step": 389
5456 },
5457 {
5458 "clip_ratio": 0.0,
5459 "completion_length": 102.58528900146484,
5460 "epoch": 4.482758620689655,
5461 "grad_norm": 2.091387625283226,
5462 "kl": 0.044921875,
5463 "learning_rate": 5.517241379310344e-07,
5464 "loss": 0.0019,
5465 "reward": 1.8750637769699097,
5466 "reward_std": 0.02193189226090908,
5467 "rewards/accuracy_reward": 0.8750636577606201,
5468 "rewards/format_reward": 1.0,
5469 "step": 390
5470 },
5471 {
5472 "clip_ratio": 0.0,
5473 "completion_length": 101.68489837646484,
5474 "epoch": 4.494252873563219,
5475 "grad_norm": 1.563517095069176,
5476 "kl": 0.0439453125,
5477 "learning_rate": 5.505747126436782e-07,
5478 "loss": 0.0018,
5479 "reward": 1.895681381225586,
5480 "reward_std": 0.021227214485406876,
5481 "rewards/accuracy_reward": 0.8956813812255859,
5482 "rewards/format_reward": 1.0,
5483 "step": 391
5484 },
5485 {
5486 "clip_ratio": 0.0,
5487 "completion_length": 101.5703125,
5488 "epoch": 4.505747126436781,
5489 "grad_norm": 1.4561203507859735,
5490 "kl": 0.044677734375,
5491 "learning_rate": 5.494252873563218e-07,
5492 "loss": 0.0018,
5493 "reward": 1.903104543685913,
5494 "reward_std": 0.019182192161679268,
5495 "rewards/accuracy_reward": 0.9031044840812683,
5496 "rewards/format_reward": 1.0,
5497 "step": 392
5498 },
5499 {
5500 "clip_ratio": 0.0,
5501 "completion_length": 103.791015625,
5502 "epoch": 4.517241379310345,
5503 "grad_norm": 2.20068200415075,
5504 "kl": 0.03955078125,
5505 "learning_rate": 5.482758620689654e-07,
5506 "loss": 0.0016,
5507 "reward": 1.8813633918762207,
5508 "reward_std": 0.02065703272819519,
5509 "rewards/accuracy_reward": 0.8813633918762207,
5510 "rewards/format_reward": 1.0,
5511 "step": 393
5512 },
5513 {
5514 "clip_ratio": 0.0,
5515 "completion_length": 102.048828125,
5516 "epoch": 4.528735632183908,
5517 "grad_norm": 2.153835030110258,
5518 "kl": 0.0400390625,
5519 "learning_rate": 5.471264367816092e-07,
5520 "loss": 0.0017,
5521 "reward": 1.8950791358947754,
5522 "reward_std": 0.018896300345659256,
5523 "rewards/accuracy_reward": 0.8950790762901306,
5524 "rewards/format_reward": 1.0,
5525 "step": 394
5526 },
5527 {
5528 "clip_ratio": 0.0,
5529 "completion_length": 105.44206237792969,
5530 "epoch": 4.540229885057471,
5531 "grad_norm": 1.8689205868418028,
5532 "kl": 0.04931640625,
5533 "learning_rate": 5.459770114942529e-07,
5534 "loss": 0.002,
5535 "reward": 1.8724088668823242,
5536 "reward_std": 0.022876422852277756,
5537 "rewards/accuracy_reward": 0.8730599880218506,
5538 "rewards/format_reward": 0.9993489980697632,
5539 "step": 395
5540 },
5541 {
5542 "clip_ratio": 0.0,
5543 "completion_length": 105.875,
5544 "epoch": 4.551724137931035,
5545 "grad_norm": 1.6153380302756586,
5546 "kl": 0.040283203125,
5547 "learning_rate": 5.448275862068966e-07,
5548 "loss": 0.0017,
5549 "reward": 1.8891279697418213,
5550 "reward_std": 0.022592080757021904,
5551 "rewards/accuracy_reward": 0.8897790312767029,
5552 "rewards/format_reward": 0.9993489980697632,
5553 "step": 396
5554 },
5555 {
5556 "clip_ratio": 0.0,
5557 "completion_length": 105.71614837646484,
5558 "epoch": 4.563218390804598,
5559 "grad_norm": 1.5101980917336142,
5560 "kl": 0.039306640625,
5561 "learning_rate": 5.436781609195402e-07,
5562 "loss": 0.0016,
5563 "reward": 1.8952821493148804,
5564 "reward_std": 0.02106665074825287,
5565 "rewards/accuracy_reward": 0.8952820301055908,
5566 "rewards/format_reward": 1.0,
5567 "step": 397
5568 },
5569 {
5570 "clip_ratio": 0.0,
5571 "completion_length": 108.232421875,
5572 "epoch": 4.574712643678161,
5573 "grad_norm": 1.4461465189423137,
5574 "kl": 0.040283203125,
5575 "learning_rate": 5.425287356321839e-07,
5576 "loss": 0.0017,
5577 "reward": 1.8775584697723389,
5578 "reward_std": 0.025320343673229218,
5579 "rewards/accuracy_reward": 0.8782094717025757,
5580 "rewards/format_reward": 0.9993489980697632,
5581 "step": 398
5582 },
5583 {
5584 "clip_ratio": 0.0,
5585 "completion_length": 108.45052337646484,
5586 "epoch": 4.586206896551724,
5587 "grad_norm": 1.2782259292486342,
5588 "kl": 0.038818359375,
5589 "learning_rate": 5.413793103448276e-07,
5590 "loss": 0.0016,
5591 "reward": 1.8783025741577148,
5592 "reward_std": 0.024571552872657776,
5593 "rewards/accuracy_reward": 0.8789536356925964,
5594 "rewards/format_reward": 0.9993489980697632,
5595 "step": 399
5596 },
5597 {
5598 "clip_ratio": 0.0,
5599 "completion_length": 109.37435150146484,
5600 "epoch": 4.597701149425287,
5601 "grad_norm": 4.341249687805903,
5602 "kl": 0.03955078125,
5603 "learning_rate": 5.402298850574712e-07,
5604 "loss": 0.0016,
5605 "reward": 1.8882777690887451,
5606 "reward_std": 0.025847896933555603,
5607 "rewards/accuracy_reward": 0.8895797729492188,
5608 "rewards/format_reward": 0.9986979365348816,
5609 "step": 400
5610 },
5611 {
5612 "clip_ratio": 0.0,
5613 "completion_length": 111.140625,
5614 "epoch": 4.609195402298851,
5615 "grad_norm": 1.3663166069622814,
5616 "kl": 0.03759765625,
5617 "learning_rate": 5.39080459770115e-07,
5618 "loss": 0.0016,
5619 "reward": 1.8888124227523804,
5620 "reward_std": 0.025306126102805138,
5621 "rewards/accuracy_reward": 0.8901144862174988,
5622 "rewards/format_reward": 0.9986979365348816,
5623 "step": 401
5624 },
5625 {
5626 "clip_ratio": 0.0,
5627 "completion_length": 110.66471862792969,
5628 "epoch": 4.620689655172414,
5629 "grad_norm": 1.6302378119468088,
5630 "kl": 0.03955078125,
5631 "learning_rate": 5.379310344827586e-07,
5632 "loss": 0.0017,
5633 "reward": 1.8746542930603027,
5634 "reward_std": 0.02441917359828949,
5635 "rewards/accuracy_reward": 0.8753053545951843,
5636 "rewards/format_reward": 0.9993489980697632,
5637 "step": 402
5638 },
5639 {
5640 "clip_ratio": 0.0,
5641 "completion_length": 109.66276550292969,
5642 "epoch": 4.6321839080459775,
5643 "grad_norm": 1.4816501158522215,
5644 "kl": 0.039794921875,
5645 "learning_rate": 5.367816091954022e-07,
5646 "loss": 0.0016,
5647 "reward": 1.894063949584961,
5648 "reward_std": 0.024088043719530106,
5649 "rewards/accuracy_reward": 0.8947149515151978,
5650 "rewards/format_reward": 0.9993489980697632,
5651 "step": 403
5652 },
5653 {
5654 "clip_ratio": 0.0,
5655 "completion_length": 110.3828125,
5656 "epoch": 4.64367816091954,
5657 "grad_norm": 1.6836651779382024,
5658 "kl": 0.04345703125,
5659 "learning_rate": 5.35632183908046e-07,
5660 "loss": 0.0018,
5661 "reward": 1.8845057487487793,
5662 "reward_std": 0.02305610477924347,
5663 "rewards/accuracy_reward": 0.8845058679580688,
5664 "rewards/format_reward": 1.0,
5665 "step": 404
5666 },
5667 {
5668 "clip_ratio": 0.0,
5669 "completion_length": 110.69596862792969,
5670 "epoch": 4.655172413793103,
5671 "grad_norm": 1.1778273851354586,
5672 "kl": 0.04345703125,
5673 "learning_rate": 5.344827586206896e-07,
5674 "loss": 0.0018,
5675 "reward": 1.8848464488983154,
5676 "reward_std": 0.02404012344777584,
5677 "rewards/accuracy_reward": 0.8854975700378418,
5678 "rewards/format_reward": 0.9993489980697632,
5679 "step": 405
5680 },
5681 {
5682 "clip_ratio": 0.0,
5683 "completion_length": 108.251953125,
5684 "epoch": 4.666666666666667,
5685 "grad_norm": 1.427234022329123,
5686 "kl": 0.043212890625,
5687 "learning_rate": 5.333333333333333e-07,
5688 "loss": 0.0018,
5689 "reward": 1.9008078575134277,
5690 "reward_std": 0.020701348781585693,
5691 "rewards/accuracy_reward": 0.9008078575134277,
5692 "rewards/format_reward": 1.0,
5693 "step": 406
5694 },
5695 {
5696 "clip_ratio": 0.0,
5697 "completion_length": 110.42513275146484,
5698 "epoch": 4.67816091954023,
5699 "grad_norm": 2.5640372485607696,
5700 "kl": 0.04296875,
5701 "learning_rate": 5.32183908045977e-07,
5702 "loss": 0.0018,
5703 "reward": 1.8793271780014038,
5704 "reward_std": 0.026321526616811752,
5705 "rewards/accuracy_reward": 0.8806290626525879,
5706 "rewards/format_reward": 0.9986979365348816,
5707 "step": 407
5708 },
5709 {
5710 "clip_ratio": 0.0,
5711 "completion_length": 108.38932800292969,
5712 "epoch": 4.689655172413794,
5713 "grad_norm": 2.3519843115525343,
5714 "kl": 0.041259765625,
5715 "learning_rate": 5.310344827586206e-07,
5716 "loss": 0.0017,
5717 "reward": 1.9006179571151733,
5718 "reward_std": 0.0217905193567276,
5719 "rewards/accuracy_reward": 0.9006179571151733,
5720 "rewards/format_reward": 1.0,
5721 "step": 408
5722 },
5723 {
5724 "clip_ratio": 0.0,
5725 "completion_length": 108.32161712646484,
5726 "epoch": 4.7011494252873565,
5727 "grad_norm": 3.886735063581474,
5728 "kl": 0.0458984375,
5729 "learning_rate": 5.298850574712644e-07,
5730 "loss": 0.0019,
5731 "reward": 1.904909372329712,
5732 "reward_std": 0.019571471959352493,
5733 "rewards/accuracy_reward": 0.9049093127250671,
5734 "rewards/format_reward": 1.0,
5735 "step": 409
5736 },
5737 {
5738 "clip_ratio": 0.0,
5739 "completion_length": 108.49935150146484,
5740 "epoch": 4.712643678160919,
5741 "grad_norm": 1.9215943999639125,
5742 "kl": 0.045654296875,
5743 "learning_rate": 5.28735632183908e-07,
5744 "loss": 0.0019,
5745 "reward": 1.896188735961914,
5746 "reward_std": 0.02258872799575329,
5747 "rewards/accuracy_reward": 0.8961889147758484,
5748 "rewards/format_reward": 1.0,
5749 "step": 410
5750 },
5751 {
5752 "clip_ratio": 0.0,
5753 "completion_length": 105.84635925292969,
5754 "epoch": 4.724137931034483,
5755 "grad_norm": 2.323925371294666,
5756 "kl": 0.039794921875,
5757 "learning_rate": 5.275862068965517e-07,
5758 "loss": 0.0017,
5759 "reward": 1.8925457000732422,
5760 "reward_std": 0.01859492063522339,
5761 "rewards/accuracy_reward": 0.8925456404685974,
5762 "rewards/format_reward": 1.0,
5763 "step": 411
5764 },
5765 {
5766 "clip_ratio": 0.0,
5767 "completion_length": 105.20638275146484,
5768 "epoch": 4.735632183908046,
5769 "grad_norm": 1.655400396429134,
5770 "kl": 0.041748046875,
5771 "learning_rate": 5.264367816091954e-07,
5772 "loss": 0.0017,
5773 "reward": 1.8739862442016602,
5774 "reward_std": 0.021640470251441002,
5775 "rewards/accuracy_reward": 0.8746374845504761,
5776 "rewards/format_reward": 0.9993489980697632,
5777 "step": 412
5778 },
5779 {
5780 "clip_ratio": 0.0,
5781 "completion_length": 107.99153900146484,
5782 "epoch": 4.747126436781609,
5783 "grad_norm": 1.9363409073368856,
5784 "kl": 0.049072265625,
5785 "learning_rate": 5.252873563218391e-07,
5786 "loss": 0.002,
5787 "reward": 1.8912228345870972,
5788 "reward_std": 0.021241484209895134,
5789 "rewards/accuracy_reward": 0.8918737769126892,
5790 "rewards/format_reward": 0.9993489980697632,
5791 "step": 413
5792 },
5793 {
5794 "clip_ratio": 0.0,
5795 "completion_length": 106.73763275146484,
5796 "epoch": 4.758620689655173,
5797 "grad_norm": 2.279760774245835,
5798 "kl": 0.04052734375,
5799 "learning_rate": 5.241379310344828e-07,
5800 "loss": 0.0017,
5801 "reward": 1.9129436016082764,
5802 "reward_std": 0.01870916411280632,
5803 "rewards/accuracy_reward": 0.9129435420036316,
5804 "rewards/format_reward": 1.0,
5805 "step": 414
5806 },
5807 {
5808 "clip_ratio": 0.0,
5809 "completion_length": 106.61003112792969,
5810 "epoch": 4.7701149425287355,
5811 "grad_norm": 2.45904909614863,
5812 "kl": 0.03515625,
5813 "learning_rate": 5.229885057471264e-07,
5814 "loss": 0.0015,
5815 "reward": 1.9052283763885498,
5816 "reward_std": 0.019063513725996017,
5817 "rewards/accuracy_reward": 0.9052283763885498,
5818 "rewards/format_reward": 1.0,
5819 "step": 415
5820 },
5821 {
5822 "clip_ratio": 0.0,
5823 "completion_length": 107.0078125,
5824 "epoch": 4.781609195402299,
5825 "grad_norm": 1.6845003543176094,
5826 "kl": 0.041015625,
5827 "learning_rate": 5.218390804597701e-07,
5828 "loss": 0.0017,
5829 "reward": 1.894358515739441,
5830 "reward_std": 0.022152118384838104,
5831 "rewards/accuracy_reward": 0.8950096964836121,
5832 "rewards/format_reward": 0.9993489980697632,
5833 "step": 416
5834 },
5835 {
5836 "clip_ratio": 0.0,
5837 "completion_length": 105.95833587646484,
5838 "epoch": 4.793103448275862,
5839 "grad_norm": 4.433200172934703,
5840 "kl": 0.0634765625,
5841 "learning_rate": 5.206896551724138e-07,
5842 "loss": 0.0026,
5843 "reward": 1.8981165885925293,
5844 "reward_std": 0.020231489092111588,
5845 "rewards/accuracy_reward": 0.8981165885925293,
5846 "rewards/format_reward": 1.0,
5847 "step": 417
5848 },
5849 {
5850 "clip_ratio": 0.0,
5851 "completion_length": 105.35482025146484,
5852 "epoch": 4.804597701149425,
5853 "grad_norm": 1.7763571926472603,
5854 "kl": 0.042724609375,
5855 "learning_rate": 5.195402298850574e-07,
5856 "loss": 0.0018,
5857 "reward": 1.906449556350708,
5858 "reward_std": 0.020989403128623962,
5859 "rewards/accuracy_reward": 0.9064494967460632,
5860 "rewards/format_reward": 1.0,
5861 "step": 418
5862 },
5863 {
5864 "clip_ratio": 0.0,
5865 "completion_length": 104.24089050292969,
5866 "epoch": 4.816091954022989,
5867 "grad_norm": 1.4886525030955973,
5868 "kl": 0.04736328125,
5869 "learning_rate": 5.183908045977012e-07,
5870 "loss": 0.002,
5871 "reward": 1.8850057125091553,
5872 "reward_std": 0.020064577460289,
5873 "rewards/accuracy_reward": 0.8850056529045105,
5874 "rewards/format_reward": 1.0,
5875 "step": 419
5876 },
5877 {
5878 "clip_ratio": 0.0,
5879 "completion_length": 105.37565612792969,
5880 "epoch": 4.827586206896552,
5881 "grad_norm": 2.8859840023098866,
5882 "kl": 0.0390625,
5883 "learning_rate": 5.172413793103448e-07,
5884 "loss": 0.0016,
5885 "reward": 1.8768961429595947,
5886 "reward_std": 0.018807468935847282,
5887 "rewards/accuracy_reward": 0.8768962621688843,
5888 "rewards/format_reward": 1.0,
5889 "step": 420
5890 },
5891 {
5892 "clip_ratio": 0.0,
5893 "completion_length": 106.56380462646484,
5894 "epoch": 4.8390804597701145,
5895 "grad_norm": 1.8199908836931964,
5896 "kl": 0.04150390625,
5897 "learning_rate": 5.160919540229884e-07,
5898 "loss": 0.0017,
5899 "reward": 1.8867862224578857,
5900 "reward_std": 0.022299369797110558,
5901 "rewards/accuracy_reward": 0.8867861032485962,
5902 "rewards/format_reward": 1.0,
5903 "step": 421
5904 },
5905 {
5906 "clip_ratio": 0.0,
5907 "completion_length": 105.96484375,
5908 "epoch": 4.850574712643678,
5909 "grad_norm": 1.415400095170082,
5910 "kl": 0.042724609375,
5911 "learning_rate": 5.149425287356322e-07,
5912 "loss": 0.0018,
5913 "reward": 1.9118112325668335,
5914 "reward_std": 0.019953366369009018,
5915 "rewards/accuracy_reward": 0.9118112325668335,
5916 "rewards/format_reward": 1.0,
5917 "step": 422
5918 },
5919 {
5920 "clip_ratio": 0.0,
5921 "completion_length": 106.650390625,
5922 "epoch": 4.862068965517241,
5923 "grad_norm": 2.027158566984518,
5924 "kl": 0.044189453125,
5925 "learning_rate": 5.137931034482759e-07,
5926 "loss": 0.0018,
5927 "reward": 1.878554344177246,
5928 "reward_std": 0.019561052322387695,
5929 "rewards/accuracy_reward": 0.8785543441772461,
5930 "rewards/format_reward": 1.0,
5931 "step": 423
5932 },
5933 {
5934 "clip_ratio": 0.0,
5935 "completion_length": 104.13932800292969,
5936 "epoch": 4.873563218390805,
5937 "grad_norm": 3.123515896042613,
5938 "kl": 0.048095703125,
5939 "learning_rate": 5.126436781609194e-07,
5940 "loss": 0.002,
5941 "reward": 1.8973406553268433,
5942 "reward_std": 0.021455293521285057,
5943 "rewards/accuracy_reward": 0.8973406553268433,
5944 "rewards/format_reward": 1.0,
5945 "step": 424
5946 },
5947 {
5948 "clip_ratio": 0.0,
5949 "completion_length": 104.84831237792969,
5950 "epoch": 4.885057471264368,
5951 "grad_norm": 2.2665060638151737,
5952 "kl": 0.05078125,
5953 "learning_rate": 5.114942528735632e-07,
5954 "loss": 0.0021,
5955 "reward": 1.8799655437469482,
5956 "reward_std": 0.02300919219851494,
5957 "rewards/accuracy_reward": 0.8799654841423035,
5958 "rewards/format_reward": 1.0,
5959 "step": 425
5960 },
5961 {
5962 "clip_ratio": 0.0,
5963 "completion_length": 103.94401550292969,
5964 "epoch": 4.896551724137931,
5965 "grad_norm": 1.6919605234703265,
5966 "kl": 0.0556640625,
5967 "learning_rate": 5.103448275862069e-07,
5968 "loss": 0.0023,
5969 "reward": 1.9173061847686768,
5970 "reward_std": 0.01950909197330475,
5971 "rewards/accuracy_reward": 0.9173061847686768,
5972 "rewards/format_reward": 1.0,
5973 "step": 426
5974 },
5975 {
5976 "clip_ratio": 0.0,
5977 "completion_length": 104.74153900146484,
5978 "epoch": 4.908045977011494,
5979 "grad_norm": 2.1229070642282553,
5980 "kl": 0.04931640625,
5981 "learning_rate": 5.091954022988506e-07,
5982 "loss": 0.002,
5983 "reward": 1.8742494583129883,
5984 "reward_std": 0.023737162351608276,
5985 "rewards/accuracy_reward": 0.8742495775222778,
5986 "rewards/format_reward": 1.0,
5987 "step": 427
5988 },
5989 {
5990 "clip_ratio": 0.0,
5991 "completion_length": 102.42643737792969,
5992 "epoch": 4.919540229885057,
5993 "grad_norm": 2.1832269520490892,
5994 "kl": 0.052001953125,
5995 "learning_rate": 5.080459770114942e-07,
5996 "loss": 0.0021,
5997 "reward": 1.8955520391464233,
5998 "reward_std": 0.020405521616339684,
5999 "rewards/accuracy_reward": 0.8955520391464233,
6000 "rewards/format_reward": 1.0,
6001 "step": 428
6002 },
6003 {
6004 "clip_ratio": 0.0,
6005 "completion_length": 103.27214050292969,
6006 "epoch": 4.931034482758621,
6007 "grad_norm": 2.536754289442291,
6008 "kl": 0.0546875,
6009 "learning_rate": 5.068965517241379e-07,
6010 "loss": 0.0023,
6011 "reward": 1.896885633468628,
6012 "reward_std": 0.020467374473810196,
6013 "rewards/accuracy_reward": 0.8968856334686279,
6014 "rewards/format_reward": 1.0,
6015 "step": 429
6016 },
6017 {
6018 "clip_ratio": 0.0,
6019 "completion_length": 103.57357025146484,
6020 "epoch": 4.942528735632184,
6021 "grad_norm": 1.3718070549122303,
6022 "kl": 0.05517578125,
6023 "learning_rate": 5.057471264367817e-07,
6024 "loss": 0.0022,
6025 "reward": 1.8929929733276367,
6026 "reward_std": 0.024712808430194855,
6027 "rewards/accuracy_reward": 0.8929929733276367,
6028 "rewards/format_reward": 1.0,
6029 "step": 430
6030 },
6031 {
6032 "clip_ratio": 0.0,
6033 "completion_length": 101.10807800292969,
6034 "epoch": 4.954022988505747,
6035 "grad_norm": 2.668322989932513,
6036 "kl": 0.05322265625,
6037 "learning_rate": 5.045977011494252e-07,
6038 "loss": 0.0022,
6039 "reward": 1.8981032371520996,
6040 "reward_std": 0.020073171705007553,
6041 "rewards/accuracy_reward": 0.8981032371520996,
6042 "rewards/format_reward": 1.0,
6043 "step": 431
6044 },
6045 {
6046 "clip_ratio": 0.0,
6047 "completion_length": 103.62630462646484,
6048 "epoch": 4.9655172413793105,
6049 "grad_norm": 2.4540498394359047,
6050 "kl": 0.0546875,
6051 "learning_rate": 5.03448275862069e-07,
6052 "loss": 0.0023,
6053 "reward": 1.8993358612060547,
6054 "reward_std": 0.018647989258170128,
6055 "rewards/accuracy_reward": 0.8993359804153442,
6056 "rewards/format_reward": 1.0,
6057 "step": 432
6058 },
6059 {
6060 "clip_ratio": 0.0,
6061 "completion_length": 104.70964050292969,
6062 "epoch": 4.977011494252873,
6063 "grad_norm": 1.963612640849598,
6064 "kl": 0.052734375,
6065 "learning_rate": 5.022988505747127e-07,
6066 "loss": 0.0022,
6067 "reward": 1.8898952007293701,
6068 "reward_std": 0.022081639617681503,
6069 "rewards/accuracy_reward": 0.8898952007293701,
6070 "rewards/format_reward": 1.0,
6071 "step": 433
6072 },
6073 {
6074 "clip_ratio": 0.0,
6075 "completion_length": 103.87109375,
6076 "epoch": 4.988505747126437,
6077 "grad_norm": 2.3000221102493157,
6078 "kl": 0.046142578125,
6079 "learning_rate": 5.011494252873562e-07,
6080 "loss": 0.0019,
6081 "reward": 1.89603853225708,
6082 "reward_std": 0.020961783826351166,
6083 "rewards/accuracy_reward": 0.8960385322570801,
6084 "rewards/format_reward": 1.0,
6085 "step": 434
6086 },
6087 {
6088 "clip_ratio": 0.0,
6089 "completion_length": 99.20084381103516,
6090 "epoch": 5.0,
6091 "grad_norm": 1.792728503326833,
6092 "kl": 0.045166015625,
6093 "learning_rate": 5e-07,
6094 "loss": 0.0019,
6095 "reward": 1.9164615869522095,
6096 "reward_std": 0.01855144090950489,
6097 "rewards/accuracy_reward": 0.9164614677429199,
6098 "rewards/format_reward": 1.0,
6099 "step": 435
6100 },
6101 {
6102 "clip_ratio": 0.0,
6103 "completion_length": 109.03125,
6104 "epoch": 5.011494252873563,
6105 "grad_norm": 2.411488733190351,
6106 "kl": 0.042236328125,
6107 "learning_rate": 4.988505747126436e-07,
6108 "loss": 0.0017,
6109 "reward": 1.8886754512786865,
6110 "reward_std": 0.023060960695147514,
6111 "rewards/accuracy_reward": 0.8886754512786865,
6112 "rewards/format_reward": 1.0,
6113 "step": 436
6114 },
6115 {
6116 "clip_ratio": 0.0,
6117 "completion_length": 109.51302337646484,
6118 "epoch": 5.022988505747127,
6119 "grad_norm": 1.2846385937268994,
6120 "kl": 0.04638671875,
6121 "learning_rate": 4.977011494252874e-07,
6122 "loss": 0.0019,
6123 "reward": 1.8596938848495483,
6124 "reward_std": 0.02523483708500862,
6125 "rewards/accuracy_reward": 0.8603450059890747,
6126 "rewards/format_reward": 0.9993489980697632,
6127 "step": 437
6128 },
6129 {
6130 "clip_ratio": 0.0,
6131 "completion_length": 109.48177337646484,
6132 "epoch": 5.0344827586206895,
6133 "grad_norm": 4.302610321521328,
6134 "kl": 0.04638671875,
6135 "learning_rate": 4.96551724137931e-07,
6136 "loss": 0.0019,
6137 "reward": 1.875889539718628,
6138 "reward_std": 0.02352536842226982,
6139 "rewards/accuracy_reward": 0.8758895397186279,
6140 "rewards/format_reward": 1.0,
6141 "step": 438
6142 },
6143 {
6144 "clip_ratio": 0.0,
6145 "completion_length": 108.92578125,
6146 "epoch": 5.045977011494253,
6147 "grad_norm": 1.9265386376130789,
6148 "kl": 0.0390625,
6149 "learning_rate": 4.954022988505746e-07,
6150 "loss": 0.0016,
6151 "reward": 1.8992599248886108,
6152 "reward_std": 0.023157542571425438,
6153 "rewards/accuracy_reward": 0.8999109268188477,
6154 "rewards/format_reward": 0.9993489980697632,
6155 "step": 439
6156 },
6157 {
6158 "clip_ratio": 0.0,
6159 "completion_length": 107.67448425292969,
6160 "epoch": 5.057471264367816,
6161 "grad_norm": 1.6723222924852237,
6162 "kl": 0.0439453125,
6163 "learning_rate": 4.942528735632184e-07,
6164 "loss": 0.0018,
6165 "reward": 1.8739842176437378,
6166 "reward_std": 0.022657310590147972,
6167 "rewards/accuracy_reward": 0.8746352195739746,
6168 "rewards/format_reward": 0.9993489980697632,
6169 "step": 440
6170 },
6171 {
6172 "clip_ratio": 0.0,
6173 "completion_length": 109.04167175292969,
6174 "epoch": 5.068965517241379,
6175 "grad_norm": 1.602166960354794,
6176 "kl": 0.039306640625,
6177 "learning_rate": 4.93103448275862e-07,
6178 "loss": 0.0016,
6179 "reward": 1.8998823165893555,
6180 "reward_std": 0.02184360846877098,
6181 "rewards/accuracy_reward": 0.8998822569847107,
6182 "rewards/format_reward": 1.0,
6183 "step": 441
6184 },
6185 {
6186 "clip_ratio": 0.0,
6187 "completion_length": 108.63932800292969,
6188 "epoch": 5.080459770114943,
6189 "grad_norm": 2.4680045505704205,
6190 "kl": 0.039306640625,
6191 "learning_rate": 4.919540229885058e-07,
6192 "loss": 0.0016,
6193 "reward": 1.8900998830795288,
6194 "reward_std": 0.02164739929139614,
6195 "rewards/accuracy_reward": 0.8900998830795288,
6196 "rewards/format_reward": 1.0,
6197 "step": 442
6198 },
6199 {
6200 "clip_ratio": 0.0,
6201 "completion_length": 106.908203125,
6202 "epoch": 5.091954022988506,
6203 "grad_norm": 1.390816247246552,
6204 "kl": 0.041748046875,
6205 "learning_rate": 4.908045977011494e-07,
6206 "loss": 0.0017,
6207 "reward": 1.889683723449707,
6208 "reward_std": 0.022919952869415283,
6209 "rewards/accuracy_reward": 0.8896838426589966,
6210 "rewards/format_reward": 1.0,
6211 "step": 443
6212 },
6213 {
6214 "clip_ratio": 0.0,
6215 "completion_length": 107.91341400146484,
6216 "epoch": 5.103448275862069,
6217 "grad_norm": 1.197765074052055,
6218 "kl": 0.04736328125,
6219 "learning_rate": 4.89655172413793e-07,
6220 "loss": 0.002,
6221 "reward": 1.8937995433807373,
6222 "reward_std": 0.021353360265493393,
6223 "rewards/accuracy_reward": 0.8937994837760925,
6224 "rewards/format_reward": 1.0,
6225 "step": 444
6226 },
6227 {
6228 "clip_ratio": 0.0,
6229 "completion_length": 107.26692962646484,
6230 "epoch": 5.114942528735632,
6231 "grad_norm": 24.16259559584872,
6232 "kl": 0.0478515625,
6233 "learning_rate": 4.885057471264368e-07,
6234 "loss": 0.002,
6235 "reward": 1.8777693510055542,
6236 "reward_std": 0.019756315276026726,
6237 "rewards/accuracy_reward": 0.8777693510055542,
6238 "rewards/format_reward": 1.0,
6239 "step": 445
6240 },
6241 {
6242 "clip_ratio": 0.0,
6243 "completion_length": 105.94206237792969,
6244 "epoch": 5.126436781609195,
6245 "grad_norm": 2.3533060150658844,
6246 "kl": 0.050537109375,
6247 "learning_rate": 4.873563218390804e-07,
6248 "loss": 0.0021,
6249 "reward": 1.8900680541992188,
6250 "reward_std": 0.021740447729825974,
6251 "rewards/accuracy_reward": 0.8900680541992188,
6252 "rewards/format_reward": 1.0,
6253 "step": 446
6254 },
6255 {
6256 "clip_ratio": 0.0,
6257 "completion_length": 106.99544525146484,
6258 "epoch": 5.137931034482759,
6259 "grad_norm": 2.1820025718352136,
6260 "kl": 0.046142578125,
6261 "learning_rate": 4.86206896551724e-07,
6262 "loss": 0.0019,
6263 "reward": 1.8730883598327637,
6264 "reward_std": 0.022952785715460777,
6265 "rewards/accuracy_reward": 0.8730884790420532,
6266 "rewards/format_reward": 1.0,
6267 "step": 447
6268 },
6269 {
6270 "clip_ratio": 0.0,
6271 "completion_length": 107.60546875,
6272 "epoch": 5.149425287356322,
6273 "grad_norm": 1.6285679666257735,
6274 "kl": 0.0400390625,
6275 "learning_rate": 4.850574712643678e-07,
6276 "loss": 0.0017,
6277 "reward": 1.889723777770996,
6278 "reward_std": 0.02422255277633667,
6279 "rewards/accuracy_reward": 0.8897239565849304,
6280 "rewards/format_reward": 1.0,
6281 "step": 448
6282 },
6283 {
6284 "clip_ratio": 0.0,
6285 "completion_length": 107.62760925292969,
6286 "epoch": 5.160919540229885,
6287 "grad_norm": 1.838531888615258,
6288 "kl": 0.04150390625,
6289 "learning_rate": 4.839080459770114e-07,
6290 "loss": 0.0017,
6291 "reward": 1.8831251859664917,
6292 "reward_std": 0.023560214787721634,
6293 "rewards/accuracy_reward": 0.8831251859664917,
6294 "rewards/format_reward": 1.0,
6295 "step": 449
6296 },
6297 {
6298 "clip_ratio": 0.0,
6299 "completion_length": 106.32747650146484,
6300 "epoch": 5.172413793103448,
6301 "grad_norm": 3.3852429919466283,
6302 "kl": 0.04296875,
6303 "learning_rate": 4.827586206896552e-07,
6304 "loss": 0.0018,
6305 "reward": 1.8916248083114624,
6306 "reward_std": 0.018707800656557083,
6307 "rewards/accuracy_reward": 0.891624927520752,
6308 "rewards/format_reward": 1.0,
6309 "step": 450
6310 },
6311 {
6312 "clip_ratio": 0.0,
6313 "completion_length": 106.82747650146484,
6314 "epoch": 5.183908045977011,
6315 "grad_norm": 1.0585987907156273,
6316 "kl": 0.040771484375,
6317 "learning_rate": 4.816091954022988e-07,
6318 "loss": 0.0017,
6319 "reward": 1.8994290828704834,
6320 "reward_std": 0.020674733445048332,
6321 "rewards/accuracy_reward": 0.9000802040100098,
6322 "rewards/format_reward": 0.9993489980697632,
6323 "step": 451
6324 },
6325 {
6326 "clip_ratio": 0.0,
6327 "completion_length": 108.90950775146484,
6328 "epoch": 5.195402298850575,
6329 "grad_norm": 1.5208325687196538,
6330 "kl": 0.041259765625,
6331 "learning_rate": 4.804597701149424e-07,
6332 "loss": 0.0017,
6333 "reward": 1.8975791931152344,
6334 "reward_std": 0.021052822470664978,
6335 "rewards/accuracy_reward": 0.898230254650116,
6336 "rewards/format_reward": 0.9993489980697632,
6337 "step": 452
6338 },
6339 {
6340 "clip_ratio": 0.0,
6341 "completion_length": 104.69140625,
6342 "epoch": 5.206896551724138,
6343 "grad_norm": 1.7465767723786063,
6344 "kl": 0.04541015625,
6345 "learning_rate": 4.793103448275862e-07,
6346 "loss": 0.0019,
6347 "reward": 1.901599645614624,
6348 "reward_std": 0.019404159858822823,
6349 "rewards/accuracy_reward": 0.9015995860099792,
6350 "rewards/format_reward": 1.0,
6351 "step": 453
6352 },
6353 {
6354 "clip_ratio": 0.0,
6355 "completion_length": 107.00716400146484,
6356 "epoch": 5.218390804597701,
6357 "grad_norm": 1.7494745095643287,
6358 "kl": 0.04296875,
6359 "learning_rate": 4.781609195402298e-07,
6360 "loss": 0.0018,
6361 "reward": 1.897741675376892,
6362 "reward_std": 0.018847916275262833,
6363 "rewards/accuracy_reward": 0.8977417945861816,
6364 "rewards/format_reward": 1.0,
6365 "step": 454
6366 },
6367 {
6368 "clip_ratio": 0.0,
6369 "completion_length": 106.82552337646484,
6370 "epoch": 5.2298850574712645,
6371 "grad_norm": 149.72760374173768,
6372 "kl": 0.1123046875,
6373 "learning_rate": 4.770114942528736e-07,
6374 "loss": 0.0045,
6375 "reward": 1.9023511409759521,
6376 "reward_std": 0.02475971356034279,
6377 "rewards/accuracy_reward": 0.9030022621154785,
6378 "rewards/format_reward": 0.9993489980697632,
6379 "step": 455
6380 },
6381 {
6382 "clip_ratio": 0.0,
6383 "completion_length": 108.265625,
6384 "epoch": 5.241379310344827,
6385 "grad_norm": 1.5032856559021295,
6386 "kl": 0.04248046875,
6387 "learning_rate": 4.7586206896551725e-07,
6388 "loss": 0.0018,
6389 "reward": 1.8960156440734863,
6390 "reward_std": 0.020089447498321533,
6391 "rewards/accuracy_reward": 0.8960156440734863,
6392 "rewards/format_reward": 1.0,
6393 "step": 456
6394 },
6395 {
6396 "clip_ratio": 0.0,
6397 "completion_length": 107.44466400146484,
6398 "epoch": 5.252873563218391,
6399 "grad_norm": 5.380770477839403,
6400 "kl": 0.043701171875,
6401 "learning_rate": 4.747126436781609e-07,
6402 "loss": 0.0018,
6403 "reward": 1.879178762435913,
6404 "reward_std": 0.024046046659350395,
6405 "rewards/accuracy_reward": 0.8798297643661499,
6406 "rewards/format_reward": 0.9993489980697632,
6407 "step": 457
6408 },
6409 {
6410 "clip_ratio": 0.0,
6411 "completion_length": 108.21419525146484,
6412 "epoch": 5.264367816091954,
6413 "grad_norm": 1.307828461511747,
6414 "kl": 0.039794921875,
6415 "learning_rate": 4.735632183908046e-07,
6416 "loss": 0.0017,
6417 "reward": 1.9000663757324219,
6418 "reward_std": 0.0211566723883152,
6419 "rewards/accuracy_reward": 0.9000665545463562,
6420 "rewards/format_reward": 1.0,
6421 "step": 458
6422 },
6423 {
6424 "clip_ratio": 0.0,
6425 "completion_length": 105.48828125,
6426 "epoch": 5.275862068965517,
6427 "grad_norm": 1.6755788743402862,
6428 "kl": 0.046142578125,
6429 "learning_rate": 4.7241379310344827e-07,
6430 "loss": 0.0019,
6431 "reward": 1.9154987335205078,
6432 "reward_std": 0.019508585333824158,
6433 "rewards/accuracy_reward": 0.9154989123344421,
6434 "rewards/format_reward": 1.0,
6435 "step": 459
6436 },
6437 {
6438 "clip_ratio": 0.0,
6439 "completion_length": 106.59505462646484,
6440 "epoch": 5.287356321839081,
6441 "grad_norm": 1.410115167386873,
6442 "kl": 0.048828125,
6443 "learning_rate": 4.712643678160919e-07,
6444 "loss": 0.002,
6445 "reward": 1.8869271278381348,
6446 "reward_std": 0.021610528230667114,
6447 "rewards/accuracy_reward": 0.8875781893730164,
6448 "rewards/format_reward": 0.9993489980697632,
6449 "step": 460
6450 },
6451 {
6452 "clip_ratio": 0.0,
6453 "completion_length": 107.11653900146484,
6454 "epoch": 5.2988505747126435,
6455 "grad_norm": 2.472073181988844,
6456 "kl": 0.046142578125,
6457 "learning_rate": 4.7011494252873565e-07,
6458 "loss": 0.0019,
6459 "reward": 1.8838642835617065,
6460 "reward_std": 0.020128531381487846,
6461 "rewards/accuracy_reward": 0.8838642835617065,
6462 "rewards/format_reward": 1.0,
6463 "step": 461
6464 },
6465 {
6466 "clip_ratio": 0.0,
6467 "completion_length": 107.26237487792969,
6468 "epoch": 5.310344827586207,
6469 "grad_norm": 1.982912784034594,
6470 "kl": 0.04638671875,
6471 "learning_rate": 4.689655172413793e-07,
6472 "loss": 0.0019,
6473 "reward": 1.8954026699066162,
6474 "reward_std": 0.020416123792529106,
6475 "rewards/accuracy_reward": 0.8960537910461426,
6476 "rewards/format_reward": 0.9993489980697632,
6477 "step": 462
6478 },
6479 {
6480 "clip_ratio": 0.0,
6481 "completion_length": 107.216796875,
6482 "epoch": 5.32183908045977,
6483 "grad_norm": 1.9630010216472946,
6484 "kl": 0.048828125,
6485 "learning_rate": 4.678160919540229e-07,
6486 "loss": 0.002,
6487 "reward": 1.8689860105514526,
6488 "reward_std": 0.02052994631230831,
6489 "rewards/accuracy_reward": 0.8689860105514526,
6490 "rewards/format_reward": 1.0,
6491 "step": 463
6492 },
6493 {
6494 "clip_ratio": 0.0,
6495 "completion_length": 107.13997650146484,
6496 "epoch": 5.333333333333333,
6497 "grad_norm": 2.293151993990854,
6498 "kl": 0.05078125,
6499 "learning_rate": 4.6666666666666666e-07,
6500 "loss": 0.0021,
6501 "reward": 1.8916696310043335,
6502 "reward_std": 0.016695309430360794,
6503 "rewards/accuracy_reward": 0.8916696906089783,
6504 "rewards/format_reward": 1.0,
6505 "step": 464
6506 },
6507 {
6508 "clip_ratio": 0.0,
6509 "completion_length": 107.533203125,
6510 "epoch": 5.344827586206897,
6511 "grad_norm": 2.180207259559519,
6512 "kl": 0.044921875,
6513 "learning_rate": 4.655172413793103e-07,
6514 "loss": 0.0019,
6515 "reward": 1.8959643840789795,
6516 "reward_std": 0.018700793385505676,
6517 "rewards/accuracy_reward": 0.8959642648696899,
6518 "rewards/format_reward": 1.0,
6519 "step": 465
6520 },
6521 {
6522 "clip_ratio": 0.0,
6523 "completion_length": 107.427734375,
6524 "epoch": 5.35632183908046,
6525 "grad_norm": 1.8241902873816702,
6526 "kl": 0.0498046875,
6527 "learning_rate": 4.6436781609195404e-07,
6528 "loss": 0.0021,
6529 "reward": 1.8787130117416382,
6530 "reward_std": 0.020657163113355637,
6531 "rewards/accuracy_reward": 0.8793638944625854,
6532 "rewards/format_reward": 0.9993489980697632,
6533 "step": 466
6534 },
6535 {
6536 "clip_ratio": 0.0,
6537 "completion_length": 106.609375,
6538 "epoch": 5.3678160919540225,
6539 "grad_norm": 2.607546823999089,
6540 "kl": 0.04638671875,
6541 "learning_rate": 4.632183908045977e-07,
6542 "loss": 0.0019,
6543 "reward": 1.8970588445663452,
6544 "reward_std": 0.018911032006144524,
6545 "rewards/accuracy_reward": 0.8970588445663452,
6546 "rewards/format_reward": 1.0,
6547 "step": 467
6548 },
6549 {
6550 "clip_ratio": 0.0,
6551 "completion_length": 108.97526550292969,
6552 "epoch": 5.379310344827586,
6553 "grad_norm": 2.615397696144289,
6554 "kl": 0.04345703125,
6555 "learning_rate": 4.620689655172413e-07,
6556 "loss": 0.0018,
6557 "reward": 1.881915807723999,
6558 "reward_std": 0.01975308358669281,
6559 "rewards/accuracy_reward": 0.881915807723999,
6560 "rewards/format_reward": 1.0,
6561 "step": 468
6562 },
6563 {
6564 "clip_ratio": 0.0,
6565 "completion_length": 105.55534362792969,
6566 "epoch": 5.390804597701149,
6567 "grad_norm": 1.9430369662373463,
6568 "kl": 0.049560546875,
6569 "learning_rate": 4.6091954022988506e-07,
6570 "loss": 0.0021,
6571 "reward": 1.8931167125701904,
6572 "reward_std": 0.021739525720477104,
6573 "rewards/accuracy_reward": 0.8937677145004272,
6574 "rewards/format_reward": 0.9993489980697632,
6575 "step": 469
6576 },
6577 {
6578 "clip_ratio": 0.0,
6579 "completion_length": 105.79362487792969,
6580 "epoch": 5.402298850574713,
6581 "grad_norm": 2.0918662703233295,
6582 "kl": 0.0498046875,
6583 "learning_rate": 4.597701149425287e-07,
6584 "loss": 0.0021,
6585 "reward": 1.9036979675292969,
6586 "reward_std": 0.015338393859565258,
6587 "rewards/accuracy_reward": 0.9036981463432312,
6588 "rewards/format_reward": 1.0,
6589 "step": 470
6590 },
6591 {
6592 "clip_ratio": 0.0,
6593 "completion_length": 106.98828125,
6594 "epoch": 5.413793103448276,
6595 "grad_norm": 1.6066027943146437,
6596 "kl": 0.043701171875,
6597 "learning_rate": 4.586206896551724e-07,
6598 "loss": 0.0018,
6599 "reward": 1.9139502048492432,
6600 "reward_std": 0.01869853213429451,
6601 "rewards/accuracy_reward": 0.9139501452445984,
6602 "rewards/format_reward": 1.0,
6603 "step": 471
6604 },
6605 {
6606 "clip_ratio": 0.0,
6607 "completion_length": 107.34114837646484,
6608 "epoch": 5.425287356321839,
6609 "grad_norm": 2.0098215318578716,
6610 "kl": 0.047607421875,
6611 "learning_rate": 4.574712643678161e-07,
6612 "loss": 0.002,
6613 "reward": 1.8916778564453125,
6614 "reward_std": 0.020163239911198616,
6615 "rewards/accuracy_reward": 0.8916778564453125,
6616 "rewards/format_reward": 1.0,
6617 "step": 472
6618 },
6619 {
6620 "clip_ratio": 0.0,
6621 "completion_length": 105.19271087646484,
6622 "epoch": 5.436781609195402,
6623 "grad_norm": 1.9402327167023088,
6624 "kl": 0.05029296875,
6625 "learning_rate": 4.563218390804597e-07,
6626 "loss": 0.0021,
6627 "reward": 1.8789701461791992,
6628 "reward_std": 0.023091118782758713,
6629 "rewards/accuracy_reward": 0.8796213865280151,
6630 "rewards/format_reward": 0.9993489980697632,
6631 "step": 473
6632 },
6633 {
6634 "clip_ratio": 0.0,
6635 "completion_length": 107.625,
6636 "epoch": 5.448275862068965,
6637 "grad_norm": 1.5800586554447937,
6638 "kl": 0.04443359375,
6639 "learning_rate": 4.5517241379310346e-07,
6640 "loss": 0.0019,
6641 "reward": 1.8999593257904053,
6642 "reward_std": 0.019150175154209137,
6643 "rewards/accuracy_reward": 0.8999593257904053,
6644 "rewards/format_reward": 1.0,
6645 "step": 474
6646 },
6647 {
6648 "clip_ratio": 0.0,
6649 "completion_length": 106.41732025146484,
6650 "epoch": 5.459770114942529,
6651 "grad_norm": 1.3572231730795086,
6652 "kl": 0.048583984375,
6653 "learning_rate": 4.540229885057471e-07,
6654 "loss": 0.002,
6655 "reward": 1.9113028049468994,
6656 "reward_std": 0.018931671977043152,
6657 "rewards/accuracy_reward": 0.9113027453422546,
6658 "rewards/format_reward": 1.0,
6659 "step": 475
6660 },
6661 {
6662 "clip_ratio": 0.0,
6663 "completion_length": 106.810546875,
6664 "epoch": 5.471264367816092,
6665 "grad_norm": 1.4707638373352834,
6666 "kl": 0.04931640625,
6667 "learning_rate": 4.528735632183908e-07,
6668 "loss": 0.0021,
6669 "reward": 1.9164535999298096,
6670 "reward_std": 0.01708863489329815,
6671 "rewards/accuracy_reward": 0.91645348072052,
6672 "rewards/format_reward": 1.0,
6673 "step": 476
6674 },
6675 {
6676 "clip_ratio": 0.0,
6677 "completion_length": 105.80013275146484,
6678 "epoch": 5.482758620689655,
6679 "grad_norm": 1.6157878642932322,
6680 "kl": 0.0458984375,
6681 "learning_rate": 4.5172413793103447e-07,
6682 "loss": 0.0019,
6683 "reward": 1.897383689880371,
6684 "reward_std": 0.019057631492614746,
6685 "rewards/accuracy_reward": 0.8973836302757263,
6686 "rewards/format_reward": 1.0,
6687 "step": 477
6688 },
6689 {
6690 "clip_ratio": 0.0,
6691 "completion_length": 103.48372650146484,
6692 "epoch": 5.494252873563219,
6693 "grad_norm": 2.039464641047124,
6694 "kl": 0.048828125,
6695 "learning_rate": 4.505747126436781e-07,
6696 "loss": 0.002,
6697 "reward": 1.8875281810760498,
6698 "reward_std": 0.02109697088599205,
6699 "rewards/accuracy_reward": 0.8875283002853394,
6700 "rewards/format_reward": 1.0,
6701 "step": 478
6702 },
6703 {
6704 "clip_ratio": 0.0,
6705 "completion_length": 104.49870300292969,
6706 "epoch": 5.505747126436781,
6707 "grad_norm": 2.039169092715578,
6708 "kl": 0.046142578125,
6709 "learning_rate": 4.494252873563218e-07,
6710 "loss": 0.0019,
6711 "reward": 1.8861839771270752,
6712 "reward_std": 0.02034366875886917,
6713 "rewards/accuracy_reward": 0.8861838579177856,
6714 "rewards/format_reward": 1.0,
6715 "step": 479
6716 },
6717 {
6718 "clip_ratio": 0.0,
6719 "completion_length": 104.89453125,
6720 "epoch": 5.517241379310345,
6721 "grad_norm": 1.4686995511345136,
6722 "kl": 0.050537109375,
6723 "learning_rate": 4.482758620689655e-07,
6724 "loss": 0.0021,
6725 "reward": 1.9063835144042969,
6726 "reward_std": 0.018992867320775986,
6727 "rewards/accuracy_reward": 0.9070345163345337,
6728 "rewards/format_reward": 0.9993489980697632,
6729 "step": 480
6730 },
6731 {
6732 "clip_ratio": 0.0,
6733 "completion_length": 103.40690612792969,
6734 "epoch": 5.528735632183908,
6735 "grad_norm": 2.232648278432696,
6736 "kl": 0.04296875,
6737 "learning_rate": 4.471264367816092e-07,
6738 "loss": 0.0018,
6739 "reward": 1.8770596981048584,
6740 "reward_std": 0.02336341142654419,
6741 "rewards/accuracy_reward": 0.8770596981048584,
6742 "rewards/format_reward": 1.0,
6743 "step": 481
6744 },
6745 {
6746 "clip_ratio": 0.0,
6747 "completion_length": 103.29427337646484,
6748 "epoch": 5.540229885057471,
6749 "grad_norm": 1.9103143500415654,
6750 "kl": 0.04833984375,
6751 "learning_rate": 4.4597701149425287e-07,
6752 "loss": 0.002,
6753 "reward": 1.8889049291610718,
6754 "reward_std": 0.019074462354183197,
6755 "rewards/accuracy_reward": 0.8889049887657166,
6756 "rewards/format_reward": 1.0,
6757 "step": 482
6758 },
6759 {
6760 "clip_ratio": 0.0,
6761 "completion_length": 104.26171875,
6762 "epoch": 5.551724137931035,
6763 "grad_norm": 1.5192873928817636,
6764 "kl": 0.041015625,
6765 "learning_rate": 4.4482758620689656e-07,
6766 "loss": 0.0017,
6767 "reward": 1.882556438446045,
6768 "reward_std": 0.02097918465733528,
6769 "rewards/accuracy_reward": 0.8825565576553345,
6770 "rewards/format_reward": 1.0,
6771 "step": 483
6772 },
6773 {
6774 "clip_ratio": 0.0,
6775 "completion_length": 103.24284362792969,
6776 "epoch": 5.563218390804598,
6777 "grad_norm": 4.569269242461156,
6778 "kl": 0.047607421875,
6779 "learning_rate": 4.436781609195402e-07,
6780 "loss": 0.0019,
6781 "reward": 1.9005216360092163,
6782 "reward_std": 0.02364097349345684,
6783 "rewards/accuracy_reward": 0.9011726379394531,
6784 "rewards/format_reward": 0.9993489980697632,
6785 "step": 484
6786 },
6787 {
6788 "clip_ratio": 0.0,
6789 "completion_length": 102.662109375,
6790 "epoch": 5.574712643678161,
6791 "grad_norm": 1.7930715875513572,
6792 "kl": 0.045654296875,
6793 "learning_rate": 4.425287356321839e-07,
6794 "loss": 0.0019,
6795 "reward": 1.9116904735565186,
6796 "reward_std": 0.016724035143852234,
6797 "rewards/accuracy_reward": 0.9116904139518738,
6798 "rewards/format_reward": 1.0,
6799 "step": 485
6800 },
6801 {
6802 "clip_ratio": 0.0,
6803 "completion_length": 104.36003112792969,
6804 "epoch": 5.586206896551724,
6805 "grad_norm": 4.334056364497592,
6806 "kl": 0.044921875,
6807 "learning_rate": 4.413793103448276e-07,
6808 "loss": 0.0018,
6809 "reward": 1.895902156829834,
6810 "reward_std": 0.02204793691635132,
6811 "rewards/accuracy_reward": 0.8959023356437683,
6812 "rewards/format_reward": 1.0,
6813 "step": 486
6814 },
6815 {
6816 "clip_ratio": 0.0,
6817 "completion_length": 104.138671875,
6818 "epoch": 5.597701149425287,
6819 "grad_norm": 1.9027704497320623,
6820 "kl": 0.044921875,
6821 "learning_rate": 4.402298850574712e-07,
6822 "loss": 0.0019,
6823 "reward": 1.8900482654571533,
6824 "reward_std": 0.019576409831643105,
6825 "rewards/accuracy_reward": 0.8900482654571533,
6826 "rewards/format_reward": 1.0,
6827 "step": 487
6828 },
6829 {
6830 "clip_ratio": 0.0,
6831 "completion_length": 103.41471862792969,
6832 "epoch": 5.609195402298851,
6833 "grad_norm": 3.118890962601994,
6834 "kl": 0.044189453125,
6835 "learning_rate": 4.3908045977011495e-07,
6836 "loss": 0.0018,
6837 "reward": 1.890057921409607,
6838 "reward_std": 0.01878949999809265,
6839 "rewards/accuracy_reward": 0.8907088041305542,
6840 "rewards/format_reward": 0.9993489980697632,
6841 "step": 488
6842 },
6843 {
6844 "clip_ratio": 0.0,
6845 "completion_length": 104.119140625,
6846 "epoch": 5.620689655172414,
6847 "grad_norm": 1.369384680255404,
6848 "kl": 0.045166015625,
6849 "learning_rate": 4.379310344827586e-07,
6850 "loss": 0.0019,
6851 "reward": 1.8981773853302002,
6852 "reward_std": 0.020252332091331482,
6853 "rewards/accuracy_reward": 0.8981773853302002,
6854 "rewards/format_reward": 1.0,
6855 "step": 489
6856 },
6857 {
6858 "clip_ratio": 0.0,
6859 "completion_length": 102.96028900146484,
6860 "epoch": 5.6321839080459775,
6861 "grad_norm": 1.3682700993582457,
6862 "kl": 0.04150390625,
6863 "learning_rate": 4.367816091954023e-07,
6864 "loss": 0.0017,
6865 "reward": 1.8991467952728271,
6866 "reward_std": 0.019649513065814972,
6867 "rewards/accuracy_reward": 0.8991466760635376,
6868 "rewards/format_reward": 1.0,
6869 "step": 490
6870 },
6871 {
6872 "clip_ratio": 0.0,
6873 "completion_length": 104.962890625,
6874 "epoch": 5.64367816091954,
6875 "grad_norm": 1.7998521186414984,
6876 "kl": 0.046875,
6877 "learning_rate": 4.3563218390804597e-07,
6878 "loss": 0.0019,
6879 "reward": 1.8821454048156738,
6880 "reward_std": 0.019271574914455414,
6881 "rewards/accuracy_reward": 0.8821454048156738,
6882 "rewards/format_reward": 1.0,
6883 "step": 491
6884 },
6885 {
6886 "clip_ratio": 0.0,
6887 "completion_length": 103.05208587646484,
6888 "epoch": 5.655172413793103,
6889 "grad_norm": 2.074675270916482,
6890 "kl": 0.046875,
6891 "learning_rate": 4.344827586206896e-07,
6892 "loss": 0.0019,
6893 "reward": 1.9152133464813232,
6894 "reward_std": 0.019003426656126976,
6895 "rewards/accuracy_reward": 0.9152132868766785,
6896 "rewards/format_reward": 1.0,
6897 "step": 492
6898 },
6899 {
6900 "clip_ratio": 0.0,
6901 "completion_length": 102.97396087646484,
6902 "epoch": 5.666666666666667,
6903 "grad_norm": 1.6500508152691888,
6904 "kl": 0.047607421875,
6905 "learning_rate": 4.3333333333333335e-07,
6906 "loss": 0.002,
6907 "reward": 1.9029719829559326,
6908 "reward_std": 0.019596107304096222,
6909 "rewards/accuracy_reward": 0.9029721021652222,
6910 "rewards/format_reward": 1.0,
6911 "step": 493
6912 },
6913 {
6914 "clip_ratio": 0.0,
6915 "completion_length": 103.75130462646484,
6916 "epoch": 5.67816091954023,
6917 "grad_norm": 1.7635232706402635,
6918 "kl": 0.043701171875,
6919 "learning_rate": 4.32183908045977e-07,
6920 "loss": 0.0018,
6921 "reward": 1.8977859020233154,
6922 "reward_std": 0.01936359517276287,
6923 "rewards/accuracy_reward": 0.8977858424186707,
6924 "rewards/format_reward": 1.0,
6925 "step": 494
6926 },
6927 {
6928 "clip_ratio": 0.0,
6929 "completion_length": 104.8671875,
6930 "epoch": 5.689655172413794,
6931 "grad_norm": 7.5630569258900815,
6932 "kl": 0.049560546875,
6933 "learning_rate": 4.310344827586206e-07,
6934 "loss": 0.0021,
6935 "reward": 1.8984394073486328,
6936 "reward_std": 0.020785300061106682,
6937 "rewards/accuracy_reward": 0.8984395265579224,
6938 "rewards/format_reward": 1.0,
6939 "step": 495
6940 },
6941 {
6942 "clip_ratio": 0.0,
6943 "completion_length": 104.19075775146484,
6944 "epoch": 5.7011494252873565,
6945 "grad_norm": 1.9789378304791254,
6946 "kl": 0.04541015625,
6947 "learning_rate": 4.2988505747126437e-07,
6948 "loss": 0.0019,
6949 "reward": 1.8898248672485352,
6950 "reward_std": 0.019473157823085785,
6951 "rewards/accuracy_reward": 0.8898249864578247,
6952 "rewards/format_reward": 1.0,
6953 "step": 496
6954 },
6955 {
6956 "clip_ratio": 0.0,
6957 "completion_length": 105.52214050292969,
6958 "epoch": 5.712643678160919,
6959 "grad_norm": 2.230731469046121,
6960 "kl": 0.046875,
6961 "learning_rate": 4.28735632183908e-07,
6962 "loss": 0.0019,
6963 "reward": 1.8999899625778198,
6964 "reward_std": 0.017196331173181534,
6965 "rewards/accuracy_reward": 0.8999900221824646,
6966 "rewards/format_reward": 1.0,
6967 "step": 497
6968 },
6969 {
6970 "clip_ratio": 0.0,
6971 "completion_length": 102.529296875,
6972 "epoch": 5.724137931034483,
6973 "grad_norm": 1.8172179930249694,
6974 "kl": 0.04736328125,
6975 "learning_rate": 4.2758620689655174e-07,
6976 "loss": 0.002,
6977 "reward": 1.9146230220794678,
6978 "reward_std": 0.016667529940605164,
6979 "rewards/accuracy_reward": 0.9146231412887573,
6980 "rewards/format_reward": 1.0,
6981 "step": 498
6982 },
6983 {
6984 "clip_ratio": 0.0,
6985 "completion_length": 102.14323425292969,
6986 "epoch": 5.735632183908046,
6987 "grad_norm": 1.7799144316476456,
6988 "kl": 0.051025390625,
6989 "learning_rate": 4.264367816091954e-07,
6990 "loss": 0.0021,
6991 "reward": 1.8982200622558594,
6992 "reward_std": 0.01865963265299797,
6993 "rewards/accuracy_reward": 0.8982200026512146,
6994 "rewards/format_reward": 1.0,
6995 "step": 499
6996 },
6997 {
6998 "clip_ratio": 0.0,
6999 "completion_length": 102.78060150146484,
7000 "epoch": 5.747126436781609,
7001 "grad_norm": 3.2056367063069513,
7002 "kl": 0.045166015625,
7003 "learning_rate": 4.25287356321839e-07,
7004 "loss": 0.0019,
7005 "reward": 1.9110320806503296,
7006 "reward_std": 0.018477408215403557,
7007 "rewards/accuracy_reward": 0.9110321998596191,
7008 "rewards/format_reward": 1.0,
7009 "step": 500
7010 },
7011 {
7012 "clip_ratio": 0.0,
7013 "completion_length": 104.09375,
7014 "epoch": 5.758620689655173,
7015 "grad_norm": 2.1278601395571775,
7016 "kl": 0.04345703125,
7017 "learning_rate": 4.2413793103448276e-07,
7018 "loss": 0.0018,
7019 "reward": 1.896863341331482,
7020 "reward_std": 0.02090194821357727,
7021 "rewards/accuracy_reward": 0.8981654047966003,
7022 "rewards/format_reward": 0.9986979365348816,
7023 "step": 501
7024 },
7025 {
7026 "clip_ratio": 0.0,
7027 "completion_length": 105.63346862792969,
7028 "epoch": 5.7701149425287355,
7029 "grad_norm": 1.933803516191336,
7030 "kl": 0.046875,
7031 "learning_rate": 4.229885057471264e-07,
7032 "loss": 0.002,
7033 "reward": 1.8992946147918701,
7034 "reward_std": 0.01685231737792492,
7035 "rewards/accuracy_reward": 0.8992947340011597,
7036 "rewards/format_reward": 1.0,
7037 "step": 502
7038 },
7039 {
7040 "clip_ratio": 0.0,
7041 "completion_length": 102.67903900146484,
7042 "epoch": 5.781609195402299,
7043 "grad_norm": 2.5798280611914204,
7044 "kl": 0.049072265625,
7045 "learning_rate": 4.218390804597701e-07,
7046 "loss": 0.002,
7047 "reward": 1.8981560468673706,
7048 "reward_std": 0.017263038083910942,
7049 "rewards/accuracy_reward": 0.8981560468673706,
7050 "rewards/format_reward": 1.0,
7051 "step": 503
7052 },
7053 {
7054 "clip_ratio": 0.0,
7055 "completion_length": 105.14583587646484,
7056 "epoch": 5.793103448275862,
7057 "grad_norm": 2.076692138986448,
7058 "kl": 0.043701171875,
7059 "learning_rate": 4.206896551724138e-07,
7060 "loss": 0.0018,
7061 "reward": 1.8787815570831299,
7062 "reward_std": 0.019007613882422447,
7063 "rewards/accuracy_reward": 0.8787816762924194,
7064 "rewards/format_reward": 1.0,
7065 "step": 504
7066 },
7067 {
7068 "clip_ratio": 0.0,
7069 "completion_length": 104.9453125,
7070 "epoch": 5.804597701149425,
7071 "grad_norm": 2.062105078777337,
7072 "kl": 0.049072265625,
7073 "learning_rate": 4.195402298850574e-07,
7074 "loss": 0.002,
7075 "reward": 1.8786977529525757,
7076 "reward_std": 0.018628563731908798,
7077 "rewards/accuracy_reward": 0.8786976337432861,
7078 "rewards/format_reward": 1.0,
7079 "step": 505
7080 },
7081 {
7082 "clip_ratio": 0.0,
7083 "completion_length": 105.208984375,
7084 "epoch": 5.816091954022989,
7085 "grad_norm": 3.4179007868834175,
7086 "kl": 0.04443359375,
7087 "learning_rate": 4.1839080459770116e-07,
7088 "loss": 0.0019,
7089 "reward": 1.9090993404388428,
7090 "reward_std": 0.018269993364810944,
7091 "rewards/accuracy_reward": 0.9090994596481323,
7092 "rewards/format_reward": 1.0,
7093 "step": 506
7094 },
7095 {
7096 "clip_ratio": 0.0,
7097 "completion_length": 104.859375,
7098 "epoch": 5.827586206896552,
7099 "grad_norm": 1.642778882446284,
7100 "kl": 0.04541015625,
7101 "learning_rate": 4.172413793103448e-07,
7102 "loss": 0.0019,
7103 "reward": 1.895350694656372,
7104 "reward_std": 0.017278993502259254,
7105 "rewards/accuracy_reward": 0.8960016965866089,
7106 "rewards/format_reward": 0.9993489980697632,
7107 "step": 507
7108 },
7109 {
7110 "clip_ratio": 0.0,
7111 "completion_length": 105.00521087646484,
7112 "epoch": 5.8390804597701145,
7113 "grad_norm": 2.6806085379649702,
7114 "kl": 0.049072265625,
7115 "learning_rate": 4.160919540229885e-07,
7116 "loss": 0.002,
7117 "reward": 1.9006847143173218,
7118 "reward_std": 0.016903840005397797,
7119 "rewards/accuracy_reward": 0.9006847143173218,
7120 "rewards/format_reward": 1.0,
7121 "step": 508
7122 },
7123 {
7124 "clip_ratio": 0.0,
7125 "completion_length": 104.69466400146484,
7126 "epoch": 5.850574712643678,
7127 "grad_norm": 3.6854562093571386,
7128 "kl": 0.05859375,
7129 "learning_rate": 4.149425287356322e-07,
7130 "loss": 0.0024,
7131 "reward": 1.908639907836914,
7132 "reward_std": 0.016876667737960815,
7133 "rewards/accuracy_reward": 0.9086400866508484,
7134 "rewards/format_reward": 1.0,
7135 "step": 509
7136 },
7137 {
7138 "clip_ratio": 0.0,
7139 "completion_length": 105.81120300292969,
7140 "epoch": 5.862068965517241,
7141 "grad_norm": 3.453501517404399,
7142 "kl": 0.0517578125,
7143 "learning_rate": 4.1379310344827586e-07,
7144 "loss": 0.0022,
7145 "reward": 1.903754472732544,
7146 "reward_std": 0.019029080867767334,
7147 "rewards/accuracy_reward": 0.9044055342674255,
7148 "rewards/format_reward": 0.9993489980697632,
7149 "step": 510
7150 },
7151 {
7152 "clip_ratio": 0.0,
7153 "completion_length": 105.5859375,
7154 "epoch": 5.873563218390805,
7155 "grad_norm": 1.803202256070274,
7156 "kl": 0.054443359375,
7157 "learning_rate": 4.126436781609195e-07,
7158 "loss": 0.0022,
7159 "reward": 1.8829306364059448,
7160 "reward_std": 0.01749059185385704,
7161 "rewards/accuracy_reward": 0.8829306364059448,
7162 "rewards/format_reward": 1.0,
7163 "step": 511
7164 },
7165 {
7166 "clip_ratio": 0.0,
7167 "completion_length": 104.27995300292969,
7168 "epoch": 5.885057471264368,
7169 "grad_norm": 1.5556838404555604,
7170 "kl": 0.044677734375,
7171 "learning_rate": 4.114942528735632e-07,
7172 "loss": 0.0019,
7173 "reward": 1.9034370183944702,
7174 "reward_std": 0.018126491457223892,
7175 "rewards/accuracy_reward": 0.903437077999115,
7176 "rewards/format_reward": 1.0,
7177 "step": 512
7178 },
7179 {
7180 "clip_ratio": 0.0,
7181 "completion_length": 105.17057800292969,
7182 "epoch": 5.896551724137931,
7183 "grad_norm": 3.4957318575488383,
7184 "kl": 0.0498046875,
7185 "learning_rate": 4.103448275862069e-07,
7186 "loss": 0.002,
7187 "reward": 1.9079878330230713,
7188 "reward_std": 0.01815355196595192,
7189 "rewards/accuracy_reward": 0.9079879522323608,
7190 "rewards/format_reward": 1.0,
7191 "step": 513
7192 },
7193 {
7194 "clip_ratio": 0.0,
7195 "completion_length": 106.59765625,
7196 "epoch": 5.908045977011494,
7197 "grad_norm": 2.707726805902382,
7198 "kl": 0.044921875,
7199 "learning_rate": 4.0919540229885057e-07,
7200 "loss": 0.0019,
7201 "reward": 1.884354591369629,
7202 "reward_std": 0.018658233806490898,
7203 "rewards/accuracy_reward": 0.8843547105789185,
7204 "rewards/format_reward": 1.0,
7205 "step": 514
7206 },
7207 {
7208 "clip_ratio": 0.0,
7209 "completion_length": 104.71354675292969,
7210 "epoch": 5.919540229885057,
7211 "grad_norm": 1.4965768623048503,
7212 "kl": 0.04931640625,
7213 "learning_rate": 4.0804597701149426e-07,
7214 "loss": 0.0021,
7215 "reward": 1.8936374187469482,
7216 "reward_std": 0.01746196486055851,
7217 "rewards/accuracy_reward": 0.8936373591423035,
7218 "rewards/format_reward": 1.0,
7219 "step": 515
7220 },
7221 {
7222 "clip_ratio": 0.0,
7223 "completion_length": 106.97982025146484,
7224 "epoch": 5.931034482758621,
7225 "grad_norm": 2.9479475900202115,
7226 "kl": 0.046630859375,
7227 "learning_rate": 4.068965517241379e-07,
7228 "loss": 0.0019,
7229 "reward": 1.902561902999878,
7230 "reward_std": 0.01716381497681141,
7231 "rewards/accuracy_reward": 0.9025619029998779,
7232 "rewards/format_reward": 1.0,
7233 "step": 516
7234 },
7235 {
7236 "clip_ratio": 0.0,
7237 "completion_length": 107.25456237792969,
7238 "epoch": 5.942528735632184,
7239 "grad_norm": 2.5970229292314677,
7240 "kl": 0.05029296875,
7241 "learning_rate": 4.057471264367816e-07,
7242 "loss": 0.0021,
7243 "reward": 1.8918704986572266,
7244 "reward_std": 0.019317103549838066,
7245 "rewards/accuracy_reward": 0.8918704986572266,
7246 "rewards/format_reward": 1.0,
7247 "step": 517
7248 },
7249 {
7250 "clip_ratio": 0.0,
7251 "completion_length": 105.76888275146484,
7252 "epoch": 5.954022988505747,
7253 "grad_norm": 1.9125407656929412,
7254 "kl": 0.045654296875,
7255 "learning_rate": 4.045977011494253e-07,
7256 "loss": 0.0019,
7257 "reward": 1.89088773727417,
7258 "reward_std": 0.0201788991689682,
7259 "rewards/accuracy_reward": 0.8915388584136963,
7260 "rewards/format_reward": 0.9993489980697632,
7261 "step": 518
7262 },
7263 {
7264 "clip_ratio": 0.0,
7265 "completion_length": 105.34375,
7266 "epoch": 5.9655172413793105,
7267 "grad_norm": 1.906615415914849,
7268 "kl": 0.05322265625,
7269 "learning_rate": 4.034482758620689e-07,
7270 "loss": 0.0022,
7271 "reward": 1.8955434560775757,
7272 "reward_std": 0.017683900892734528,
7273 "rewards/accuracy_reward": 0.8955433368682861,
7274 "rewards/format_reward": 1.0,
7275 "step": 519
7276 },
7277 {
7278 "clip_ratio": 0.0,
7279 "completion_length": 103.95573425292969,
7280 "epoch": 5.977011494252873,
7281 "grad_norm": 1.930378197428578,
7282 "kl": 0.052734375,
7283 "learning_rate": 4.0229885057471266e-07,
7284 "loss": 0.0021,
7285 "reward": 1.9113073348999023,
7286 "reward_std": 0.015919553115963936,
7287 "rewards/accuracy_reward": 0.9113074541091919,
7288 "rewards/format_reward": 1.0,
7289 "step": 520
7290 },
7291 {
7292 "clip_ratio": 0.0,
7293 "completion_length": 104.345703125,
7294 "epoch": 5.988505747126437,
7295 "grad_norm": 2.559459936562671,
7296 "kl": 0.048828125,
7297 "learning_rate": 4.011494252873563e-07,
7298 "loss": 0.0021,
7299 "reward": 1.903090476989746,
7300 "reward_std": 0.017953019589185715,
7301 "rewards/accuracy_reward": 0.9030904769897461,
7302 "rewards/format_reward": 1.0,
7303 "step": 521
7304 },
7305 {
7306 "clip_ratio": 0.0,
7307 "completion_length": 101.29354095458984,
7308 "epoch": 6.0,
7309 "grad_norm": 1.320684949997662,
7310 "kl": 0.043212890625,
7311 "learning_rate": 4e-07,
7312 "loss": 0.0018,
7313 "reward": 1.9057987928390503,
7314 "reward_std": 0.014725634828209877,
7315 "rewards/accuracy_reward": 0.9057987332344055,
7316 "rewards/format_reward": 1.0,
7317 "step": 522
7318 },
7319 {
7320 "clip_ratio": 0.0,
7321 "completion_length": 107.42317962646484,
7322 "epoch": 6.011494252873563,
7323 "grad_norm": 1.785801287990398,
7324 "kl": 0.046875,
7325 "learning_rate": 3.9885057471264367e-07,
7326 "loss": 0.0019,
7327 "reward": 1.8730857372283936,
7328 "reward_std": 0.016687028110027313,
7329 "rewards/accuracy_reward": 0.8730856776237488,
7330 "rewards/format_reward": 1.0,
7331 "step": 523
7332 },
7333 {
7334 "clip_ratio": 0.0,
7335 "completion_length": 107.64778900146484,
7336 "epoch": 6.022988505747127,
7337 "grad_norm": 2.818216992605221,
7338 "kl": 0.05126953125,
7339 "learning_rate": 3.977011494252873e-07,
7340 "loss": 0.0021,
7341 "reward": 1.8802497386932373,
7342 "reward_std": 0.019351143389940262,
7343 "rewards/accuracy_reward": 0.8802497386932373,
7344 "rewards/format_reward": 1.0,
7345 "step": 524
7346 },
7347 {
7348 "clip_ratio": 0.0,
7349 "completion_length": 104.970703125,
7350 "epoch": 6.0344827586206895,
7351 "grad_norm": 1.373324008322693,
7352 "kl": 0.04833984375,
7353 "learning_rate": 3.9655172413793105e-07,
7354 "loss": 0.002,
7355 "reward": 1.8902562856674194,
7356 "reward_std": 0.017598673701286316,
7357 "rewards/accuracy_reward": 0.8902562856674194,
7358 "rewards/format_reward": 1.0,
7359 "step": 525
7360 },
7361 {
7362 "clip_ratio": 0.0,
7363 "completion_length": 104.61263275146484,
7364 "epoch": 6.045977011494253,
7365 "grad_norm": 1.4203897246386383,
7366 "kl": 0.046630859375,
7367 "learning_rate": 3.954022988505747e-07,
7368 "loss": 0.002,
7369 "reward": 1.896735668182373,
7370 "reward_std": 0.0163632333278656,
7371 "rewards/accuracy_reward": 0.896735668182373,
7372 "rewards/format_reward": 1.0,
7373 "step": 526
7374 },
7375 {
7376 "clip_ratio": 0.0,
7377 "completion_length": 103.630859375,
7378 "epoch": 6.057471264367816,
7379 "grad_norm": 1.9485819155646742,
7380 "kl": 0.047607421875,
7381 "learning_rate": 3.942528735632183e-07,
7382 "loss": 0.002,
7383 "reward": 1.8787517547607422,
7384 "reward_std": 0.020119938999414444,
7385 "rewards/accuracy_reward": 0.879402756690979,
7386 "rewards/format_reward": 0.9993489980697632,
7387 "step": 527
7388 },
7389 {
7390 "clip_ratio": 0.0,
7391 "completion_length": 104.02214050292969,
7392 "epoch": 6.068965517241379,
7393 "grad_norm": 2.495312428574511,
7394 "kl": 0.050537109375,
7395 "learning_rate": 3.9310344827586207e-07,
7396 "loss": 0.0021,
7397 "reward": 1.9046882390975952,
7398 "reward_std": 0.01929212175309658,
7399 "rewards/accuracy_reward": 0.9046882390975952,
7400 "rewards/format_reward": 1.0,
7401 "step": 528
7402 },
7403 {
7404 "clip_ratio": 0.0,
7405 "completion_length": 103.466796875,
7406 "epoch": 6.080459770114943,
7407 "grad_norm": 1.8085811940047551,
7408 "kl": 0.048583984375,
7409 "learning_rate": 3.919540229885057e-07,
7410 "loss": 0.002,
7411 "reward": 1.8941335678100586,
7412 "reward_std": 0.0205199234187603,
7413 "rewards/accuracy_reward": 0.894133448600769,
7414 "rewards/format_reward": 1.0,
7415 "step": 529
7416 },
7417 {
7418 "clip_ratio": 0.0,
7419 "completion_length": 102.52018737792969,
7420 "epoch": 6.091954022988506,
7421 "grad_norm": 5.365446513796388,
7422 "kl": 0.04638671875,
7423 "learning_rate": 3.9080459770114945e-07,
7424 "loss": 0.0019,
7425 "reward": 1.9010367393493652,
7426 "reward_std": 0.017713043838739395,
7427 "rewards/accuracy_reward": 0.9010368585586548,
7428 "rewards/format_reward": 1.0,
7429 "step": 530
7430 },
7431 {
7432 "clip_ratio": 0.0,
7433 "completion_length": 104.74089050292969,
7434 "epoch": 6.103448275862069,
7435 "grad_norm": 2.828191865732173,
7436 "kl": 0.049072265625,
7437 "learning_rate": 3.896551724137931e-07,
7438 "loss": 0.002,
7439 "reward": 1.8785228729248047,
7440 "reward_std": 0.02002035826444626,
7441 "rewards/accuracy_reward": 0.8785228133201599,
7442 "rewards/format_reward": 1.0,
7443 "step": 531
7444 },
7445 {
7446 "clip_ratio": 0.0,
7447 "completion_length": 103.07747650146484,
7448 "epoch": 6.114942528735632,
7449 "grad_norm": 4.047622387425111,
7450 "kl": 0.04443359375,
7451 "learning_rate": 3.885057471264367e-07,
7452 "loss": 0.0018,
7453 "reward": 1.8791890144348145,
7454 "reward_std": 0.01751401089131832,
7455 "rewards/accuracy_reward": 0.8791891932487488,
7456 "rewards/format_reward": 1.0,
7457 "step": 532
7458 },
7459 {
7460 "clip_ratio": 0.0,
7461 "completion_length": 103.36263275146484,
7462 "epoch": 6.126436781609195,
7463 "grad_norm": 1.3128782551208291,
7464 "kl": 0.044189453125,
7465 "learning_rate": 3.8735632183908046e-07,
7466 "loss": 0.0019,
7467 "reward": 1.9140952825546265,
7468 "reward_std": 0.01683102920651436,
7469 "rewards/accuracy_reward": 0.9140952825546265,
7470 "rewards/format_reward": 1.0,
7471 "step": 533
7472 },
7473 {
7474 "clip_ratio": 0.0,
7475 "completion_length": 105.18815612792969,
7476 "epoch": 6.137931034482759,
7477 "grad_norm": 1.933372124731939,
7478 "kl": 0.038818359375,
7479 "learning_rate": 3.862068965517241e-07,
7480 "loss": 0.0016,
7481 "reward": 1.8693971633911133,
7482 "reward_std": 0.016525980085134506,
7483 "rewards/accuracy_reward": 0.8693971633911133,
7484 "rewards/format_reward": 1.0,
7485 "step": 534
7486 },
7487 {
7488 "clip_ratio": 0.0,
7489 "completion_length": 107.75521087646484,
7490 "epoch": 6.149425287356322,
7491 "grad_norm": 1.6333597141209881,
7492 "kl": 0.04150390625,
7493 "learning_rate": 3.850574712643678e-07,
7494 "loss": 0.0018,
7495 "reward": 1.90386962890625,
7496 "reward_std": 0.015211975201964378,
7497 "rewards/accuracy_reward": 0.9038697481155396,
7498 "rewards/format_reward": 1.0,
7499 "step": 535
7500 },
7501 {
7502 "clip_ratio": 0.0,
7503 "completion_length": 107.96549987792969,
7504 "epoch": 6.160919540229885,
7505 "grad_norm": 1.2717871465323818,
7506 "kl": 0.0419921875,
7507 "learning_rate": 3.839080459770115e-07,
7508 "loss": 0.0018,
7509 "reward": 1.8973326683044434,
7510 "reward_std": 0.01976935565471649,
7511 "rewards/accuracy_reward": 0.8979837894439697,
7512 "rewards/format_reward": 0.9993489980697632,
7513 "step": 536
7514 },
7515 {
7516 "clip_ratio": 0.0,
7517 "completion_length": 107.41341400146484,
7518 "epoch": 6.172413793103448,
7519 "grad_norm": 2.481211895104965,
7520 "kl": 0.04248046875,
7521 "learning_rate": 3.827586206896551e-07,
7522 "loss": 0.0018,
7523 "reward": 1.9014673233032227,
7524 "reward_std": 0.018813788890838623,
7525 "rewards/accuracy_reward": 0.9014673233032227,
7526 "rewards/format_reward": 1.0,
7527 "step": 537
7528 },
7529 {
7530 "clip_ratio": 0.0,
7531 "completion_length": 108.07292175292969,
7532 "epoch": 6.183908045977011,
7533 "grad_norm": 1.7579797956745593,
7534 "kl": 0.03759765625,
7535 "learning_rate": 3.8160919540229886e-07,
7536 "loss": 0.0016,
7537 "reward": 1.8949092626571655,
7538 "reward_std": 0.018055422231554985,
7539 "rewards/accuracy_reward": 0.8955603837966919,
7540 "rewards/format_reward": 0.9993489980697632,
7541 "step": 538
7542 },
7543 {
7544 "clip_ratio": 0.0,
7545 "completion_length": 110.69140625,
7546 "epoch": 6.195402298850575,
7547 "grad_norm": 2.0716783964182257,
7548 "kl": 0.045166015625,
7549 "learning_rate": 3.804597701149425e-07,
7550 "loss": 0.0019,
7551 "reward": 1.8800392150878906,
7552 "reward_std": 0.02071218751370907,
7553 "rewards/accuracy_reward": 0.8806902766227722,
7554 "rewards/format_reward": 0.9993489980697632,
7555 "step": 539
7556 },
7557 {
7558 "clip_ratio": 0.0,
7559 "completion_length": 106.82942962646484,
7560 "epoch": 6.206896551724138,
7561 "grad_norm": 3.1245967017230347,
7562 "kl": 0.0390625,
7563 "learning_rate": 3.793103448275862e-07,
7564 "loss": 0.0016,
7565 "reward": 1.8988111019134521,
7566 "reward_std": 0.01829328015446663,
7567 "rewards/accuracy_reward": 0.8988110423088074,
7568 "rewards/format_reward": 1.0,
7569 "step": 540
7570 },
7571 {
7572 "clip_ratio": 0.0,
7573 "completion_length": 107.06901550292969,
7574 "epoch": 6.218390804597701,
7575 "grad_norm": 4.050364995136617,
7576 "kl": 0.05517578125,
7577 "learning_rate": 3.781609195402299e-07,
7578 "loss": 0.0023,
7579 "reward": 1.8841352462768555,
7580 "reward_std": 0.022105498239398003,
7581 "rewards/accuracy_reward": 0.8847863674163818,
7582 "rewards/format_reward": 0.9993489980697632,
7583 "step": 541
7584 },
7585 {
7586 "clip_ratio": 0.0,
7587 "completion_length": 106.82747650146484,
7588 "epoch": 6.2298850574712645,
7589 "grad_norm": 1.8078253085183804,
7590 "kl": 0.038330078125,
7591 "learning_rate": 3.7701149425287357e-07,
7592 "loss": 0.0016,
7593 "reward": 1.910055160522461,
7594 "reward_std": 0.021085239946842194,
7595 "rewards/accuracy_reward": 0.9107062816619873,
7596 "rewards/format_reward": 0.9993489980697632,
7597 "step": 542
7598 },
7599 {
7600 "clip_ratio": 0.0,
7601 "completion_length": 107.75325775146484,
7602 "epoch": 6.241379310344827,
7603 "grad_norm": 1.5902924468702986,
7604 "kl": 0.048583984375,
7605 "learning_rate": 3.758620689655172e-07,
7606 "loss": 0.002,
7607 "reward": 1.8845319747924805,
7608 "reward_std": 0.022094160318374634,
7609 "rewards/accuracy_reward": 0.8845321536064148,
7610 "rewards/format_reward": 1.0,
7611 "step": 543
7612 },
7613 {
7614 "clip_ratio": 0.0,
7615 "completion_length": 107.82682800292969,
7616 "epoch": 6.252873563218391,
7617 "grad_norm": 2.0704586256752253,
7618 "kl": 0.042236328125,
7619 "learning_rate": 3.747126436781609e-07,
7620 "loss": 0.0018,
7621 "reward": 1.8901569843292236,
7622 "reward_std": 0.020656492561101913,
7623 "rewards/accuracy_reward": 0.8901569247245789,
7624 "rewards/format_reward": 1.0,
7625 "step": 544
7626 },
7627 {
7628 "clip_ratio": 0.0,
7629 "completion_length": 107.267578125,
7630 "epoch": 6.264367816091954,
7631 "grad_norm": 2.4508205899480724,
7632 "kl": 0.04248046875,
7633 "learning_rate": 3.735632183908046e-07,
7634 "loss": 0.0018,
7635 "reward": 1.8985986709594727,
7636 "reward_std": 0.021605072543025017,
7637 "rewards/accuracy_reward": 0.8992499113082886,
7638 "rewards/format_reward": 0.9993489980697632,
7639 "step": 545
7640 },
7641 {
7642 "clip_ratio": 0.0,
7643 "completion_length": 107.693359375,
7644 "epoch": 6.275862068965517,
7645 "grad_norm": 1.5807641157109527,
7646 "kl": 0.0439453125,
7647 "learning_rate": 3.7241379310344827e-07,
7648 "loss": 0.0018,
7649 "reward": 1.903763771057129,
7650 "reward_std": 0.020712215453386307,
7651 "rewards/accuracy_reward": 0.9037638902664185,
7652 "rewards/format_reward": 1.0,
7653 "step": 546
7654 },
7655 {
7656 "clip_ratio": 0.0,
7657 "completion_length": 109.677734375,
7658 "epoch": 6.287356321839081,
7659 "grad_norm": 1.8174266396212175,
7660 "kl": 0.043212890625,
7661 "learning_rate": 3.7126436781609196e-07,
7662 "loss": 0.0018,
7663 "reward": 1.8896994590759277,
7664 "reward_std": 0.02042810432612896,
7665 "rewards/accuracy_reward": 0.8896996378898621,
7666 "rewards/format_reward": 1.0,
7667 "step": 547
7668 },
7669 {
7670 "clip_ratio": 0.0,
7671 "completion_length": 108.779296875,
7672 "epoch": 6.2988505747126435,
7673 "grad_norm": 1.3923115045110266,
7674 "kl": 0.03857421875,
7675 "learning_rate": 3.701149425287356e-07,
7676 "loss": 0.0016,
7677 "reward": 1.8963736295700073,
7678 "reward_std": 0.021433323621749878,
7679 "rewards/accuracy_reward": 0.8963736295700073,
7680 "rewards/format_reward": 1.0,
7681 "step": 548
7682 },
7683 {
7684 "clip_ratio": 0.0,
7685 "completion_length": 108.72526550292969,
7686 "epoch": 6.310344827586207,
7687 "grad_norm": 1.7072054434768456,
7688 "kl": 0.041748046875,
7689 "learning_rate": 3.689655172413793e-07,
7690 "loss": 0.0017,
7691 "reward": 1.91050124168396,
7692 "reward_std": 0.02200714498758316,
7693 "rewards/accuracy_reward": 0.9111522436141968,
7694 "rewards/format_reward": 0.9993489980697632,
7695 "step": 549
7696 },
7697 {
7698 "clip_ratio": 0.0,
7699 "completion_length": 108.98567962646484,
7700 "epoch": 6.32183908045977,
7701 "grad_norm": 1.3415596828035086,
7702 "kl": 0.0390625,
7703 "learning_rate": 3.67816091954023e-07,
7704 "loss": 0.0017,
7705 "reward": 1.9038093090057373,
7706 "reward_std": 0.01953584887087345,
7707 "rewards/accuracy_reward": 0.9044603109359741,
7708 "rewards/format_reward": 0.9993489980697632,
7709 "step": 550
7710 },
7711 {
7712 "clip_ratio": 0.0,
7713 "completion_length": 108.85482025146484,
7714 "epoch": 6.333333333333333,
7715 "grad_norm": 1.2624977576240533,
7716 "kl": 0.039306640625,
7717 "learning_rate": 3.666666666666666e-07,
7718 "loss": 0.0017,
7719 "reward": 1.915178656578064,
7720 "reward_std": 0.017109807580709457,
7721 "rewards/accuracy_reward": 0.9151787161827087,
7722 "rewards/format_reward": 1.0,
7723 "step": 551
7724 },
7725 {
7726 "clip_ratio": 0.0,
7727 "completion_length": 108.31185150146484,
7728 "epoch": 6.344827586206897,
7729 "grad_norm": 1.7459891843430122,
7730 "kl": 0.0400390625,
7731 "learning_rate": 3.6551724137931036e-07,
7732 "loss": 0.0017,
7733 "reward": 1.8864727020263672,
7734 "reward_std": 0.02323821559548378,
7735 "rewards/accuracy_reward": 0.8871237635612488,
7736 "rewards/format_reward": 0.9993489980697632,
7737 "step": 552
7738 },
7739 {
7740 "clip_ratio": 0.0,
7741 "completion_length": 106.73893737792969,
7742 "epoch": 6.35632183908046,
7743 "grad_norm": 2.027513640363097,
7744 "kl": 0.042724609375,
7745 "learning_rate": 3.64367816091954e-07,
7746 "loss": 0.0018,
7747 "reward": 1.9012677669525146,
7748 "reward_std": 0.01830562949180603,
7749 "rewards/accuracy_reward": 0.9012677073478699,
7750 "rewards/format_reward": 1.0,
7751 "step": 553
7752 },
7753 {
7754 "clip_ratio": 0.0,
7755 "completion_length": 106.47200775146484,
7756 "epoch": 6.3678160919540225,
7757 "grad_norm": 3.2725385424225064,
7758 "kl": 0.0419921875,
7759 "learning_rate": 3.632183908045977e-07,
7760 "loss": 0.0017,
7761 "reward": 1.9009044170379639,
7762 "reward_std": 0.019121093675494194,
7763 "rewards/accuracy_reward": 0.9009043574333191,
7764 "rewards/format_reward": 1.0,
7765 "step": 554
7766 },
7767 {
7768 "clip_ratio": 0.0,
7769 "completion_length": 106.43034362792969,
7770 "epoch": 6.379310344827586,
7771 "grad_norm": 2.1062592964548665,
7772 "kl": 0.0419921875,
7773 "learning_rate": 3.620689655172414e-07,
7774 "loss": 0.0017,
7775 "reward": 1.9104199409484863,
7776 "reward_std": 0.020279204472899437,
7777 "rewards/accuracy_reward": 0.9110710024833679,
7778 "rewards/format_reward": 0.9993489980697632,
7779 "step": 555
7780 },
7781 {
7782 "clip_ratio": 0.0,
7783 "completion_length": 105.58919525146484,
7784 "epoch": 6.390804597701149,
7785 "grad_norm": 1.4371380643185012,
7786 "kl": 0.04150390625,
7787 "learning_rate": 3.60919540229885e-07,
7788 "loss": 0.0017,
7789 "reward": 1.9064548015594482,
7790 "reward_std": 0.019678324460983276,
7791 "rewards/accuracy_reward": 0.9077568054199219,
7792 "rewards/format_reward": 0.9986979365348816,
7793 "step": 556
7794 },
7795 {
7796 "clip_ratio": 0.0,
7797 "completion_length": 106.373046875,
7798 "epoch": 6.402298850574713,
7799 "grad_norm": 1.4862995937435215,
7800 "kl": 0.041259765625,
7801 "learning_rate": 3.5977011494252875e-07,
7802 "loss": 0.0017,
7803 "reward": 1.9054267406463623,
7804 "reward_std": 0.02058161422610283,
7805 "rewards/accuracy_reward": 0.9054265022277832,
7806 "rewards/format_reward": 1.0,
7807 "step": 557
7808 },
7809 {
7810 "clip_ratio": 0.0,
7811 "completion_length": 105.744140625,
7812 "epoch": 6.413793103448276,
7813 "grad_norm": 1.4955532746736115,
7814 "kl": 0.04052734375,
7815 "learning_rate": 3.586206896551724e-07,
7816 "loss": 0.0017,
7817 "reward": 1.9132366180419922,
7818 "reward_std": 0.0166233628988266,
7819 "rewards/accuracy_reward": 0.9132366180419922,
7820 "rewards/format_reward": 1.0,
7821 "step": 558
7822 },
7823 {
7824 "clip_ratio": 0.0,
7825 "completion_length": 106.3828125,
7826 "epoch": 6.425287356321839,
7827 "grad_norm": 1.5190626491993644,
7828 "kl": 0.038818359375,
7829 "learning_rate": 3.5747126436781603e-07,
7830 "loss": 0.0016,
7831 "reward": 1.9011564254760742,
7832 "reward_std": 0.018908878788352013,
7833 "rewards/accuracy_reward": 0.9018074870109558,
7834 "rewards/format_reward": 0.9993489980697632,
7835 "step": 559
7836 },
7837 {
7838 "clip_ratio": 0.0,
7839 "completion_length": 106.10612487792969,
7840 "epoch": 6.436781609195402,
7841 "grad_norm": 2.3464151581290253,
7842 "kl": 0.041259765625,
7843 "learning_rate": 3.5632183908045977e-07,
7844 "loss": 0.0017,
7845 "reward": 1.8979761600494385,
7846 "reward_std": 0.019787931814789772,
7847 "rewards/accuracy_reward": 0.8986272811889648,
7848 "rewards/format_reward": 0.9993489980697632,
7849 "step": 560
7850 },
7851 {
7852 "clip_ratio": 0.0,
7853 "completion_length": 106.68685150146484,
7854 "epoch": 6.448275862068965,
7855 "grad_norm": 5.8754331087951925,
7856 "kl": 0.04443359375,
7857 "learning_rate": 3.551724137931034e-07,
7858 "loss": 0.0018,
7859 "reward": 1.9075603485107422,
7860 "reward_std": 0.018795132637023926,
7861 "rewards/accuracy_reward": 0.9075603485107422,
7862 "rewards/format_reward": 1.0,
7863 "step": 561
7864 },
7865 {
7866 "clip_ratio": 0.0,
7867 "completion_length": 107.45833587646484,
7868 "epoch": 6.459770114942529,
7869 "grad_norm": 1.995883232305232,
7870 "kl": 0.04296875,
7871 "learning_rate": 3.5402298850574715e-07,
7872 "loss": 0.0018,
7873 "reward": 1.9002068042755127,
7874 "reward_std": 0.019703200086951256,
7875 "rewards/accuracy_reward": 0.9002067446708679,
7876 "rewards/format_reward": 1.0,
7877 "step": 562
7878 },
7879 {
7880 "clip_ratio": 0.0,
7881 "completion_length": 106.5234375,
7882 "epoch": 6.471264367816092,
7883 "grad_norm": 2.589878830540706,
7884 "kl": 0.044189453125,
7885 "learning_rate": 3.528735632183908e-07,
7886 "loss": 0.0018,
7887 "reward": 1.898942470550537,
7888 "reward_std": 0.023017754778265953,
7889 "rewards/accuracy_reward": 0.9008955955505371,
7890 "rewards/format_reward": 0.998046875,
7891 "step": 563
7892 },
7893 {
7894 "clip_ratio": 0.0,
7895 "completion_length": 106.181640625,
7896 "epoch": 6.482758620689655,
7897 "grad_norm": 2.327789501169785,
7898 "kl": 0.044677734375,
7899 "learning_rate": 3.517241379310344e-07,
7900 "loss": 0.0019,
7901 "reward": 1.9264135360717773,
7902 "reward_std": 0.018227433785796165,
7903 "rewards/accuracy_reward": 0.9264135360717773,
7904 "rewards/format_reward": 1.0,
7905 "step": 564
7906 },
7907 {
7908 "clip_ratio": 0.0,
7909 "completion_length": 105.37435150146484,
7910 "epoch": 6.494252873563219,
7911 "grad_norm": 3.3012610405965925,
7912 "kl": 0.044677734375,
7913 "learning_rate": 3.5057471264367817e-07,
7914 "loss": 0.0019,
7915 "reward": 1.882051944732666,
7916 "reward_std": 0.020881911739706993,
7917 "rewards/accuracy_reward": 0.882051944732666,
7918 "rewards/format_reward": 1.0,
7919 "step": 565
7920 },
7921 {
7922 "clip_ratio": 0.0,
7923 "completion_length": 105.64388275146484,
7924 "epoch": 6.505747126436781,
7925 "grad_norm": 1.5695423368611925,
7926 "kl": 0.039306640625,
7927 "learning_rate": 3.494252873563218e-07,
7928 "loss": 0.0016,
7929 "reward": 1.9002180099487305,
7930 "reward_std": 0.016297608613967896,
7931 "rewards/accuracy_reward": 0.9002181887626648,
7932 "rewards/format_reward": 1.0,
7933 "step": 566
7934 },
7935 {
7936 "clip_ratio": 0.0,
7937 "completion_length": 108.037109375,
7938 "epoch": 6.517241379310345,
7939 "grad_norm": 1.3087146389425561,
7940 "kl": 0.04248046875,
7941 "learning_rate": 3.482758620689655e-07,
7942 "loss": 0.0018,
7943 "reward": 1.9085086584091187,
7944 "reward_std": 0.017730899155139923,
7945 "rewards/accuracy_reward": 0.9085086584091187,
7946 "rewards/format_reward": 1.0,
7947 "step": 567
7948 },
7949 {
7950 "clip_ratio": 0.0,
7951 "completion_length": 105.88607025146484,
7952 "epoch": 6.528735632183908,
7953 "grad_norm": 2.971298257475111,
7954 "kl": 0.046142578125,
7955 "learning_rate": 3.471264367816092e-07,
7956 "loss": 0.0019,
7957 "reward": 1.8897184133529663,
7958 "reward_std": 0.022931650280952454,
7959 "rewards/accuracy_reward": 0.8910205364227295,
7960 "rewards/format_reward": 0.9986979365348816,
7961 "step": 568
7962 },
7963 {
7964 "clip_ratio": 0.0,
7965 "completion_length": 105.04362487792969,
7966 "epoch": 6.540229885057471,
7967 "grad_norm": 2.2629176129306514,
7968 "kl": 0.04833984375,
7969 "learning_rate": 3.4597701149425287e-07,
7970 "loss": 0.002,
7971 "reward": 1.888806700706482,
7972 "reward_std": 0.022193504497408867,
7973 "rewards/accuracy_reward": 0.8901089429855347,
7974 "rewards/format_reward": 0.9986979365348816,
7975 "step": 569
7976 },
7977 {
7978 "clip_ratio": 0.0,
7979 "completion_length": 107.546875,
7980 "epoch": 6.551724137931035,
7981 "grad_norm": 1.6844269731795065,
7982 "kl": 0.0458984375,
7983 "learning_rate": 3.4482758620689656e-07,
7984 "loss": 0.0019,
7985 "reward": 1.8828990459442139,
7986 "reward_std": 0.018883569166064262,
7987 "rewards/accuracy_reward": 0.8835498690605164,
7988 "rewards/format_reward": 0.9993489980697632,
7989 "step": 570
7990 },
7991 {
7992 "clip_ratio": 0.0,
7993 "completion_length": 106.8671875,
7994 "epoch": 6.563218390804598,
7995 "grad_norm": 2.284457618723532,
7996 "kl": 0.044921875,
7997 "learning_rate": 3.436781609195402e-07,
7998 "loss": 0.0018,
7999 "reward": 1.884222388267517,
8000 "reward_std": 0.018331531435251236,
8001 "rewards/accuracy_reward": 0.8848733305931091,
8002 "rewards/format_reward": 0.9993489980697632,
8003 "step": 571
8004 },
8005 {
8006 "clip_ratio": 0.0,
8007 "completion_length": 107.43099212646484,
8008 "epoch": 6.574712643678161,
8009 "grad_norm": 1.446359862116147,
8010 "kl": 0.04345703125,
8011 "learning_rate": 3.425287356321839e-07,
8012 "loss": 0.0018,
8013 "reward": 1.9134483337402344,
8014 "reward_std": 0.019138170406222343,
8015 "rewards/accuracy_reward": 0.9134482741355896,
8016 "rewards/format_reward": 1.0,
8017 "step": 572
8018 },
8019 {
8020 "clip_ratio": 0.0,
8021 "completion_length": 107.00130462646484,
8022 "epoch": 6.586206896551724,
8023 "grad_norm": 5.703380489491454,
8024 "kl": 0.05078125,
8025 "learning_rate": 3.413793103448276e-07,
8026 "loss": 0.0021,
8027 "reward": 1.8881677389144897,
8028 "reward_std": 0.018784310668706894,
8029 "rewards/accuracy_reward": 0.8881677389144897,
8030 "rewards/format_reward": 1.0,
8031 "step": 573
8032 },
8033 {
8034 "clip_ratio": 0.0,
8035 "completion_length": 107.427734375,
8036 "epoch": 6.597701149425287,
8037 "grad_norm": 3.5020194713310895,
8038 "kl": 0.04345703125,
8039 "learning_rate": 3.4022988505747127e-07,
8040 "loss": 0.0018,
8041 "reward": 1.8986886739730835,
8042 "reward_std": 0.01682961732149124,
8043 "rewards/accuracy_reward": 0.8986887335777283,
8044 "rewards/format_reward": 1.0,
8045 "step": 574
8046 },
8047 {
8048 "clip_ratio": 0.0,
8049 "completion_length": 109.43424987792969,
8050 "epoch": 6.609195402298851,
8051 "grad_norm": 2.303566130374915,
8052 "kl": 0.04296875,
8053 "learning_rate": 3.390804597701149e-07,
8054 "loss": 0.0018,
8055 "reward": 1.8962440490722656,
8056 "reward_std": 0.02106454037129879,
8057 "rewards/accuracy_reward": 0.8975462913513184,
8058 "rewards/format_reward": 0.9986979365348816,
8059 "step": 575
8060 },
8061 {
8062 "clip_ratio": 0.0,
8063 "completion_length": 108.39583587646484,
8064 "epoch": 6.620689655172414,
8065 "grad_norm": 1.6584764881239733,
8066 "kl": 0.045166015625,
8067 "learning_rate": 3.379310344827586e-07,
8068 "loss": 0.0019,
8069 "reward": 1.88081693649292,
8070 "reward_std": 0.018141578882932663,
8071 "rewards/accuracy_reward": 0.8808168768882751,
8072 "rewards/format_reward": 1.0,
8073 "step": 576
8074 },
8075 {
8076 "clip_ratio": 0.0,
8077 "completion_length": 108.30143737792969,
8078 "epoch": 6.6321839080459775,
8079 "grad_norm": 4.350037902113492,
8080 "kl": 0.048828125,
8081 "learning_rate": 3.367816091954023e-07,
8082 "loss": 0.002,
8083 "reward": 1.8807071447372437,
8084 "reward_std": 0.019819162786006927,
8085 "rewards/accuracy_reward": 0.8807072043418884,
8086 "rewards/format_reward": 1.0,
8087 "step": 577
8088 },
8089 {
8090 "clip_ratio": 0.0,
8091 "completion_length": 108.64453125,
8092 "epoch": 6.64367816091954,
8093 "grad_norm": 1.8183763915247184,
8094 "kl": 0.04736328125,
8095 "learning_rate": 3.35632183908046e-07,
8096 "loss": 0.002,
8097 "reward": 1.8958330154418945,
8098 "reward_std": 0.017030756920576096,
8099 "rewards/accuracy_reward": 0.8958329558372498,
8100 "rewards/format_reward": 1.0,
8101 "step": 578
8102 },
8103 {
8104 "clip_ratio": 0.0,
8105 "completion_length": 107.712890625,
8106 "epoch": 6.655172413793103,
8107 "grad_norm": 1.6800458122792343,
8108 "kl": 0.045166015625,
8109 "learning_rate": 3.3448275862068966e-07,
8110 "loss": 0.0019,
8111 "reward": 1.899322748184204,
8112 "reward_std": 0.01855292171239853,
8113 "rewards/accuracy_reward": 0.8993227481842041,
8114 "rewards/format_reward": 1.0,
8115 "step": 579
8116 },
8117 {
8118 "clip_ratio": 0.0,
8119 "completion_length": 107.41862487792969,
8120 "epoch": 6.666666666666667,
8121 "grad_norm": 1.5389866031610586,
8122 "kl": 0.048828125,
8123 "learning_rate": 3.333333333333333e-07,
8124 "loss": 0.002,
8125 "reward": 1.9041564464569092,
8126 "reward_std": 0.019357208162546158,
8127 "rewards/accuracy_reward": 0.9041563868522644,
8128 "rewards/format_reward": 1.0,
8129 "step": 580
8130 },
8131 {
8132 "clip_ratio": 0.0,
8133 "completion_length": 106.85482025146484,
8134 "epoch": 6.67816091954023,
8135 "grad_norm": 1.854004391136165,
8136 "kl": 0.046630859375,
8137 "learning_rate": 3.32183908045977e-07,
8138 "loss": 0.0019,
8139 "reward": 1.896866798400879,
8140 "reward_std": 0.02253865823149681,
8141 "rewards/accuracy_reward": 0.8981690406799316,
8142 "rewards/format_reward": 0.9986979365348816,
8143 "step": 581
8144 },
8145 {
8146 "clip_ratio": 0.0,
8147 "completion_length": 106.93359375,
8148 "epoch": 6.689655172413794,
8149 "grad_norm": 2.018641316913995,
8150 "kl": 0.048828125,
8151 "learning_rate": 3.310344827586207e-07,
8152 "loss": 0.002,
8153 "reward": 1.9180282354354858,
8154 "reward_std": 0.01753108948469162,
8155 "rewards/accuracy_reward": 0.9180280566215515,
8156 "rewards/format_reward": 1.0,
8157 "step": 582
8158 },
8159 {
8160 "clip_ratio": 0.0,
8161 "completion_length": 108.14192962646484,
8162 "epoch": 6.7011494252873565,
8163 "grad_norm": 2.590727908860308,
8164 "kl": 0.047119140625,
8165 "learning_rate": 3.298850574712643e-07,
8166 "loss": 0.002,
8167 "reward": 1.8914936780929565,
8168 "reward_std": 0.019188500940799713,
8169 "rewards/accuracy_reward": 0.8921446800231934,
8170 "rewards/format_reward": 0.9993489980697632,
8171 "step": 583
8172 },
8173 {
8174 "clip_ratio": 0.0,
8175 "completion_length": 106.40625,
8176 "epoch": 6.712643678160919,
8177 "grad_norm": 1.4866576141042993,
8178 "kl": 0.05029296875,
8179 "learning_rate": 3.2873563218390806e-07,
8180 "loss": 0.0021,
8181 "reward": 1.900843620300293,
8182 "reward_std": 0.02220628783106804,
8183 "rewards/accuracy_reward": 0.9014946818351746,
8184 "rewards/format_reward": 0.9993489980697632,
8185 "step": 584
8186 },
8187 {
8188 "clip_ratio": 0.0,
8189 "completion_length": 105.1640625,
8190 "epoch": 6.724137931034483,
8191 "grad_norm": 1.7834180698321564,
8192 "kl": 0.049560546875,
8193 "learning_rate": 3.275862068965517e-07,
8194 "loss": 0.002,
8195 "reward": 1.8919929265975952,
8196 "reward_std": 0.019489863887429237,
8197 "rewards/accuracy_reward": 0.8919928073883057,
8198 "rewards/format_reward": 1.0,
8199 "step": 585
8200 },
8201 {
8202 "clip_ratio": 0.0,
8203 "completion_length": 104.11263275146484,
8204 "epoch": 6.735632183908046,
8205 "grad_norm": 2.080295738757342,
8206 "kl": 0.0517578125,
8207 "learning_rate": 3.264367816091954e-07,
8208 "loss": 0.0021,
8209 "reward": 1.9068288803100586,
8210 "reward_std": 0.01798483356833458,
8211 "rewards/accuracy_reward": 0.9068288803100586,
8212 "rewards/format_reward": 1.0,
8213 "step": 586
8214 },
8215 {
8216 "clip_ratio": 0.0,
8217 "completion_length": 106.28515625,
8218 "epoch": 6.747126436781609,
8219 "grad_norm": 1.363971268029249,
8220 "kl": 0.05322265625,
8221 "learning_rate": 3.252873563218391e-07,
8222 "loss": 0.0022,
8223 "reward": 1.896977186203003,
8224 "reward_std": 0.01630130037665367,
8225 "rewards/accuracy_reward": 0.8969771265983582,
8226 "rewards/format_reward": 1.0,
8227 "step": 587
8228 },
8229 {
8230 "clip_ratio": 0.0,
8231 "completion_length": 106.81120300292969,
8232 "epoch": 6.758620689655173,
8233 "grad_norm": 1.8998231610338354,
8234 "kl": 0.04541015625,
8235 "learning_rate": 3.241379310344827e-07,
8236 "loss": 0.0019,
8237 "reward": 1.8929922580718994,
8238 "reward_std": 0.018269825726747513,
8239 "rewards/accuracy_reward": 0.8929921984672546,
8240 "rewards/format_reward": 1.0,
8241 "step": 588
8242 },
8243 {
8244 "clip_ratio": 0.0,
8245 "completion_length": 104.96810150146484,
8246 "epoch": 6.7701149425287355,
8247 "grad_norm": 1.4654423135239003,
8248 "kl": 0.055908203125,
8249 "learning_rate": 3.2298850574712646e-07,
8250 "loss": 0.0023,
8251 "reward": 1.9026551246643066,
8252 "reward_std": 0.018198613077402115,
8253 "rewards/accuracy_reward": 0.9026551246643066,
8254 "rewards/format_reward": 1.0,
8255 "step": 589
8256 },
8257 {
8258 "clip_ratio": 0.0,
8259 "completion_length": 105.50521087646484,
8260 "epoch": 6.781609195402299,
8261 "grad_norm": 1.720473724542019,
8262 "kl": 0.05029296875,
8263 "learning_rate": 3.218390804597701e-07,
8264 "loss": 0.0021,
8265 "reward": 1.896925687789917,
8266 "reward_std": 0.016040321439504623,
8267 "rewards/accuracy_reward": 0.8969255685806274,
8268 "rewards/format_reward": 1.0,
8269 "step": 590
8270 },
8271 {
8272 "clip_ratio": 0.0,
8273 "completion_length": 106.015625,
8274 "epoch": 6.793103448275862,
8275 "grad_norm": 1.8599065609753198,
8276 "kl": 0.046875,
8277 "learning_rate": 3.2068965517241373e-07,
8278 "loss": 0.002,
8279 "reward": 1.896074652671814,
8280 "reward_std": 0.021062329411506653,
8281 "rewards/accuracy_reward": 0.898027777671814,
8282 "rewards/format_reward": 0.998046875,
8283 "step": 591
8284 },
8285 {
8286 "clip_ratio": 0.0,
8287 "completion_length": 107.74870300292969,
8288 "epoch": 6.804597701149425,
8289 "grad_norm": 2.454880208710065,
8290 "kl": 0.04541015625,
8291 "learning_rate": 3.1954022988505747e-07,
8292 "loss": 0.0019,
8293 "reward": 1.9107999801635742,
8294 "reward_std": 0.015166133642196655,
8295 "rewards/accuracy_reward": 0.9108001589775085,
8296 "rewards/format_reward": 1.0,
8297 "step": 592
8298 },
8299 {
8300 "clip_ratio": 0.0,
8301 "completion_length": 108.056640625,
8302 "epoch": 6.816091954022989,
8303 "grad_norm": 1.7036855487787057,
8304 "kl": 0.045166015625,
8305 "learning_rate": 3.183908045977011e-07,
8306 "loss": 0.0019,
8307 "reward": 1.8942689895629883,
8308 "reward_std": 0.02249247394502163,
8309 "rewards/accuracy_reward": 0.8949202299118042,
8310 "rewards/format_reward": 0.9993489980697632,
8311 "step": 593
8312 },
8313 {
8314 "clip_ratio": 0.0,
8315 "completion_length": 107.830078125,
8316 "epoch": 6.827586206896552,
8317 "grad_norm": 53.62639568378942,
8318 "kl": 0.0458984375,
8319 "learning_rate": 3.1724137931034485e-07,
8320 "loss": 0.0019,
8321 "reward": 1.8942097425460815,
8322 "reward_std": 0.019950736314058304,
8323 "rewards/accuracy_reward": 0.8948608636856079,
8324 "rewards/format_reward": 0.9993489980697632,
8325 "step": 594
8326 },
8327 {
8328 "clip_ratio": 0.0,
8329 "completion_length": 109.10286712646484,
8330 "epoch": 6.8390804597701145,
8331 "grad_norm": 5.944879218709257,
8332 "kl": 0.040283203125,
8333 "learning_rate": 3.160919540229885e-07,
8334 "loss": 0.0017,
8335 "reward": 1.89532470703125,
8336 "reward_std": 0.016749953851103783,
8337 "rewards/accuracy_reward": 0.89532470703125,
8338 "rewards/format_reward": 1.0,
8339 "step": 595
8340 },
8341 {
8342 "clip_ratio": 0.0,
8343 "completion_length": 109.40560150146484,
8344 "epoch": 6.850574712643678,
8345 "grad_norm": 1.4735604817931964,
8346 "kl": 0.045654296875,
8347 "learning_rate": 3.149425287356321e-07,
8348 "loss": 0.002,
8349 "reward": 1.9018571376800537,
8350 "reward_std": 0.01927165314555168,
8351 "rewards/accuracy_reward": 0.9018572568893433,
8352 "rewards/format_reward": 1.0,
8353 "step": 596
8354 },
8355 {
8356 "clip_ratio": 0.0,
8357 "completion_length": 108.650390625,
8358 "epoch": 6.862068965517241,
8359 "grad_norm": 2.3771678349296113,
8360 "kl": 0.04638671875,
8361 "learning_rate": 3.1379310344827587e-07,
8362 "loss": 0.0019,
8363 "reward": 1.902691125869751,
8364 "reward_std": 0.019225675612688065,
8365 "rewards/accuracy_reward": 0.9033421277999878,
8366 "rewards/format_reward": 0.9993489980697632,
8367 "step": 597
8368 },
8369 {
8370 "clip_ratio": 0.0,
8371 "completion_length": 110.69661712646484,
8372 "epoch": 6.873563218390805,
8373 "grad_norm": 1.3517485796795934,
8374 "kl": 0.041015625,
8375 "learning_rate": 3.126436781609195e-07,
8376 "loss": 0.0017,
8377 "reward": 1.904984712600708,
8378 "reward_std": 0.01752365753054619,
8379 "rewards/accuracy_reward": 0.9049846529960632,
8380 "rewards/format_reward": 1.0,
8381 "step": 598
8382 },
8383 {
8384 "clip_ratio": 0.0,
8385 "completion_length": 108.94792175292969,
8386 "epoch": 6.885057471264368,
8387 "grad_norm": 1.7049030907812799,
8388 "kl": 0.046630859375,
8389 "learning_rate": 3.114942528735632e-07,
8390 "loss": 0.002,
8391 "reward": 1.8980730772018433,
8392 "reward_std": 0.016529429703950882,
8393 "rewards/accuracy_reward": 0.8980730772018433,
8394 "rewards/format_reward": 1.0,
8395 "step": 599
8396 },
8397 {
8398 "clip_ratio": 0.0,
8399 "completion_length": 110.34700775146484,
8400 "epoch": 6.896551724137931,
8401 "grad_norm": 1.8048662278319052,
8402 "kl": 0.04638671875,
8403 "learning_rate": 3.103448275862069e-07,
8404 "loss": 0.002,
8405 "reward": 1.9081945419311523,
8406 "reward_std": 0.01681654527783394,
8407 "rewards/accuracy_reward": 0.9081945419311523,
8408 "rewards/format_reward": 1.0,
8409 "step": 600
8410 },
8411 {
8412 "clip_ratio": 0.0,
8413 "completion_length": 110.6796875,
8414 "epoch": 6.908045977011494,
8415 "grad_norm": 1.9132885179118997,
8416 "kl": 0.04736328125,
8417 "learning_rate": 3.091954022988506e-07,
8418 "loss": 0.0019,
8419 "reward": 1.8925232887268066,
8420 "reward_std": 0.020483635365962982,
8421 "rewards/accuracy_reward": 0.8931743502616882,
8422 "rewards/format_reward": 0.9993489980697632,
8423 "step": 601
8424 },
8425 {
8426 "clip_ratio": 0.0,
8427 "completion_length": 110.28841400146484,
8428 "epoch": 6.919540229885057,
8429 "grad_norm": 2.831161118788208,
8430 "kl": 0.04541015625,
8431 "learning_rate": 3.0804597701149426e-07,
8432 "loss": 0.0019,
8433 "reward": 1.8872265815734863,
8434 "reward_std": 0.01937289349734783,
8435 "rewards/accuracy_reward": 0.8872265815734863,
8436 "rewards/format_reward": 1.0,
8437 "step": 602
8438 },
8439 {
8440 "clip_ratio": 0.0,
8441 "completion_length": 111.193359375,
8442 "epoch": 6.931034482758621,
8443 "grad_norm": 1.9008533326371801,
8444 "kl": 0.046875,
8445 "learning_rate": 3.068965517241379e-07,
8446 "loss": 0.002,
8447 "reward": 1.9161338806152344,
8448 "reward_std": 0.017956051975488663,
8449 "rewards/accuracy_reward": 0.9161338806152344,
8450 "rewards/format_reward": 1.0,
8451 "step": 603
8452 },
8453 {
8454 "clip_ratio": 0.0,
8455 "completion_length": 112.76432800292969,
8456 "epoch": 6.942528735632184,
8457 "grad_norm": 1.9049955094649353,
8458 "kl": 0.04541015625,
8459 "learning_rate": 3.057471264367816e-07,
8460 "loss": 0.0019,
8461 "reward": 1.8926061391830444,
8462 "reward_std": 0.019360072910785675,
8463 "rewards/accuracy_reward": 0.8932573199272156,
8464 "rewards/format_reward": 0.9993489980697632,
8465 "step": 604
8466 },
8467 {
8468 "clip_ratio": 0.0,
8469 "completion_length": 109.96745300292969,
8470 "epoch": 6.954022988505747,
8471 "grad_norm": 1.275994764733278,
8472 "kl": 0.0458984375,
8473 "learning_rate": 3.045977011494253e-07,
8474 "loss": 0.0019,
8475 "reward": 1.9014930725097656,
8476 "reward_std": 0.017896311357617378,
8477 "rewards/accuracy_reward": 0.9014931917190552,
8478 "rewards/format_reward": 1.0,
8479 "step": 605
8480 },
8481 {
8482 "clip_ratio": 0.0,
8483 "completion_length": 109.83528900146484,
8484 "epoch": 6.9655172413793105,
8485 "grad_norm": 1.6892176163979111,
8486 "kl": 0.044677734375,
8487 "learning_rate": 3.0344827586206897e-07,
8488 "loss": 0.0019,
8489 "reward": 1.8952045440673828,
8490 "reward_std": 0.01996638998389244,
8491 "rewards/accuracy_reward": 0.895204484462738,
8492 "rewards/format_reward": 1.0,
8493 "step": 606
8494 },
8495 {
8496 "clip_ratio": 0.0,
8497 "completion_length": 111.58659362792969,
8498 "epoch": 6.977011494252873,
8499 "grad_norm": 1.3212776547555032,
8500 "kl": 0.04541015625,
8501 "learning_rate": 3.022988505747126e-07,
8502 "loss": 0.0019,
8503 "reward": 1.9071601629257202,
8504 "reward_std": 0.020716873928904533,
8505 "rewards/accuracy_reward": 0.907811164855957,
8506 "rewards/format_reward": 0.9993489980697632,
8507 "step": 607
8508 },
8509 {
8510 "clip_ratio": 0.0,
8511 "completion_length": 110.09505462646484,
8512 "epoch": 6.988505747126437,
8513 "grad_norm": 1.3698382830038993,
8514 "kl": 0.046142578125,
8515 "learning_rate": 3.011494252873563e-07,
8516 "loss": 0.0019,
8517 "reward": 1.8822848796844482,
8518 "reward_std": 0.02124325931072235,
8519 "rewards/accuracy_reward": 0.8822848200798035,
8520 "rewards/format_reward": 1.0,
8521 "step": 608
8522 },
8523 {
8524 "clip_ratio": 0.0,
8525 "completion_length": 106.09129333496094,
8526 "epoch": 7.0,
8527 "grad_norm": 1.611157838283439,
8528 "kl": 0.04150390625,
8529 "learning_rate": 3e-07,
8530 "loss": 0.0017,
8531 "reward": 1.8909047842025757,
8532 "reward_std": 0.01773572526872158,
8533 "rewards/accuracy_reward": 0.8909049034118652,
8534 "rewards/format_reward": 1.0,
8535 "step": 609
8536 },
8537 {
8538 "clip_ratio": 0.0,
8539 "completion_length": 112.59440612792969,
8540 "epoch": 7.011494252873563,
8541 "grad_norm": 2.0128308488525946,
8542 "kl": 0.04345703125,
8543 "learning_rate": 2.988505747126437e-07,
8544 "loss": 0.0019,
8545 "reward": 1.8733458518981934,
8546 "reward_std": 0.021298928186297417,
8547 "rewards/accuracy_reward": 0.873996913433075,
8548 "rewards/format_reward": 0.9993489980697632,
8549 "step": 610
8550 },
8551 {
8552 "clip_ratio": 0.0,
8553 "completion_length": 113.76628112792969,
8554 "epoch": 7.022988505747127,
8555 "grad_norm": 1.7741703706980398,
8556 "kl": 0.04296875,
8557 "learning_rate": 2.9770114942528737e-07,
8558 "loss": 0.0018,
8559 "reward": 1.8772882223129272,
8560 "reward_std": 0.02053675800561905,
8561 "rewards/accuracy_reward": 0.8772882223129272,
8562 "rewards/format_reward": 1.0,
8563 "step": 611
8564 },
8565 {
8566 "clip_ratio": 0.0,
8567 "completion_length": 109.43229675292969,
8568 "epoch": 7.0344827586206895,
8569 "grad_norm": 2.61099721986832,
8570 "kl": 0.042724609375,
8571 "learning_rate": 2.96551724137931e-07,
8572 "loss": 0.0018,
8573 "reward": 1.8760509490966797,
8574 "reward_std": 0.023378584533929825,
8575 "rewards/accuracy_reward": 0.8767021894454956,
8576 "rewards/format_reward": 0.9993489980697632,
8577 "step": 612
8578 },
8579 {
8580 "clip_ratio": 0.0,
8581 "completion_length": 108.71745300292969,
8582 "epoch": 7.045977011494253,
8583 "grad_norm": 1.681876205953948,
8584 "kl": 0.043212890625,
8585 "learning_rate": 2.954022988505747e-07,
8586 "loss": 0.0018,
8587 "reward": 1.9030117988586426,
8588 "reward_std": 0.019194534048438072,
8589 "rewards/accuracy_reward": 0.9030117988586426,
8590 "rewards/format_reward": 1.0,
8591 "step": 613
8592 },
8593 {
8594 "clip_ratio": 0.0,
8595 "completion_length": 107.92253112792969,
8596 "epoch": 7.057471264367816,
8597 "grad_norm": 1.401507959053581,
8598 "kl": 0.042236328125,
8599 "learning_rate": 2.942528735632184e-07,
8600 "loss": 0.0018,
8601 "reward": 1.886817216873169,
8602 "reward_std": 0.0201483853161335,
8603 "rewards/accuracy_reward": 0.8868171572685242,
8604 "rewards/format_reward": 1.0,
8605 "step": 614
8606 },
8607 {
8608 "clip_ratio": 0.0,
8609 "completion_length": 111.21224212646484,
8610 "epoch": 7.068965517241379,
8611 "grad_norm": 2.3290159563411725,
8612 "kl": 0.046142578125,
8613 "learning_rate": 2.93103448275862e-07,
8614 "loss": 0.0019,
8615 "reward": 1.9030017852783203,
8616 "reward_std": 0.021339600905776024,
8617 "rewards/accuracy_reward": 0.9036529064178467,
8618 "rewards/format_reward": 0.9993489980697632,
8619 "step": 615
8620 },
8621 {
8622 "clip_ratio": 0.0,
8623 "completion_length": 107.49089050292969,
8624 "epoch": 7.080459770114943,
8625 "grad_norm": 2.406758547267441,
8626 "kl": 0.044921875,
8627 "learning_rate": 2.9195402298850576e-07,
8628 "loss": 0.0019,
8629 "reward": 1.8917362689971924,
8630 "reward_std": 0.02221861481666565,
8631 "rewards/accuracy_reward": 0.892387330532074,
8632 "rewards/format_reward": 0.9993489980697632,
8633 "step": 616
8634 },
8635 {
8636 "clip_ratio": 0.0,
8637 "completion_length": 107.44921875,
8638 "epoch": 7.091954022988506,
8639 "grad_norm": 2.4685783634645224,
8640 "kl": 0.047607421875,
8641 "learning_rate": 2.908045977011494e-07,
8642 "loss": 0.002,
8643 "reward": 1.8855441808700562,
8644 "reward_std": 0.022871162742376328,
8645 "rewards/accuracy_reward": 0.8855441808700562,
8646 "rewards/format_reward": 1.0,
8647 "step": 617
8648 },
8649 {
8650 "clip_ratio": 0.0,
8651 "completion_length": 107.88021087646484,
8652 "epoch": 7.103448275862069,
8653 "grad_norm": 6.626736993198597,
8654 "kl": 0.047119140625,
8655 "learning_rate": 2.896551724137931e-07,
8656 "loss": 0.002,
8657 "reward": 1.8934516906738281,
8658 "reward_std": 0.021955883130431175,
8659 "rewards/accuracy_reward": 0.8934516906738281,
8660 "rewards/format_reward": 1.0,
8661 "step": 618
8662 },
8663 {
8664 "clip_ratio": 0.0,
8665 "completion_length": 107.73698425292969,
8666 "epoch": 7.114942528735632,
8667 "grad_norm": 1.4755499555332456,
8668 "kl": 0.043212890625,
8669 "learning_rate": 2.885057471264368e-07,
8670 "loss": 0.0019,
8671 "reward": 1.8936246633529663,
8672 "reward_std": 0.020434757694602013,
8673 "rewards/accuracy_reward": 0.8936247229576111,
8674 "rewards/format_reward": 1.0,
8675 "step": 619
8676 },
8677 {
8678 "clip_ratio": 0.0,
8679 "completion_length": 107.470703125,
8680 "epoch": 7.126436781609195,
8681 "grad_norm": 2.394219832051532,
8682 "kl": 0.04541015625,
8683 "learning_rate": 2.873563218390804e-07,
8684 "loss": 0.0019,
8685 "reward": 1.9143403768539429,
8686 "reward_std": 0.020542293787002563,
8687 "rewards/accuracy_reward": 0.9149913787841797,
8688 "rewards/format_reward": 0.9993489980697632,
8689 "step": 620
8690 },
8691 {
8692 "clip_ratio": 0.0,
8693 "completion_length": 105.93815612792969,
8694 "epoch": 7.137931034482759,
8695 "grad_norm": 1.5127458134308953,
8696 "kl": 0.042724609375,
8697 "learning_rate": 2.8620689655172416e-07,
8698 "loss": 0.0018,
8699 "reward": 1.8820905685424805,
8700 "reward_std": 0.0194623414427042,
8701 "rewards/accuracy_reward": 0.8827415704727173,
8702 "rewards/format_reward": 0.9993489980697632,
8703 "step": 621
8704 },
8705 {
8706 "clip_ratio": 0.0,
8707 "completion_length": 109.171875,
8708 "epoch": 7.149425287356322,
8709 "grad_norm": 1.239872088979501,
8710 "kl": 0.04296875,
8711 "learning_rate": 2.850574712643678e-07,
8712 "loss": 0.0018,
8713 "reward": 1.8986458778381348,
8714 "reward_std": 0.020148858428001404,
8715 "rewards/accuracy_reward": 0.8986459970474243,
8716 "rewards/format_reward": 1.0,
8717 "step": 622
8718 },
8719 {
8720 "clip_ratio": 0.0,
8721 "completion_length": 107.97265625,
8722 "epoch": 7.160919540229885,
8723 "grad_norm": 2.458759960422581,
8724 "kl": 0.04248046875,
8725 "learning_rate": 2.8390804597701143e-07,
8726 "loss": 0.0018,
8727 "reward": 1.9095439910888672,
8728 "reward_std": 0.018151775002479553,
8729 "rewards/accuracy_reward": 0.9095439314842224,
8730 "rewards/format_reward": 1.0,
8731 "step": 623
8732 },
8733 {
8734 "clip_ratio": 0.0,
8735 "completion_length": 107.96484375,
8736 "epoch": 7.172413793103448,
8737 "grad_norm": 1.7316074752480677,
8738 "kl": 0.0439453125,
8739 "learning_rate": 2.827586206896552e-07,
8740 "loss": 0.0018,
8741 "reward": 1.916879653930664,
8742 "reward_std": 0.016119133681058884,
8743 "rewards/accuracy_reward": 0.9168797731399536,
8744 "rewards/format_reward": 1.0,
8745 "step": 624
8746 },
8747 {
8748 "clip_ratio": 0.0,
8749 "completion_length": 108.93685150146484,
8750 "epoch": 7.183908045977011,
8751 "grad_norm": 7.6966596331183545,
8752 "kl": 0.0439453125,
8753 "learning_rate": 2.816091954022988e-07,
8754 "loss": 0.0018,
8755 "reward": 1.904394507408142,
8756 "reward_std": 0.01645834557712078,
8757 "rewards/accuracy_reward": 0.9043946266174316,
8758 "rewards/format_reward": 1.0,
8759 "step": 625
8760 },
8761 {
8762 "clip_ratio": 0.0,
8763 "completion_length": 109.34310150146484,
8764 "epoch": 7.195402298850575,
8765 "grad_norm": 3.0027221313342034,
8766 "kl": 0.0419921875,
8767 "learning_rate": 2.8045977011494255e-07,
8768 "loss": 0.0017,
8769 "reward": 1.899554967880249,
8770 "reward_std": 0.022434815764427185,
8771 "rewards/accuracy_reward": 0.9002060294151306,
8772 "rewards/format_reward": 0.9993489980697632,
8773 "step": 626
8774 },
8775 {
8776 "clip_ratio": 0.0,
8777 "completion_length": 107.80989837646484,
8778 "epoch": 7.206896551724138,
8779 "grad_norm": 1.6600118557647507,
8780 "kl": 0.0400390625,
8781 "learning_rate": 2.793103448275862e-07,
8782 "loss": 0.0016,
8783 "reward": 1.8966201543807983,
8784 "reward_std": 0.019420992583036423,
8785 "rewards/accuracy_reward": 0.8972713351249695,
8786 "rewards/format_reward": 0.9993489980697632,
8787 "step": 627
8788 },
8789 {
8790 "clip_ratio": 0.0,
8791 "completion_length": 107.37174987792969,
8792 "epoch": 7.218390804597701,
8793 "grad_norm": 1.710315303436248,
8794 "kl": 0.046875,
8795 "learning_rate": 2.781609195402299e-07,
8796 "loss": 0.002,
8797 "reward": 1.9059438705444336,
8798 "reward_std": 0.01961967721581459,
8799 "rewards/accuracy_reward": 0.9059439897537231,
8800 "rewards/format_reward": 1.0,
8801 "step": 628
8802 },
8803 {
8804 "clip_ratio": 0.0,
8805 "completion_length": 106.951171875,
8806 "epoch": 7.2298850574712645,
8807 "grad_norm": 1.3756891644716789,
8808 "kl": 0.041748046875,
8809 "learning_rate": 2.7701149425287357e-07,
8810 "loss": 0.0017,
8811 "reward": 1.888430118560791,
8812 "reward_std": 0.0190572552382946,
8813 "rewards/accuracy_reward": 0.8890811800956726,
8814 "rewards/format_reward": 0.9993489980697632,
8815 "step": 629
8816 },
8817 {
8818 "clip_ratio": 0.0,
8819 "completion_length": 107.404296875,
8820 "epoch": 7.241379310344827,
8821 "grad_norm": 3970437544517841.0,
8822 "kl": 11751030521856.0,
8823 "learning_rate": 2.758620689655172e-07,
8824 "loss": 470864691200.0,
8825 "reward": 1.9054559469223022,
8826 "reward_std": 0.020271051675081253,
8827 "rewards/accuracy_reward": 0.9067580103874207,
8828 "rewards/format_reward": 0.9986979365348816,
8829 "step": 630
8830 },
8831 {
8832 "clip_ratio": 0.0,
8833 "completion_length": 107.57747650146484,
8834 "epoch": 7.252873563218391,
8835 "grad_norm": 1.8817887294402726,
8836 "kl": 0.04541015625,
8837 "learning_rate": 2.747126436781609e-07,
8838 "loss": 0.0019,
8839 "reward": 1.8884859085083008,
8840 "reward_std": 0.018849171698093414,
8841 "rewards/accuracy_reward": 0.8884860277175903,
8842 "rewards/format_reward": 1.0,
8843 "step": 631
8844 },
8845 {
8846 "clip_ratio": 0.0,
8847 "completion_length": 105.96875,
8848 "epoch": 7.264367816091954,
8849 "grad_norm": 1.7719505691506174,
8850 "kl": 0.046142578125,
8851 "learning_rate": 2.735632183908046e-07,
8852 "loss": 0.0019,
8853 "reward": 1.9029910564422607,
8854 "reward_std": 0.021298250183463097,
8855 "rewards/accuracy_reward": 0.9036421179771423,
8856 "rewards/format_reward": 0.9993489980697632,
8857 "step": 632
8858 },
8859 {
8860 "clip_ratio": 0.0,
8861 "completion_length": 104.53841400146484,
8862 "epoch": 7.275862068965517,
8863 "grad_norm": 2.449085093736059,
8864 "kl": 0.05078125,
8865 "learning_rate": 2.724137931034483e-07,
8866 "loss": 0.0021,
8867 "reward": 1.8815187215805054,
8868 "reward_std": 0.018999949097633362,
8869 "rewards/accuracy_reward": 0.8815188407897949,
8870 "rewards/format_reward": 1.0,
8871 "step": 633
8872 },
8873 {
8874 "clip_ratio": 0.0,
8875 "completion_length": 105.56640625,
8876 "epoch": 7.287356321839081,
8877 "grad_norm": 2.98961076193299,
8878 "kl": 0.044677734375,
8879 "learning_rate": 2.7126436781609197e-07,
8880 "loss": 0.0019,
8881 "reward": 1.9084595441818237,
8882 "reward_std": 0.0186910443007946,
8883 "rewards/accuracy_reward": 0.9084595441818237,
8884 "rewards/format_reward": 1.0,
8885 "step": 634
8886 },
8887 {
8888 "clip_ratio": 0.0,
8889 "completion_length": 105.57357025146484,
8890 "epoch": 7.2988505747126435,
8891 "grad_norm": 2.134961142900107,
8892 "kl": 0.04736328125,
8893 "learning_rate": 2.701149425287356e-07,
8894 "loss": 0.0019,
8895 "reward": 1.9063668251037598,
8896 "reward_std": 0.016174225136637688,
8897 "rewards/accuracy_reward": 0.906366765499115,
8898 "rewards/format_reward": 1.0,
8899 "step": 635
8900 },
8901 {
8902 "clip_ratio": 0.0,
8903 "completion_length": 105.66276550292969,
8904 "epoch": 7.310344827586207,
8905 "grad_norm": 1.4763178317390981,
8906 "kl": 0.04052734375,
8907 "learning_rate": 2.689655172413793e-07,
8908 "loss": 0.0017,
8909 "reward": 1.898642897605896,
8910 "reward_std": 0.017671559005975723,
8911 "rewards/accuracy_reward": 0.8986430168151855,
8912 "rewards/format_reward": 1.0,
8913 "step": 636
8914 },
8915 {
8916 "clip_ratio": 0.0,
8917 "completion_length": 105.041015625,
8918 "epoch": 7.32183908045977,
8919 "grad_norm": 1.6822870021161793,
8920 "kl": 0.048583984375,
8921 "learning_rate": 2.67816091954023e-07,
8922 "loss": 0.002,
8923 "reward": 1.8975701332092285,
8924 "reward_std": 0.018257655203342438,
8925 "rewards/accuracy_reward": 0.8982211947441101,
8926 "rewards/format_reward": 0.9993489980697632,
8927 "step": 637
8928 },
8929 {
8930 "clip_ratio": 0.0,
8931 "completion_length": 105.96549987792969,
8932 "epoch": 7.333333333333333,
8933 "grad_norm": 1.4931878670654746,
8934 "kl": 0.044189453125,
8935 "learning_rate": 2.6666666666666667e-07,
8936 "loss": 0.0018,
8937 "reward": 1.9090592861175537,
8938 "reward_std": 0.018635626882314682,
8939 "rewards/accuracy_reward": 0.9090592265129089,
8940 "rewards/format_reward": 1.0,
8941 "step": 638
8942 },
8943 {
8944 "clip_ratio": 0.0,
8945 "completion_length": 105.53450775146484,
8946 "epoch": 7.344827586206897,
8947 "grad_norm": 1.96698633932992,
8948 "kl": 0.044677734375,
8949 "learning_rate": 2.655172413793103e-07,
8950 "loss": 0.0019,
8951 "reward": 1.8955191373825073,
8952 "reward_std": 0.01896989531815052,
8953 "rewards/accuracy_reward": 0.8961701393127441,
8954 "rewards/format_reward": 0.9993489980697632,
8955 "step": 639
8956 },
8957 {
8958 "clip_ratio": 0.0,
8959 "completion_length": 105.40495300292969,
8960 "epoch": 7.35632183908046,
8961 "grad_norm": 2.0570123873234203,
8962 "kl": 0.0419921875,
8963 "learning_rate": 2.64367816091954e-07,
8964 "loss": 0.0018,
8965 "reward": 1.9056072235107422,
8966 "reward_std": 0.01762087456882,
8967 "rewards/accuracy_reward": 0.9062582850456238,
8968 "rewards/format_reward": 0.9993489980697632,
8969 "step": 640
8970 },
8971 {
8972 "clip_ratio": 0.0,
8973 "completion_length": 107.19857025146484,
8974 "epoch": 7.3678160919540225,
8975 "grad_norm": 1.6420815311017691,
8976 "kl": 0.043212890625,
8977 "learning_rate": 2.632183908045977e-07,
8978 "loss": 0.0018,
8979 "reward": 1.8763535022735596,
8980 "reward_std": 0.02036045491695404,
8981 "rewards/accuracy_reward": 0.8763534426689148,
8982 "rewards/format_reward": 1.0,
8983 "step": 641
8984 },
8985 {
8986 "clip_ratio": 0.0,
8987 "completion_length": 106.95247650146484,
8988 "epoch": 7.379310344827586,
8989 "grad_norm": 1.68174331233095,
8990 "kl": 0.04638671875,
8991 "learning_rate": 2.620689655172414e-07,
8992 "loss": 0.0019,
8993 "reward": 1.9021470546722412,
8994 "reward_std": 0.02229691669344902,
8995 "rewards/accuracy_reward": 0.9021470546722412,
8996 "rewards/format_reward": 1.0,
8997 "step": 642
8998 },
8999 {
9000 "clip_ratio": 0.0,
9001 "completion_length": 105.66732025146484,
9002 "epoch": 7.390804597701149,
9003 "grad_norm": 1.6367021800993964,
9004 "kl": 0.04296875,
9005 "learning_rate": 2.6091954022988507e-07,
9006 "loss": 0.0018,
9007 "reward": 1.904337763786316,
9008 "reward_std": 0.021419089287519455,
9009 "rewards/accuracy_reward": 0.9056398868560791,
9010 "rewards/format_reward": 0.9986979365348816,
9011 "step": 643
9012 },
9013 {
9014 "clip_ratio": 0.0,
9015 "completion_length": 105.353515625,
9016 "epoch": 7.402298850574713,
9017 "grad_norm": 1.7586162669562002,
9018 "kl": 0.047607421875,
9019 "learning_rate": 2.597701149425287e-07,
9020 "loss": 0.002,
9021 "reward": 1.8953742980957031,
9022 "reward_std": 0.01957518607378006,
9023 "rewards/accuracy_reward": 0.8960254192352295,
9024 "rewards/format_reward": 0.9993489980697632,
9025 "step": 644
9026 },
9027 {
9028 "clip_ratio": 0.0,
9029 "completion_length": 105.19010925292969,
9030 "epoch": 7.413793103448276,
9031 "grad_norm": 2.5073706132424447,
9032 "kl": 0.044921875,
9033 "learning_rate": 2.586206896551724e-07,
9034 "loss": 0.0018,
9035 "reward": 1.8890656232833862,
9036 "reward_std": 0.02112973853945732,
9037 "rewards/accuracy_reward": 0.8903676867485046,
9038 "rewards/format_reward": 0.9986979365348816,
9039 "step": 645
9040 },
9041 {
9042 "clip_ratio": 0.0,
9043 "completion_length": 106.71484375,
9044 "epoch": 7.425287356321839,
9045 "grad_norm": 2.802792541775325,
9046 "kl": 0.045166015625,
9047 "learning_rate": 2.574712643678161e-07,
9048 "loss": 0.0019,
9049 "reward": 1.8906625509262085,
9050 "reward_std": 0.021000966429710388,
9051 "rewards/accuracy_reward": 0.8906625509262085,
9052 "rewards/format_reward": 1.0,
9053 "step": 646
9054 },
9055 {
9056 "clip_ratio": 0.0,
9057 "completion_length": 105.3203125,
9058 "epoch": 7.436781609195402,
9059 "grad_norm": 1.531808532330182,
9060 "kl": 0.04296875,
9061 "learning_rate": 2.563218390804597e-07,
9062 "loss": 0.0018,
9063 "reward": 1.902920126914978,
9064 "reward_std": 0.018811281770467758,
9065 "rewards/accuracy_reward": 0.902920126914978,
9066 "rewards/format_reward": 1.0,
9067 "step": 647
9068 },
9069 {
9070 "clip_ratio": 0.0,
9071 "completion_length": 107.66796875,
9072 "epoch": 7.448275862068965,
9073 "grad_norm": 2.9559025913941355,
9074 "kl": 0.0439453125,
9075 "learning_rate": 2.5517241379310346e-07,
9076 "loss": 0.0018,
9077 "reward": 1.8963336944580078,
9078 "reward_std": 0.020042497664690018,
9079 "rewards/accuracy_reward": 0.8969849348068237,
9080 "rewards/format_reward": 0.9993489980697632,
9081 "step": 648
9082 },
9083 {
9084 "clip_ratio": 0.0,
9085 "completion_length": 107.14909362792969,
9086 "epoch": 7.459770114942529,
9087 "grad_norm": 1.9565147315824727,
9088 "kl": 0.04736328125,
9089 "learning_rate": 2.540229885057471e-07,
9090 "loss": 0.0019,
9091 "reward": 1.9087027311325073,
9092 "reward_std": 0.018100788816809654,
9093 "rewards/accuracy_reward": 0.9087027311325073,
9094 "rewards/format_reward": 1.0,
9095 "step": 649
9096 },
9097 {
9098 "clip_ratio": 0.0,
9099 "completion_length": 104.951171875,
9100 "epoch": 7.471264367816092,
9101 "grad_norm": 3.147083124745903,
9102 "kl": 0.04443359375,
9103 "learning_rate": 2.5287356321839084e-07,
9104 "loss": 0.0019,
9105 "reward": 1.9156570434570312,
9106 "reward_std": 0.01904350332915783,
9107 "rewards/accuracy_reward": 0.9156570434570312,
9108 "rewards/format_reward": 1.0,
9109 "step": 650
9110 },
9111 {
9112 "clip_ratio": 0.0,
9113 "completion_length": 106.552734375,
9114 "epoch": 7.482758620689655,
9115 "grad_norm": 1.6343407556749157,
9116 "kl": 0.051513671875,
9117 "learning_rate": 2.517241379310345e-07,
9118 "loss": 0.0022,
9119 "reward": 1.9167011976242065,
9120 "reward_std": 0.01607966236770153,
9121 "rewards/accuracy_reward": 0.9167011976242065,
9122 "rewards/format_reward": 1.0,
9123 "step": 651
9124 },
9125 {
9126 "clip_ratio": 0.0,
9127 "completion_length": 106.46940612792969,
9128 "epoch": 7.494252873563219,
9129 "grad_norm": 2.380370478966516,
9130 "kl": 0.04541015625,
9131 "learning_rate": 2.505747126436781e-07,
9132 "loss": 0.0019,
9133 "reward": 1.8996713161468506,
9134 "reward_std": 0.018466008827090263,
9135 "rewards/accuracy_reward": 0.9003223776817322,
9136 "rewards/format_reward": 0.9993489980697632,
9137 "step": 652
9138 },
9139 {
9140 "clip_ratio": 0.0,
9141 "completion_length": 104.76692962646484,
9142 "epoch": 7.505747126436781,
9143 "grad_norm": 1.572354097669339,
9144 "kl": 0.049560546875,
9145 "learning_rate": 2.494252873563218e-07,
9146 "loss": 0.0021,
9147 "reward": 1.8812230825424194,
9148 "reward_std": 0.020288746803998947,
9149 "rewards/accuracy_reward": 0.8818740844726562,
9150 "rewards/format_reward": 0.9993489980697632,
9151 "step": 653
9152 },
9153 {
9154 "clip_ratio": 0.0,
9155 "completion_length": 105.83984375,
9156 "epoch": 7.517241379310345,
9157 "grad_norm": 3.4037020500640507,
9158 "kl": 0.057373046875,
9159 "learning_rate": 2.482758620689655e-07,
9160 "loss": 0.0024,
9161 "reward": 1.9109630584716797,
9162 "reward_std": 0.01749694161117077,
9163 "rewards/accuracy_reward": 0.9109630584716797,
9164 "rewards/format_reward": 1.0,
9165 "step": 654
9166 },
9167 {
9168 "clip_ratio": 0.0,
9169 "completion_length": 106.07292175292969,
9170 "epoch": 7.528735632183908,
9171 "grad_norm": 1.4740870320089872,
9172 "kl": 0.049072265625,
9173 "learning_rate": 2.471264367816092e-07,
9174 "loss": 0.002,
9175 "reward": 1.9072259664535522,
9176 "reward_std": 0.016256902366876602,
9177 "rewards/accuracy_reward": 0.907226026058197,
9178 "rewards/format_reward": 1.0,
9179 "step": 655
9180 },
9181 {
9182 "clip_ratio": 0.0,
9183 "completion_length": 105.85612487792969,
9184 "epoch": 7.540229885057471,
9185 "grad_norm": 2.126019785623936,
9186 "kl": 0.04931640625,
9187 "learning_rate": 2.459770114942529e-07,
9188 "loss": 0.002,
9189 "reward": 1.8846591711044312,
9190 "reward_std": 0.018113628029823303,
9191 "rewards/accuracy_reward": 0.8846592903137207,
9192 "rewards/format_reward": 1.0,
9193 "step": 656
9194 },
9195 {
9196 "clip_ratio": 0.0,
9197 "completion_length": 107.55794525146484,
9198 "epoch": 7.551724137931035,
9199 "grad_norm": 7.89243748687455,
9200 "kl": 0.047607421875,
9201 "learning_rate": 2.448275862068965e-07,
9202 "loss": 0.002,
9203 "reward": 1.8965566158294678,
9204 "reward_std": 0.019315050914883614,
9205 "rewards/accuracy_reward": 0.896556556224823,
9206 "rewards/format_reward": 1.0,
9207 "step": 657
9208 },
9209 {
9210 "clip_ratio": 0.0,
9211 "completion_length": 105.54817962646484,
9212 "epoch": 7.563218390804598,
9213 "grad_norm": 3.4267861157909403,
9214 "kl": 0.046142578125,
9215 "learning_rate": 2.436781609195402e-07,
9216 "loss": 0.0019,
9217 "reward": 1.8947169780731201,
9218 "reward_std": 0.019318781793117523,
9219 "rewards/accuracy_reward": 0.8947169184684753,
9220 "rewards/format_reward": 1.0,
9221 "step": 658
9222 },
9223 {
9224 "clip_ratio": 0.0,
9225 "completion_length": 106.568359375,
9226 "epoch": 7.574712643678161,
9227 "grad_norm": 1.6852921029556645,
9228 "kl": 0.052490234375,
9229 "learning_rate": 2.425287356321839e-07,
9230 "loss": 0.0022,
9231 "reward": 1.9210675954818726,
9232 "reward_std": 0.015680838376283646,
9233 "rewards/accuracy_reward": 0.9210675954818726,
9234 "rewards/format_reward": 1.0,
9235 "step": 659
9236 },
9237 {
9238 "clip_ratio": 0.0,
9239 "completion_length": 109.3046875,
9240 "epoch": 7.586206896551724,
9241 "grad_norm": 2.206767755219908,
9242 "kl": 0.04541015625,
9243 "learning_rate": 2.413793103448276e-07,
9244 "loss": 0.0019,
9245 "reward": 1.8979721069335938,
9246 "reward_std": 0.016466915607452393,
9247 "rewards/accuracy_reward": 0.8979721069335938,
9248 "rewards/format_reward": 1.0,
9249 "step": 660
9250 },
9251 {
9252 "clip_ratio": 0.0,
9253 "completion_length": 109.42643737792969,
9254 "epoch": 7.597701149425287,
9255 "grad_norm": 2.116528759364446,
9256 "kl": 0.043701171875,
9257 "learning_rate": 2.402298850574712e-07,
9258 "loss": 0.0018,
9259 "reward": 1.9062350988388062,
9260 "reward_std": 0.015808025375008583,
9261 "rewards/accuracy_reward": 0.9062352180480957,
9262 "rewards/format_reward": 1.0,
9263 "step": 661
9264 },
9265 {
9266 "clip_ratio": 0.0,
9267 "completion_length": 108.13021087646484,
9268 "epoch": 7.609195402298851,
9269 "grad_norm": 2.3658556212140773,
9270 "kl": 0.04736328125,
9271 "learning_rate": 2.390804597701149e-07,
9272 "loss": 0.002,
9273 "reward": 1.9090293645858765,
9274 "reward_std": 0.020903117954730988,
9275 "rewards/accuracy_reward": 0.9096805453300476,
9276 "rewards/format_reward": 0.9993489980697632,
9277 "step": 662
9278 },
9279 {
9280 "clip_ratio": 0.0,
9281 "completion_length": 108.53060150146484,
9282 "epoch": 7.620689655172414,
9283 "grad_norm": 1.736164910774429,
9284 "kl": 0.04443359375,
9285 "learning_rate": 2.3793103448275863e-07,
9286 "loss": 0.0018,
9287 "reward": 1.892618179321289,
9288 "reward_std": 0.020376306027173996,
9289 "rewards/accuracy_reward": 0.8932693004608154,
9290 "rewards/format_reward": 0.9993489980697632,
9291 "step": 663
9292 },
9293 {
9294 "clip_ratio": 0.0,
9295 "completion_length": 109.14192962646484,
9296 "epoch": 7.6321839080459775,
9297 "grad_norm": 1.661775706838517,
9298 "kl": 0.04345703125,
9299 "learning_rate": 2.367816091954023e-07,
9300 "loss": 0.0018,
9301 "reward": 1.910132884979248,
9302 "reward_std": 0.01860654354095459,
9303 "rewards/accuracy_reward": 0.9107840061187744,
9304 "rewards/format_reward": 0.9993489980697632,
9305 "step": 664
9306 },
9307 {
9308 "clip_ratio": 0.0,
9309 "completion_length": 109.82878112792969,
9310 "epoch": 7.64367816091954,
9311 "grad_norm": 1.775794567645862,
9312 "kl": 0.047607421875,
9313 "learning_rate": 2.3563218390804595e-07,
9314 "loss": 0.002,
9315 "reward": 1.8953571319580078,
9316 "reward_std": 0.019160928204655647,
9317 "rewards/accuracy_reward": 0.8953571319580078,
9318 "rewards/format_reward": 1.0,
9319 "step": 665
9320 },
9321 {
9322 "clip_ratio": 0.0,
9323 "completion_length": 109.8515625,
9324 "epoch": 7.655172413793103,
9325 "grad_norm": 2.852042326779303,
9326 "kl": 0.05224609375,
9327 "learning_rate": 2.3448275862068964e-07,
9328 "loss": 0.0021,
9329 "reward": 1.9160139560699463,
9330 "reward_std": 0.01864560693502426,
9331 "rewards/accuracy_reward": 0.9160139560699463,
9332 "rewards/format_reward": 1.0,
9333 "step": 666
9334 },
9335 {
9336 "clip_ratio": 0.0,
9337 "completion_length": 110.22135925292969,
9338 "epoch": 7.666666666666667,
9339 "grad_norm": 2.00298655553839,
9340 "kl": 0.046142578125,
9341 "learning_rate": 2.3333333333333333e-07,
9342 "loss": 0.0019,
9343 "reward": 1.8910598754882812,
9344 "reward_std": 0.020590651780366898,
9345 "rewards/accuracy_reward": 0.8910599946975708,
9346 "rewards/format_reward": 1.0,
9347 "step": 667
9348 },
9349 {
9350 "clip_ratio": 0.0,
9351 "completion_length": 108.58268737792969,
9352 "epoch": 7.67816091954023,
9353 "grad_norm": 2.2015919771538255,
9354 "kl": 0.044189453125,
9355 "learning_rate": 2.3218390804597702e-07,
9356 "loss": 0.0018,
9357 "reward": 1.897012710571289,
9358 "reward_std": 0.02280309796333313,
9359 "rewards/accuracy_reward": 0.8983148336410522,
9360 "rewards/format_reward": 0.9986979365348816,
9361 "step": 668
9362 },
9363 {
9364 "clip_ratio": 0.0,
9365 "completion_length": 109.41471862792969,
9366 "epoch": 7.689655172413794,
9367 "grad_norm": 4.620691107539993,
9368 "kl": 0.048828125,
9369 "learning_rate": 2.3103448275862066e-07,
9370 "loss": 0.0021,
9371 "reward": 1.9026827812194824,
9372 "reward_std": 0.018936630338430405,
9373 "rewards/accuracy_reward": 0.903333842754364,
9374 "rewards/format_reward": 0.9993489980697632,
9375 "step": 669
9376 },
9377 {
9378 "clip_ratio": 0.0,
9379 "completion_length": 109.208984375,
9380 "epoch": 7.7011494252873565,
9381 "grad_norm": 1.5207850843287138,
9382 "kl": 0.052001953125,
9383 "learning_rate": 2.2988505747126435e-07,
9384 "loss": 0.0021,
9385 "reward": 1.8975833654403687,
9386 "reward_std": 0.01803259551525116,
9387 "rewards/accuracy_reward": 0.8975834846496582,
9388 "rewards/format_reward": 1.0,
9389 "step": 670
9390 },
9391 {
9392 "clip_ratio": 0.0,
9393 "completion_length": 110.48698425292969,
9394 "epoch": 7.712643678160919,
9395 "grad_norm": 1.1439576464320695,
9396 "kl": 0.04248046875,
9397 "learning_rate": 2.2873563218390804e-07,
9398 "loss": 0.0018,
9399 "reward": 1.8865493535995483,
9400 "reward_std": 0.018641415983438492,
9401 "rewards/accuracy_reward": 0.8865493535995483,
9402 "rewards/format_reward": 1.0,
9403 "step": 671
9404 },
9405 {
9406 "clip_ratio": 0.0,
9407 "completion_length": 109.46745300292969,
9408 "epoch": 7.724137931034483,
9409 "grad_norm": 1.5940069065520925,
9410 "kl": 0.0419921875,
9411 "learning_rate": 2.2758620689655173e-07,
9412 "loss": 0.0017,
9413 "reward": 1.874269723892212,
9414 "reward_std": 0.02033749222755432,
9415 "rewards/accuracy_reward": 0.8742696642875671,
9416 "rewards/format_reward": 1.0,
9417 "step": 672
9418 },
9419 {
9420 "clip_ratio": 0.0,
9421 "completion_length": 108.087890625,
9422 "epoch": 7.735632183908046,
9423 "grad_norm": 2.1488145925757234,
9424 "kl": 0.048828125,
9425 "learning_rate": 2.264367816091954e-07,
9426 "loss": 0.002,
9427 "reward": 1.9152870178222656,
9428 "reward_std": 0.015497724525630474,
9429 "rewards/accuracy_reward": 0.9152869582176208,
9430 "rewards/format_reward": 1.0,
9431 "step": 673
9432 },
9433 {
9434 "clip_ratio": 0.0,
9435 "completion_length": 108.20052337646484,
9436 "epoch": 7.747126436781609,
9437 "grad_norm": 1.5334204725557474,
9438 "kl": 0.045654296875,
9439 "learning_rate": 2.2528735632183905e-07,
9440 "loss": 0.0019,
9441 "reward": 1.9050960540771484,
9442 "reward_std": 0.021023746579885483,
9443 "rewards/accuracy_reward": 0.9057471752166748,
9444 "rewards/format_reward": 0.9993489980697632,
9445 "step": 674
9446 },
9447 {
9448 "clip_ratio": 0.0,
9449 "completion_length": 109.88671875,
9450 "epoch": 7.758620689655173,
9451 "grad_norm": 1.6306945359970868,
9452 "kl": 0.03955078125,
9453 "learning_rate": 2.2413793103448274e-07,
9454 "loss": 0.0017,
9455 "reward": 1.908623456954956,
9456 "reward_std": 0.017657993361353874,
9457 "rewards/accuracy_reward": 0.9086233973503113,
9458 "rewards/format_reward": 1.0,
9459 "step": 675
9460 },
9461 {
9462 "clip_ratio": 0.0,
9463 "completion_length": 108.23567962646484,
9464 "epoch": 7.7701149425287355,
9465 "grad_norm": 1.3620347321088389,
9466 "kl": 0.0400390625,
9467 "learning_rate": 2.2298850574712643e-07,
9468 "loss": 0.0017,
9469 "reward": 1.8560750484466553,
9470 "reward_std": 0.021627038717269897,
9471 "rewards/accuracy_reward": 0.8573770523071289,
9472 "rewards/format_reward": 0.9986979365348816,
9473 "step": 676
9474 },
9475 {
9476 "clip_ratio": 0.0,
9477 "completion_length": 111.77278900146484,
9478 "epoch": 7.781609195402299,
9479 "grad_norm": 2.4445187153033547,
9480 "kl": 0.041015625,
9481 "learning_rate": 2.218390804597701e-07,
9482 "loss": 0.0017,
9483 "reward": 1.9019120931625366,
9484 "reward_std": 0.02054600417613983,
9485 "rewards/accuracy_reward": 0.9019122123718262,
9486 "rewards/format_reward": 1.0,
9487 "step": 677
9488 },
9489 {
9490 "clip_ratio": 0.0,
9491 "completion_length": 107.171875,
9492 "epoch": 7.793103448275862,
9493 "grad_norm": 2.2620273599837537,
9494 "kl": 0.03759765625,
9495 "learning_rate": 2.206896551724138e-07,
9496 "loss": 0.0016,
9497 "reward": 1.899298071861267,
9498 "reward_std": 0.019102804362773895,
9499 "rewards/accuracy_reward": 0.8992981314659119,
9500 "rewards/format_reward": 1.0,
9501 "step": 678
9502 },
9503 {
9504 "clip_ratio": 0.0,
9505 "completion_length": 110.884765625,
9506 "epoch": 7.804597701149425,
9507 "grad_norm": 2.7848035680621446,
9508 "kl": 0.038330078125,
9509 "learning_rate": 2.1954022988505748e-07,
9510 "loss": 0.0016,
9511 "reward": 1.8842277526855469,
9512 "reward_std": 0.015971308574080467,
9513 "rewards/accuracy_reward": 0.8842277526855469,
9514 "rewards/format_reward": 1.0,
9515 "step": 679
9516 },
9517 {
9518 "clip_ratio": 0.0,
9519 "completion_length": 109.04232025146484,
9520 "epoch": 7.816091954022989,
9521 "grad_norm": 1.526409335432903,
9522 "kl": 0.04736328125,
9523 "learning_rate": 2.1839080459770114e-07,
9524 "loss": 0.002,
9525 "reward": 1.9061853885650635,
9526 "reward_std": 0.018970629200339317,
9527 "rewards/accuracy_reward": 0.9061853289604187,
9528 "rewards/format_reward": 1.0,
9529 "step": 680
9530 },
9531 {
9532 "clip_ratio": 0.0,
9533 "completion_length": 108.986328125,
9534 "epoch": 7.827586206896552,
9535 "grad_norm": 2.4205054636705627,
9536 "kl": 0.04345703125,
9537 "learning_rate": 2.172413793103448e-07,
9538 "loss": 0.0019,
9539 "reward": 1.8783591985702515,
9540 "reward_std": 0.020205635577440262,
9541 "rewards/accuracy_reward": 0.8783591985702515,
9542 "rewards/format_reward": 1.0,
9543 "step": 681
9544 },
9545 {
9546 "clip_ratio": 0.0,
9547 "completion_length": 110.24544525146484,
9548 "epoch": 7.8390804597701145,
9549 "grad_norm": 1.6438250497849267,
9550 "kl": 0.0380859375,
9551 "learning_rate": 2.160919540229885e-07,
9552 "loss": 0.0016,
9553 "reward": 1.9045305252075195,
9554 "reward_std": 0.020377201959490776,
9555 "rewards/accuracy_reward": 0.9051817655563354,
9556 "rewards/format_reward": 0.9993489980697632,
9557 "step": 682
9558 },
9559 {
9560 "clip_ratio": 0.0,
9561 "completion_length": 107.76171875,
9562 "epoch": 7.850574712643678,
9563 "grad_norm": 1.2996080329358168,
9564 "kl": 0.041748046875,
9565 "learning_rate": 2.1494252873563218e-07,
9566 "loss": 0.0017,
9567 "reward": 1.907975435256958,
9568 "reward_std": 0.017765961587429047,
9569 "rewards/accuracy_reward": 0.9079753756523132,
9570 "rewards/format_reward": 1.0,
9571 "step": 683
9572 },
9573 {
9574 "clip_ratio": 0.0,
9575 "completion_length": 109.03190612792969,
9576 "epoch": 7.862068965517241,
9577 "grad_norm": 5.191492192434162,
9578 "kl": 0.0400390625,
9579 "learning_rate": 2.1379310344827587e-07,
9580 "loss": 0.0017,
9581 "reward": 1.9033807516098022,
9582 "reward_std": 0.019389839842915535,
9583 "rewards/accuracy_reward": 0.9033806324005127,
9584 "rewards/format_reward": 1.0,
9585 "step": 684
9586 },
9587 {
9588 "clip_ratio": 0.0,
9589 "completion_length": 108.60026550292969,
9590 "epoch": 7.873563218390805,
9591 "grad_norm": 2.4150479297071725,
9592 "kl": 0.04248046875,
9593 "learning_rate": 2.126436781609195e-07,
9594 "loss": 0.0018,
9595 "reward": 1.9296969175338745,
9596 "reward_std": 0.01699325256049633,
9597 "rewards/accuracy_reward": 0.9296969175338745,
9598 "rewards/format_reward": 1.0,
9599 "step": 685
9600 },
9601 {
9602 "clip_ratio": 0.0,
9603 "completion_length": 107.80078125,
9604 "epoch": 7.885057471264368,
9605 "grad_norm": 1.3996427566274907,
9606 "kl": 0.041015625,
9607 "learning_rate": 2.114942528735632e-07,
9608 "loss": 0.0017,
9609 "reward": 1.9054107666015625,
9610 "reward_std": 0.01808765158057213,
9611 "rewards/accuracy_reward": 0.905410885810852,
9612 "rewards/format_reward": 1.0,
9613 "step": 686
9614 },
9615 {
9616 "clip_ratio": 0.0,
9617 "completion_length": 108.173828125,
9618 "epoch": 7.896551724137931,
9619 "grad_norm": 1.4978085636014402,
9620 "kl": 0.041259765625,
9621 "learning_rate": 2.103448275862069e-07,
9622 "loss": 0.0017,
9623 "reward": 1.9016507863998413,
9624 "reward_std": 0.018792547285556793,
9625 "rewards/accuracy_reward": 0.9016507863998413,
9626 "rewards/format_reward": 1.0,
9627 "step": 687
9628 },
9629 {
9630 "clip_ratio": 0.0,
9631 "completion_length": 109.52995300292969,
9632 "epoch": 7.908045977011494,
9633 "grad_norm": 3.073796889740199,
9634 "kl": 0.03857421875,
9635 "learning_rate": 2.0919540229885058e-07,
9636 "loss": 0.0016,
9637 "reward": 1.913766860961914,
9638 "reward_std": 0.01984010636806488,
9639 "rewards/accuracy_reward": 0.9144179224967957,
9640 "rewards/format_reward": 0.9993489980697632,
9641 "step": 688
9642 },
9643 {
9644 "clip_ratio": 0.0,
9645 "completion_length": 106.84049987792969,
9646 "epoch": 7.919540229885057,
9647 "grad_norm": 1.7046597310271008,
9648 "kl": 0.039306640625,
9649 "learning_rate": 2.0804597701149424e-07,
9650 "loss": 0.0017,
9651 "reward": 1.8879507780075073,
9652 "reward_std": 0.02212076261639595,
9653 "rewards/accuracy_reward": 0.8892530202865601,
9654 "rewards/format_reward": 0.9986979365348816,
9655 "step": 689
9656 },
9657 {
9658 "clip_ratio": 0.0,
9659 "completion_length": 108.87435150146484,
9660 "epoch": 7.931034482758621,
9661 "grad_norm": 2.469859985195517,
9662 "kl": 0.044921875,
9663 "learning_rate": 2.0689655172413793e-07,
9664 "loss": 0.0019,
9665 "reward": 1.9116500616073608,
9666 "reward_std": 0.021436279639601707,
9667 "rewards/accuracy_reward": 0.9129521250724792,
9668 "rewards/format_reward": 0.9986979365348816,
9669 "step": 690
9670 },
9671 {
9672 "clip_ratio": 0.0,
9673 "completion_length": 109.86979675292969,
9674 "epoch": 7.942528735632184,
9675 "grad_norm": 1.7410487864129292,
9676 "kl": 0.043212890625,
9677 "learning_rate": 2.057471264367816e-07,
9678 "loss": 0.0018,
9679 "reward": 1.9140040874481201,
9680 "reward_std": 0.018954966217279434,
9681 "rewards/accuracy_reward": 0.9140040278434753,
9682 "rewards/format_reward": 1.0,
9683 "step": 691
9684 },
9685 {
9686 "clip_ratio": 0.0,
9687 "completion_length": 105.541015625,
9688 "epoch": 7.954022988505747,
9689 "grad_norm": 1.7493728678791296,
9690 "kl": 0.04443359375,
9691 "learning_rate": 2.0459770114942528e-07,
9692 "loss": 0.0019,
9693 "reward": 1.9026793241500854,
9694 "reward_std": 0.015167943201959133,
9695 "rewards/accuracy_reward": 0.9026793241500854,
9696 "rewards/format_reward": 1.0,
9697 "step": 692
9698 },
9699 {
9700 "clip_ratio": 0.0,
9701 "completion_length": 105.72982025146484,
9702 "epoch": 7.9655172413793105,
9703 "grad_norm": 1.7859158502563095,
9704 "kl": 0.043701171875,
9705 "learning_rate": 2.0344827586206895e-07,
9706 "loss": 0.0018,
9707 "reward": 1.8844761848449707,
9708 "reward_std": 0.021765243262052536,
9709 "rewards/accuracy_reward": 0.8844761848449707,
9710 "rewards/format_reward": 1.0,
9711 "step": 693
9712 },
9713 {
9714 "clip_ratio": 0.0,
9715 "completion_length": 106.84440612792969,
9716 "epoch": 7.977011494252873,
9717 "grad_norm": 1.5585940385657546,
9718 "kl": 0.04541015625,
9719 "learning_rate": 2.0229885057471264e-07,
9720 "loss": 0.0019,
9721 "reward": 1.8896058797836304,
9722 "reward_std": 0.022624578326940536,
9723 "rewards/accuracy_reward": 0.8902568817138672,
9724 "rewards/format_reward": 0.9993489980697632,
9725 "step": 694
9726 },
9727 {
9728 "clip_ratio": 0.0,
9729 "completion_length": 104.83268737792969,
9730 "epoch": 7.988505747126437,
9731 "grad_norm": 4.611642156415871,
9732 "kl": 0.045654296875,
9733 "learning_rate": 2.0114942528735633e-07,
9734 "loss": 0.0019,
9735 "reward": 1.9184387922286987,
9736 "reward_std": 0.01827239617705345,
9737 "rewards/accuracy_reward": 0.9184388518333435,
9738 "rewards/format_reward": 1.0,
9739 "step": 695
9740 },
9741 {
9742 "clip_ratio": 0.0,
9743 "completion_length": 100.7078628540039,
9744 "epoch": 8.0,
9745 "grad_norm": 3.175089899536824,
9746 "kl": 0.043212890625,
9747 "learning_rate": 2e-07,
9748 "loss": 0.0019,
9749 "reward": 1.917198657989502,
9750 "reward_std": 0.017983654513955116,
9751 "rewards/accuracy_reward": 0.9171985983848572,
9752 "rewards/format_reward": 1.0,
9753 "step": 696
9754 },
9755 {
9756 "clip_ratio": 0.0,
9757 "completion_length": 108.88671875,
9758 "epoch": 8.011494252873563,
9759 "grad_norm": 1.599375509212271,
9760 "kl": 0.04736328125,
9761 "learning_rate": 1.9885057471264365e-07,
9762 "loss": 0.002,
9763 "reward": 1.900770902633667,
9764 "reward_std": 0.020885910838842392,
9765 "rewards/accuracy_reward": 0.9007708430290222,
9766 "rewards/format_reward": 1.0,
9767 "step": 697
9768 },
9769 {
9770 "clip_ratio": 0.0,
9771 "completion_length": 110.009765625,
9772 "epoch": 8.022988505747126,
9773 "grad_norm": 1.5228925864244678,
9774 "kl": 0.042236328125,
9775 "learning_rate": 1.9770114942528734e-07,
9776 "loss": 0.0018,
9777 "reward": 1.8776483535766602,
9778 "reward_std": 0.020006125792860985,
9779 "rewards/accuracy_reward": 0.878299355506897,
9780 "rewards/format_reward": 0.9993489980697632,
9781 "step": 698
9782 },
9783 {
9784 "clip_ratio": 0.0,
9785 "completion_length": 108.82942962646484,
9786 "epoch": 8.03448275862069,
9787 "grad_norm": 1.8238558955068145,
9788 "kl": 0.044189453125,
9789 "learning_rate": 1.9655172413793103e-07,
9790 "loss": 0.0018,
9791 "reward": 1.9061753749847412,
9792 "reward_std": 0.01833120919764042,
9793 "rewards/accuracy_reward": 0.9061753153800964,
9794 "rewards/format_reward": 1.0,
9795 "step": 699
9796 },
9797 {
9798 "clip_ratio": 0.0,
9799 "completion_length": 108.52864837646484,
9800 "epoch": 8.045977011494253,
9801 "grad_norm": 1.3341303550118901,
9802 "kl": 0.048095703125,
9803 "learning_rate": 1.9540229885057472e-07,
9804 "loss": 0.002,
9805 "reward": 1.8628864288330078,
9806 "reward_std": 0.02142808958888054,
9807 "rewards/accuracy_reward": 0.8628865480422974,
9808 "rewards/format_reward": 1.0,
9809 "step": 700
9810 },
9811 {
9812 "clip_ratio": 0.0,
9813 "completion_length": 108.52799987792969,
9814 "epoch": 8.057471264367816,
9815 "grad_norm": 1.761222226586845,
9816 "kl": 0.0439453125,
9817 "learning_rate": 1.9425287356321836e-07,
9818 "loss": 0.0018,
9819 "reward": 1.8934277296066284,
9820 "reward_std": 0.0226028673350811,
9821 "rewards/accuracy_reward": 0.8940789103507996,
9822 "rewards/format_reward": 0.9993489980697632,
9823 "step": 701
9824 },
9825 {
9826 "clip_ratio": 0.0,
9827 "completion_length": 107.29948425292969,
9828 "epoch": 8.068965517241379,
9829 "grad_norm": 1.5102446080782972,
9830 "kl": 0.0439453125,
9831 "learning_rate": 1.9310344827586205e-07,
9832 "loss": 0.0018,
9833 "reward": 1.8807815313339233,
9834 "reward_std": 0.023064211010932922,
9835 "rewards/accuracy_reward": 0.8814325332641602,
9836 "rewards/format_reward": 0.9993489980697632,
9837 "step": 702
9838 },
9839 {
9840 "clip_ratio": 0.0,
9841 "completion_length": 106.68099212646484,
9842 "epoch": 8.080459770114942,
9843 "grad_norm": 2.6463573417228234,
9844 "kl": 0.0458984375,
9845 "learning_rate": 1.9195402298850574e-07,
9846 "loss": 0.0019,
9847 "reward": 1.91706120967865,
9848 "reward_std": 0.018780669197440147,
9849 "rewards/accuracy_reward": 0.9170613288879395,
9850 "rewards/format_reward": 1.0,
9851 "step": 703
9852 },
9853 {
9854 "clip_ratio": 0.0,
9855 "completion_length": 104.86653900146484,
9856 "epoch": 8.091954022988507,
9857 "grad_norm": 1.9782556380033625,
9858 "kl": 0.046875,
9859 "learning_rate": 1.9080459770114943e-07,
9860 "loss": 0.002,
9861 "reward": 1.9006422758102417,
9862 "reward_std": 0.02227187156677246,
9863 "rewards/accuracy_reward": 0.9012932777404785,
9864 "rewards/format_reward": 0.9993489980697632,
9865 "step": 704
9866 },
9867 {
9868 "clip_ratio": 0.0,
9869 "completion_length": 106.27799987792969,
9870 "epoch": 8.10344827586207,
9871 "grad_norm": 1.571198957982024,
9872 "kl": 0.043701171875,
9873 "learning_rate": 1.896551724137931e-07,
9874 "loss": 0.0018,
9875 "reward": 1.901580810546875,
9876 "reward_std": 0.018761413171887398,
9877 "rewards/accuracy_reward": 0.901580810546875,
9878 "rewards/format_reward": 1.0,
9879 "step": 705
9880 },
9881 {
9882 "clip_ratio": 0.0,
9883 "completion_length": 105.69140625,
9884 "epoch": 8.114942528735632,
9885 "grad_norm": 2.093159658605368,
9886 "kl": 0.044921875,
9887 "learning_rate": 1.8850574712643678e-07,
9888 "loss": 0.0019,
9889 "reward": 1.891250491142273,
9890 "reward_std": 0.019944649189710617,
9891 "rewards/accuracy_reward": 0.8912505507469177,
9892 "rewards/format_reward": 1.0,
9893 "step": 706
9894 },
9895 {
9896 "clip_ratio": 0.0,
9897 "completion_length": 105.19921875,
9898 "epoch": 8.126436781609195,
9899 "grad_norm": 2.529542496168213,
9900 "kl": 0.041748046875,
9901 "learning_rate": 1.8735632183908045e-07,
9902 "loss": 0.0018,
9903 "reward": 1.8948949575424194,
9904 "reward_std": 0.020839963108301163,
9905 "rewards/accuracy_reward": 0.8948950171470642,
9906 "rewards/format_reward": 1.0,
9907 "step": 707
9908 },
9909 {
9910 "clip_ratio": 0.0,
9911 "completion_length": 105.47721862792969,
9912 "epoch": 8.137931034482758,
9913 "grad_norm": 2.451670338869928,
9914 "kl": 0.05419921875,
9915 "learning_rate": 1.8620689655172414e-07,
9916 "loss": 0.0023,
9917 "reward": 1.8975166082382202,
9918 "reward_std": 0.02222670242190361,
9919 "rewards/accuracy_reward": 0.898167610168457,
9920 "rewards/format_reward": 0.9993489980697632,
9921 "step": 708
9922 },
9923 {
9924 "clip_ratio": 0.0,
9925 "completion_length": 105.72005462646484,
9926 "epoch": 8.149425287356323,
9927 "grad_norm": 2.6109529506496525,
9928 "kl": 0.04150390625,
9929 "learning_rate": 1.850574712643678e-07,
9930 "loss": 0.0018,
9931 "reward": 1.8816875219345093,
9932 "reward_std": 0.021295679733157158,
9933 "rewards/accuracy_reward": 0.881687343120575,
9934 "rewards/format_reward": 1.0,
9935 "step": 709
9936 },
9937 {
9938 "clip_ratio": 0.0,
9939 "completion_length": 105.10482025146484,
9940 "epoch": 8.160919540229886,
9941 "grad_norm": 2.236575205027573,
9942 "kl": 0.04638671875,
9943 "learning_rate": 1.839080459770115e-07,
9944 "loss": 0.0019,
9945 "reward": 1.8859424591064453,
9946 "reward_std": 0.01964872144162655,
9947 "rewards/accuracy_reward": 0.8859424591064453,
9948 "rewards/format_reward": 1.0,
9949 "step": 710
9950 },
9951 {
9952 "clip_ratio": 0.0,
9953 "completion_length": 103.97721862792969,
9954 "epoch": 8.172413793103448,
9955 "grad_norm": 2.981879850123249,
9956 "kl": 0.044677734375,
9957 "learning_rate": 1.8275862068965518e-07,
9958 "loss": 0.0019,
9959 "reward": 1.9109057188034058,
9960 "reward_std": 0.01663116365671158,
9961 "rewards/accuracy_reward": 0.9109057188034058,
9962 "rewards/format_reward": 1.0,
9963 "step": 711
9964 },
9965 {
9966 "clip_ratio": 0.0,
9967 "completion_length": 104.57487487792969,
9968 "epoch": 8.183908045977011,
9969 "grad_norm": 2.3188147233415277,
9970 "kl": 0.05029296875,
9971 "learning_rate": 1.8160919540229884e-07,
9972 "loss": 0.0021,
9973 "reward": 1.8978303670883179,
9974 "reward_std": 0.021748527884483337,
9975 "rewards/accuracy_reward": 0.8978304266929626,
9976 "rewards/format_reward": 1.0,
9977 "step": 712
9978 },
9979 {
9980 "clip_ratio": 0.0,
9981 "completion_length": 104.91927337646484,
9982 "epoch": 8.195402298850574,
9983 "grad_norm": 3.225354097129705,
9984 "kl": 0.0439453125,
9985 "learning_rate": 1.804597701149425e-07,
9986 "loss": 0.0019,
9987 "reward": 1.8894078731536865,
9988 "reward_std": 0.018167873844504356,
9989 "rewards/accuracy_reward": 0.8894079923629761,
9990 "rewards/format_reward": 1.0,
9991 "step": 713
9992 },
9993 {
9994 "clip_ratio": 0.0,
9995 "completion_length": 102.33203125,
9996 "epoch": 8.206896551724139,
9997 "grad_norm": 2.1412928496025216,
9998 "kl": 0.0458984375,
9999 "learning_rate": 1.793103448275862e-07,
10000 "loss": 0.0019,
10001 "reward": 1.9061758518218994,
10002 "reward_std": 0.019810751080513,
10003 "rewards/accuracy_reward": 0.9061757922172546,
10004 "rewards/format_reward": 1.0,
10005 "step": 714
10006 },
10007 {
10008 "clip_ratio": 0.0,
10009 "completion_length": 103.83919525146484,
10010 "epoch": 8.218390804597702,
10011 "grad_norm": 1.4510305932233432,
10012 "kl": 0.049072265625,
10013 "learning_rate": 1.7816091954022988e-07,
10014 "loss": 0.002,
10015 "reward": 1.8892874717712402,
10016 "reward_std": 0.01888991892337799,
10017 "rewards/accuracy_reward": 0.8892874717712402,
10018 "rewards/format_reward": 1.0,
10019 "step": 715
10020 },
10021 {
10022 "clip_ratio": 0.0,
10023 "completion_length": 103.51302337646484,
10024 "epoch": 8.229885057471265,
10025 "grad_norm": 2.2029044678618663,
10026 "kl": 0.0537109375,
10027 "learning_rate": 1.7701149425287357e-07,
10028 "loss": 0.0022,
10029 "reward": 1.885032057762146,
10030 "reward_std": 0.01888071931898594,
10031 "rewards/accuracy_reward": 0.8850321173667908,
10032 "rewards/format_reward": 1.0,
10033 "step": 716
10034 },
10035 {
10036 "clip_ratio": 0.0,
10037 "completion_length": 103.80534362792969,
10038 "epoch": 8.241379310344827,
10039 "grad_norm": 2.3323680467220145,
10040 "kl": 0.04833984375,
10041 "learning_rate": 1.758620689655172e-07,
10042 "loss": 0.002,
10043 "reward": 1.9214333295822144,
10044 "reward_std": 0.021036818623542786,
10045 "rewards/accuracy_reward": 0.9220844507217407,
10046 "rewards/format_reward": 0.9993489980697632,
10047 "step": 717
10048 },
10049 {
10050 "clip_ratio": 0.0,
10051 "completion_length": 104.37435150146484,
10052 "epoch": 8.25287356321839,
10053 "grad_norm": 2.8730245319899317,
10054 "kl": 0.051513671875,
10055 "learning_rate": 1.747126436781609e-07,
10056 "loss": 0.0022,
10057 "reward": 1.8911713361740112,
10058 "reward_std": 0.019480139017105103,
10059 "rewards/accuracy_reward": 0.892473578453064,
10060 "rewards/format_reward": 0.9986979365348816,
10061 "step": 718
10062 },
10063 {
10064 "clip_ratio": 0.0,
10065 "completion_length": 102.451171875,
10066 "epoch": 8.264367816091955,
10067 "grad_norm": 1.9234276428720276,
10068 "kl": 0.045654296875,
10069 "learning_rate": 1.735632183908046e-07,
10070 "loss": 0.0019,
10071 "reward": 1.9077482223510742,
10072 "reward_std": 0.01729690097272396,
10073 "rewards/accuracy_reward": 0.9077481627464294,
10074 "rewards/format_reward": 1.0,
10075 "step": 719
10076 },
10077 {
10078 "clip_ratio": 0.0,
10079 "completion_length": 102.9140625,
10080 "epoch": 8.275862068965518,
10081 "grad_norm": 1.7140585268322626,
10082 "kl": 0.05126953125,
10083 "learning_rate": 1.7241379310344828e-07,
10084 "loss": 0.0021,
10085 "reward": 1.9137029647827148,
10086 "reward_std": 0.018116027116775513,
10087 "rewards/accuracy_reward": 0.9137029647827148,
10088 "rewards/format_reward": 1.0,
10089 "step": 720
10090 },
10091 {
10092 "clip_ratio": 0.0,
10093 "completion_length": 103.93034362792969,
10094 "epoch": 8.28735632183908,
10095 "grad_norm": 2.219230959644551,
10096 "kl": 0.04736328125,
10097 "learning_rate": 1.7126436781609194e-07,
10098 "loss": 0.0019,
10099 "reward": 1.894079566001892,
10100 "reward_std": 0.02045278623700142,
10101 "rewards/accuracy_reward": 0.8940793871879578,
10102 "rewards/format_reward": 1.0,
10103 "step": 721
10104 },
10105 {
10106 "clip_ratio": 0.0,
10107 "completion_length": 104.455078125,
10108 "epoch": 8.298850574712644,
10109 "grad_norm": 2.5331188317191087,
10110 "kl": 0.04541015625,
10111 "learning_rate": 1.7011494252873563e-07,
10112 "loss": 0.0019,
10113 "reward": 1.9165410995483398,
10114 "reward_std": 0.017832912504673004,
10115 "rewards/accuracy_reward": 0.9165410995483398,
10116 "rewards/format_reward": 1.0,
10117 "step": 722
10118 },
10119 {
10120 "clip_ratio": 0.0,
10121 "completion_length": 103.32357025146484,
10122 "epoch": 8.310344827586206,
10123 "grad_norm": 4.02373195859424,
10124 "kl": 0.0478515625,
10125 "learning_rate": 1.689655172413793e-07,
10126 "loss": 0.002,
10127 "reward": 1.894669532775879,
10128 "reward_std": 0.020446766167879105,
10129 "rewards/accuracy_reward": 0.8946696519851685,
10130 "rewards/format_reward": 1.0,
10131 "step": 723
10132 },
10133 {
10134 "clip_ratio": 0.0,
10135 "completion_length": 104.5,
10136 "epoch": 8.32183908045977,
10137 "grad_norm": 1.8288623944358686,
10138 "kl": 0.051513671875,
10139 "learning_rate": 1.67816091954023e-07,
10140 "loss": 0.0021,
10141 "reward": 1.9076260328292847,
10142 "reward_std": 0.019866902381181717,
10143 "rewards/accuracy_reward": 0.9082770347595215,
10144 "rewards/format_reward": 0.9993489980697632,
10145 "step": 724
10146 },
10147 {
10148 "clip_ratio": 0.0,
10149 "completion_length": 104.72786712646484,
10150 "epoch": 8.333333333333334,
10151 "grad_norm": 5.4298852737123635,
10152 "kl": 0.048095703125,
10153 "learning_rate": 1.6666666666666665e-07,
10154 "loss": 0.002,
10155 "reward": 1.9031809568405151,
10156 "reward_std": 0.02082451805472374,
10157 "rewards/accuracy_reward": 0.903831958770752,
10158 "rewards/format_reward": 0.9993489980697632,
10159 "step": 725
10160 },
10161 {
10162 "clip_ratio": 0.0,
10163 "completion_length": 104.61458587646484,
10164 "epoch": 8.344827586206897,
10165 "grad_norm": 2.2093808314572296,
10166 "kl": 0.049072265625,
10167 "learning_rate": 1.6551724137931034e-07,
10168 "loss": 0.002,
10169 "reward": 1.8967069387435913,
10170 "reward_std": 0.018487486988306046,
10171 "rewards/accuracy_reward": 0.8967069983482361,
10172 "rewards/format_reward": 1.0,
10173 "step": 726
10174 },
10175 {
10176 "clip_ratio": 0.0,
10177 "completion_length": 105.33268737792969,
10178 "epoch": 8.35632183908046,
10179 "grad_norm": 1.6410241094216826,
10180 "kl": 0.0478515625,
10181 "learning_rate": 1.6436781609195403e-07,
10182 "loss": 0.002,
10183 "reward": 1.9131641387939453,
10184 "reward_std": 0.016289152204990387,
10185 "rewards/accuracy_reward": 0.9131642580032349,
10186 "rewards/format_reward": 1.0,
10187 "step": 727
10188 },
10189 {
10190 "clip_ratio": 0.0,
10191 "completion_length": 104.259765625,
10192 "epoch": 8.367816091954023,
10193 "grad_norm": 1.7686301784709073,
10194 "kl": 0.049560546875,
10195 "learning_rate": 1.632183908045977e-07,
10196 "loss": 0.002,
10197 "reward": 1.9046128988265991,
10198 "reward_std": 0.0201382078230381,
10199 "rewards/accuracy_reward": 0.9046128988265991,
10200 "rewards/format_reward": 1.0,
10201 "step": 728
10202 },
10203 {
10204 "clip_ratio": 0.0,
10205 "completion_length": 105.443359375,
10206 "epoch": 8.379310344827585,
10207 "grad_norm": 2.16179386604187,
10208 "kl": 0.0458984375,
10209 "learning_rate": 1.6206896551724136e-07,
10210 "loss": 0.0019,
10211 "reward": 1.8986444473266602,
10212 "reward_std": 0.01659722626209259,
10213 "rewards/accuracy_reward": 0.8986444473266602,
10214 "rewards/format_reward": 1.0,
10215 "step": 729
10216 },
10217 {
10218 "clip_ratio": 0.0,
10219 "completion_length": 104.22721862792969,
10220 "epoch": 8.39080459770115,
10221 "grad_norm": 2.8856095934176857,
10222 "kl": 0.048828125,
10223 "learning_rate": 1.6091954022988505e-07,
10224 "loss": 0.002,
10225 "reward": 1.9096076488494873,
10226 "reward_std": 0.018815144896507263,
10227 "rewards/accuracy_reward": 0.9096077680587769,
10228 "rewards/format_reward": 1.0,
10229 "step": 730
10230 },
10231 {
10232 "clip_ratio": 0.0,
10233 "completion_length": 104.619140625,
10234 "epoch": 8.402298850574713,
10235 "grad_norm": 2.9006809475096658,
10236 "kl": 0.051513671875,
10237 "learning_rate": 1.5977011494252874e-07,
10238 "loss": 0.0022,
10239 "reward": 1.9206702709197998,
10240 "reward_std": 0.01633971370756626,
10241 "rewards/accuracy_reward": 0.920670211315155,
10242 "rewards/format_reward": 1.0,
10243 "step": 731
10244 },
10245 {
10246 "clip_ratio": 0.0,
10247 "completion_length": 104.259765625,
10248 "epoch": 8.413793103448276,
10249 "grad_norm": 1.9751932337603062,
10250 "kl": 0.052734375,
10251 "learning_rate": 1.5862068965517243e-07,
10252 "loss": 0.0022,
10253 "reward": 1.893907070159912,
10254 "reward_std": 0.02224368415772915,
10255 "rewards/accuracy_reward": 0.8939072489738464,
10256 "rewards/format_reward": 1.0,
10257 "step": 732
10258 },
10259 {
10260 "clip_ratio": 0.0,
10261 "completion_length": 106.00846862792969,
10262 "epoch": 8.425287356321839,
10263 "grad_norm": 2.0957540123306364,
10264 "kl": 0.0556640625,
10265 "learning_rate": 1.5747126436781606e-07,
10266 "loss": 0.0023,
10267 "reward": 1.9105441570281982,
10268 "reward_std": 0.01823435351252556,
10269 "rewards/accuracy_reward": 0.9105440974235535,
10270 "rewards/format_reward": 1.0,
10271 "step": 733
10272 },
10273 {
10274 "clip_ratio": 0.0,
10275 "completion_length": 103.10872650146484,
10276 "epoch": 8.436781609195402,
10277 "grad_norm": 1.7574530073426011,
10278 "kl": 0.049072265625,
10279 "learning_rate": 1.5632183908045975e-07,
10280 "loss": 0.002,
10281 "reward": 1.9063684940338135,
10282 "reward_std": 0.017554182559251785,
10283 "rewards/accuracy_reward": 0.9063684940338135,
10284 "rewards/format_reward": 1.0,
10285 "step": 734
10286 },
10287 {
10288 "clip_ratio": 0.0,
10289 "completion_length": 105.40755462646484,
10290 "epoch": 8.448275862068966,
10291 "grad_norm": 1.718104369150053,
10292 "kl": 0.04638671875,
10293 "learning_rate": 1.5517241379310344e-07,
10294 "loss": 0.0019,
10295 "reward": 1.9011447429656982,
10296 "reward_std": 0.02049892395734787,
10297 "rewards/accuracy_reward": 0.9017957448959351,
10298 "rewards/format_reward": 0.9993489980697632,
10299 "step": 735
10300 },
10301 {
10302 "clip_ratio": 0.0,
10303 "completion_length": 107.095703125,
10304 "epoch": 8.459770114942529,
10305 "grad_norm": 1.8096241235073485,
10306 "kl": 0.044921875,
10307 "learning_rate": 1.5402298850574713e-07,
10308 "loss": 0.0019,
10309 "reward": 1.9035829305648804,
10310 "reward_std": 0.017755288630723953,
10311 "rewards/accuracy_reward": 0.9035829305648804,
10312 "rewards/format_reward": 1.0,
10313 "step": 736
10314 },
10315 {
10316 "clip_ratio": 0.0,
10317 "completion_length": 106.228515625,
10318 "epoch": 8.471264367816092,
10319 "grad_norm": 4.3156887737751966,
10320 "kl": 0.04541015625,
10321 "learning_rate": 1.528735632183908e-07,
10322 "loss": 0.0019,
10323 "reward": 1.8951524496078491,
10324 "reward_std": 0.01714376173913479,
10325 "rewards/accuracy_reward": 0.8951523303985596,
10326 "rewards/format_reward": 1.0,
10327 "step": 737
10328 },
10329 {
10330 "clip_ratio": 0.0,
10331 "completion_length": 107.240234375,
10332 "epoch": 8.482758620689655,
10333 "grad_norm": 1.8203610430487198,
10334 "kl": 0.046142578125,
10335 "learning_rate": 1.5172413793103449e-07,
10336 "loss": 0.0019,
10337 "reward": 1.9012572765350342,
10338 "reward_std": 0.020000826567411423,
10339 "rewards/accuracy_reward": 0.9019083976745605,
10340 "rewards/format_reward": 0.9993489980697632,
10341 "step": 738
10342 },
10343 {
10344 "clip_ratio": 0.0,
10345 "completion_length": 104.80078125,
10346 "epoch": 8.494252873563218,
10347 "grad_norm": 2.4493356638370547,
10348 "kl": 0.046630859375,
10349 "learning_rate": 1.5057471264367815e-07,
10350 "loss": 0.002,
10351 "reward": 1.8994574546813965,
10352 "reward_std": 0.01900039240717888,
10353 "rewards/accuracy_reward": 0.9001085758209229,
10354 "rewards/format_reward": 0.9993489980697632,
10355 "step": 739
10356 },
10357 {
10358 "clip_ratio": 0.0,
10359 "completion_length": 104.68815612792969,
10360 "epoch": 8.505747126436782,
10361 "grad_norm": 1.7961165376254185,
10362 "kl": 0.048583984375,
10363 "learning_rate": 1.4942528735632184e-07,
10364 "loss": 0.002,
10365 "reward": 1.8904352188110352,
10366 "reward_std": 0.018353838473558426,
10367 "rewards/accuracy_reward": 0.8904353380203247,
10368 "rewards/format_reward": 1.0,
10369 "step": 740
10370 },
10371 {
10372 "clip_ratio": 0.0,
10373 "completion_length": 107.22721862792969,
10374 "epoch": 8.517241379310345,
10375 "grad_norm": 2.124448648242551,
10376 "kl": 0.048583984375,
10377 "learning_rate": 1.482758620689655e-07,
10378 "loss": 0.002,
10379 "reward": 1.9015287160873413,
10380 "reward_std": 0.017897652462124825,
10381 "rewards/accuracy_reward": 0.9015287756919861,
10382 "rewards/format_reward": 1.0,
10383 "step": 741
10384 },
10385 {
10386 "clip_ratio": 0.0,
10387 "completion_length": 105.91081237792969,
10388 "epoch": 8.528735632183908,
10389 "grad_norm": 1.5182426183899522,
10390 "kl": 0.048095703125,
10391 "learning_rate": 1.471264367816092e-07,
10392 "loss": 0.002,
10393 "reward": 1.9014829397201538,
10394 "reward_std": 0.020442795008420944,
10395 "rewards/accuracy_reward": 0.9021339416503906,
10396 "rewards/format_reward": 0.9993489980697632,
10397 "step": 742
10398 },
10399 {
10400 "clip_ratio": 0.0,
10401 "completion_length": 105.91536712646484,
10402 "epoch": 8.540229885057471,
10403 "grad_norm": 1.5309622912667127,
10404 "kl": 0.05029296875,
10405 "learning_rate": 1.4597701149425288e-07,
10406 "loss": 0.0021,
10407 "reward": 1.9063705205917358,
10408 "reward_std": 0.019261373206973076,
10409 "rewards/accuracy_reward": 0.9070214629173279,
10410 "rewards/format_reward": 0.9993489980697632,
10411 "step": 743
10412 },
10413 {
10414 "clip_ratio": 0.0,
10415 "completion_length": 105.28190612792969,
10416 "epoch": 8.551724137931034,
10417 "grad_norm": 1.4853282790643823,
10418 "kl": 0.04296875,
10419 "learning_rate": 1.4482758620689654e-07,
10420 "loss": 0.0018,
10421 "reward": 1.9033745527267456,
10422 "reward_std": 0.01895163580775261,
10423 "rewards/accuracy_reward": 0.904025673866272,
10424 "rewards/format_reward": 0.9993489980697632,
10425 "step": 744
10426 },
10427 {
10428 "clip_ratio": 0.0,
10429 "completion_length": 104.767578125,
10430 "epoch": 8.563218390804598,
10431 "grad_norm": 1.5019420177279086,
10432 "kl": 0.053466796875,
10433 "learning_rate": 1.436781609195402e-07,
10434 "loss": 0.0022,
10435 "reward": 1.8998239040374756,
10436 "reward_std": 0.021759074181318283,
10437 "rewards/accuracy_reward": 0.9004749059677124,
10438 "rewards/format_reward": 0.9993489980697632,
10439 "step": 745
10440 },
10441 {
10442 "clip_ratio": 0.0,
10443 "completion_length": 107.77018737792969,
10444 "epoch": 8.574712643678161,
10445 "grad_norm": 1.5414148798759049,
10446 "kl": 0.04736328125,
10447 "learning_rate": 1.425287356321839e-07,
10448 "loss": 0.002,
10449 "reward": 1.9003026485443115,
10450 "reward_std": 0.021097760647535324,
10451 "rewards/accuracy_reward": 0.9009537100791931,
10452 "rewards/format_reward": 0.9993489980697632,
10453 "step": 746
10454 },
10455 {
10456 "clip_ratio": 0.0,
10457 "completion_length": 108.10872650146484,
10458 "epoch": 8.586206896551724,
10459 "grad_norm": 4.960221853934078,
10460 "kl": 0.04345703125,
10461 "learning_rate": 1.413793103448276e-07,
10462 "loss": 0.0018,
10463 "reward": 1.920248031616211,
10464 "reward_std": 0.01949543133378029,
10465 "rewards/accuracy_reward": 0.9208990931510925,
10466 "rewards/format_reward": 0.9993489980697632,
10467 "step": 747
10468 },
10469 {
10470 "clip_ratio": 0.0,
10471 "completion_length": 107.27409362792969,
10472 "epoch": 8.597701149425287,
10473 "grad_norm": 1.464489207745014,
10474 "kl": 0.044921875,
10475 "learning_rate": 1.4022988505747128e-07,
10476 "loss": 0.0019,
10477 "reward": 1.9014743566513062,
10478 "reward_std": 0.017846036702394485,
10479 "rewards/accuracy_reward": 0.9014744162559509,
10480 "rewards/format_reward": 1.0,
10481 "step": 748
10482 },
10483 {
10484 "clip_ratio": 0.0,
10485 "completion_length": 108.01302337646484,
10486 "epoch": 8.60919540229885,
10487 "grad_norm": 5.576587184664697,
10488 "kl": 0.04443359375,
10489 "learning_rate": 1.3908045977011494e-07,
10490 "loss": 0.0018,
10491 "reward": 1.8989577293395996,
10492 "reward_std": 0.019103601574897766,
10493 "rewards/accuracy_reward": 0.8989577293395996,
10494 "rewards/format_reward": 1.0,
10495 "step": 749
10496 },
10497 {
10498 "clip_ratio": 0.0,
10499 "completion_length": 107.55208587646484,
10500 "epoch": 8.620689655172415,
10501 "grad_norm": 3.391688889257715,
10502 "kl": 0.042236328125,
10503 "learning_rate": 1.379310344827586e-07,
10504 "loss": 0.0018,
10505 "reward": 1.9022835493087769,
10506 "reward_std": 0.020112819969654083,
10507 "rewards/accuracy_reward": 0.9022834300994873,
10508 "rewards/format_reward": 1.0,
10509 "step": 750
10510 },
10511 {
10512 "clip_ratio": 0.0,
10513 "completion_length": 108.23112487792969,
10514 "epoch": 8.632183908045977,
10515 "grad_norm": 1.415131831015453,
10516 "kl": 0.046630859375,
10517 "learning_rate": 1.367816091954023e-07,
10518 "loss": 0.0019,
10519 "reward": 1.8941152095794678,
10520 "reward_std": 0.02364903688430786,
10521 "rewards/accuracy_reward": 0.894115149974823,
10522 "rewards/format_reward": 1.0,
10523 "step": 751
10524 },
10525 {
10526 "clip_ratio": 0.0,
10527 "completion_length": 108.39974212646484,
10528 "epoch": 8.64367816091954,
10529 "grad_norm": 3.686963959842639,
10530 "kl": 0.043212890625,
10531 "learning_rate": 1.3563218390804598e-07,
10532 "loss": 0.0018,
10533 "reward": 1.8956639766693115,
10534 "reward_std": 0.01937917247414589,
10535 "rewards/accuracy_reward": 0.8956639170646667,
10536 "rewards/format_reward": 1.0,
10537 "step": 752
10538 },
10539 {
10540 "clip_ratio": 0.0,
10541 "completion_length": 106.33659362792969,
10542 "epoch": 8.655172413793103,
10543 "grad_norm": 11.793387997885002,
10544 "kl": 0.04443359375,
10545 "learning_rate": 1.3448275862068965e-07,
10546 "loss": 0.0018,
10547 "reward": 1.9092130661010742,
10548 "reward_std": 0.020166244357824326,
10549 "rewards/accuracy_reward": 0.9098643064498901,
10550 "rewards/format_reward": 0.9993489980697632,
10551 "step": 753
10552 },
10553 {
10554 "clip_ratio": 0.0,
10555 "completion_length": 108.69075775146484,
10556 "epoch": 8.666666666666666,
10557 "grad_norm": 1.419014822654193,
10558 "kl": 0.0517578125,
10559 "learning_rate": 1.3333333333333334e-07,
10560 "loss": 0.0021,
10561 "reward": 1.8976190090179443,
10562 "reward_std": 0.018913621082901955,
10563 "rewards/accuracy_reward": 0.8976188898086548,
10564 "rewards/format_reward": 1.0,
10565 "step": 754
10566 },
10567 {
10568 "clip_ratio": 0.0,
10569 "completion_length": 107.52799987792969,
10570 "epoch": 8.678160919540229,
10571 "grad_norm": 5.95529479625069,
10572 "kl": 0.043212890625,
10573 "learning_rate": 1.32183908045977e-07,
10574 "loss": 0.0018,
10575 "reward": 1.9074852466583252,
10576 "reward_std": 0.017688315361738205,
10577 "rewards/accuracy_reward": 0.9074852466583252,
10578 "rewards/format_reward": 1.0,
10579 "step": 755
10580 },
10581 {
10582 "clip_ratio": 0.0,
10583 "completion_length": 107.25521087646484,
10584 "epoch": 8.689655172413794,
10585 "grad_norm": 2.7625404647260265,
10586 "kl": 0.048828125,
10587 "learning_rate": 1.310344827586207e-07,
10588 "loss": 0.002,
10589 "reward": 1.8992561101913452,
10590 "reward_std": 0.021415000781416893,
10591 "rewards/accuracy_reward": 0.899907112121582,
10592 "rewards/format_reward": 0.9993489980697632,
10593 "step": 756
10594 },
10595 {
10596 "clip_ratio": 0.0,
10597 "completion_length": 109.72721862792969,
10598 "epoch": 8.701149425287356,
10599 "grad_norm": 3.132870047881132,
10600 "kl": 0.04638671875,
10601 "learning_rate": 1.2988505747126435e-07,
10602 "loss": 0.0019,
10603 "reward": 1.8953626155853271,
10604 "reward_std": 0.01965472660958767,
10605 "rewards/accuracy_reward": 0.8953627347946167,
10606 "rewards/format_reward": 1.0,
10607 "step": 757
10608 },
10609 {
10610 "clip_ratio": 0.0,
10611 "completion_length": 108.06185150146484,
10612 "epoch": 8.71264367816092,
10613 "grad_norm": 1.4067186469745852,
10614 "kl": 0.047119140625,
10615 "learning_rate": 1.2873563218390804e-07,
10616 "loss": 0.002,
10617 "reward": 1.9052462577819824,
10618 "reward_std": 0.019855869933962822,
10619 "rewards/accuracy_reward": 0.9052464365959167,
10620 "rewards/format_reward": 1.0,
10621 "step": 758
10622 },
10623 {
10624 "clip_ratio": 0.0,
10625 "completion_length": 105.248046875,
10626 "epoch": 8.724137931034482,
10627 "grad_norm": 1.6124971541846125,
10628 "kl": 0.0478515625,
10629 "learning_rate": 1.2758620689655173e-07,
10630 "loss": 0.002,
10631 "reward": 1.9074506759643555,
10632 "reward_std": 0.017859049141407013,
10633 "rewards/accuracy_reward": 0.9074506759643555,
10634 "rewards/format_reward": 1.0,
10635 "step": 759
10636 },
10637 {
10638 "clip_ratio": 0.0,
10639 "completion_length": 106.123046875,
10640 "epoch": 8.735632183908045,
10641 "grad_norm": 1.6651646299458338,
10642 "kl": 0.041259765625,
10643 "learning_rate": 1.2643678160919542e-07,
10644 "loss": 0.0017,
10645 "reward": 1.8970210552215576,
10646 "reward_std": 0.01788368821144104,
10647 "rewards/accuracy_reward": 0.8970209956169128,
10648 "rewards/format_reward": 1.0,
10649 "step": 760
10650 },
10651 {
10652 "clip_ratio": 0.0,
10653 "completion_length": 106.68099212646484,
10654 "epoch": 8.74712643678161,
10655 "grad_norm": 1.908276402264626,
10656 "kl": 0.0478515625,
10657 "learning_rate": 1.2528735632183906e-07,
10658 "loss": 0.002,
10659 "reward": 1.911945104598999,
10660 "reward_std": 0.020722268149256706,
10661 "rewards/accuracy_reward": 0.9125961065292358,
10662 "rewards/format_reward": 0.9993489980697632,
10663 "step": 761
10664 },
10665 {
10666 "clip_ratio": 0.0,
10667 "completion_length": 106.97396087646484,
10668 "epoch": 8.758620689655173,
10669 "grad_norm": 1.909218993992126,
10670 "kl": 0.04345703125,
10671 "learning_rate": 1.2413793103448275e-07,
10672 "loss": 0.0018,
10673 "reward": 1.9106658697128296,
10674 "reward_std": 0.017978399991989136,
10675 "rewards/accuracy_reward": 0.9106658697128296,
10676 "rewards/format_reward": 1.0,
10677 "step": 762
10678 },
10679 {
10680 "clip_ratio": 0.0,
10681 "completion_length": 106.734375,
10682 "epoch": 8.770114942528735,
10683 "grad_norm": 2.2467623211703014,
10684 "kl": 0.045654296875,
10685 "learning_rate": 1.2298850574712644e-07,
10686 "loss": 0.0019,
10687 "reward": 1.9027222394943237,
10688 "reward_std": 0.016572915017604828,
10689 "rewards/accuracy_reward": 0.9027222394943237,
10690 "rewards/format_reward": 1.0,
10691 "step": 763
10692 },
10693 {
10694 "clip_ratio": 0.0,
10695 "completion_length": 106.62630462646484,
10696 "epoch": 8.781609195402298,
10697 "grad_norm": 5.599771626606617,
10698 "kl": 0.04833984375,
10699 "learning_rate": 1.218390804597701e-07,
10700 "loss": 0.002,
10701 "reward": 1.9221599102020264,
10702 "reward_std": 0.016314268112182617,
10703 "rewards/accuracy_reward": 0.9221599102020264,
10704 "rewards/format_reward": 1.0,
10705 "step": 764
10706 },
10707 {
10708 "clip_ratio": 0.0,
10709 "completion_length": 105.12760925292969,
10710 "epoch": 8.793103448275861,
10711 "grad_norm": 2.4498609839357823,
10712 "kl": 0.05078125,
10713 "learning_rate": 1.206896551724138e-07,
10714 "loss": 0.0021,
10715 "reward": 1.9224801063537598,
10716 "reward_std": 0.01809362694621086,
10717 "rewards/accuracy_reward": 0.9224801063537598,
10718 "rewards/format_reward": 1.0,
10719 "step": 765
10720 },
10721 {
10722 "clip_ratio": 0.0,
10723 "completion_length": 108.19857025146484,
10724 "epoch": 8.804597701149426,
10725 "grad_norm": 2.5801196829452584,
10726 "kl": 0.0439453125,
10727 "learning_rate": 1.1954022988505745e-07,
10728 "loss": 0.0018,
10729 "reward": 1.898393154144287,
10730 "reward_std": 0.01924772374331951,
10731 "rewards/accuracy_reward": 0.8983932733535767,
10732 "rewards/format_reward": 1.0,
10733 "step": 766
10734 },
10735 {
10736 "clip_ratio": 0.0,
10737 "completion_length": 108.40690612792969,
10738 "epoch": 8.816091954022989,
10739 "grad_norm": 2.032967972913373,
10740 "kl": 0.04345703125,
10741 "learning_rate": 1.1839080459770114e-07,
10742 "loss": 0.0018,
10743 "reward": 1.9099705219268799,
10744 "reward_std": 0.02100074663758278,
10745 "rewards/accuracy_reward": 0.9106216430664062,
10746 "rewards/format_reward": 0.9993489980697632,
10747 "step": 767
10748 },
10749 {
10750 "clip_ratio": 0.0,
10751 "completion_length": 107.61653900146484,
10752 "epoch": 8.827586206896552,
10753 "grad_norm": 2.07582370315509,
10754 "kl": 0.046630859375,
10755 "learning_rate": 1.1724137931034482e-07,
10756 "loss": 0.002,
10757 "reward": 1.9123599529266357,
10758 "reward_std": 0.018274664878845215,
10759 "rewards/accuracy_reward": 0.912359893321991,
10760 "rewards/format_reward": 1.0,
10761 "step": 768
10762 },
10763 {
10764 "clip_ratio": 0.0,
10765 "completion_length": 109.04817962646484,
10766 "epoch": 8.839080459770114,
10767 "grad_norm": 2.1468683147084873,
10768 "kl": 0.04638671875,
10769 "learning_rate": 1.1609195402298851e-07,
10770 "loss": 0.0019,
10771 "reward": 1.905731439590454,
10772 "reward_std": 0.021076953038573265,
10773 "rewards/accuracy_reward": 0.9057313799858093,
10774 "rewards/format_reward": 1.0,
10775 "step": 769
10776 },
10777 {
10778 "clip_ratio": 0.0,
10779 "completion_length": 107.3046875,
10780 "epoch": 8.850574712643677,
10781 "grad_norm": 1.5666594265928468,
10782 "kl": 0.044677734375,
10783 "learning_rate": 1.1494252873563217e-07,
10784 "loss": 0.0019,
10785 "reward": 1.9021600484848022,
10786 "reward_std": 0.021117765456438065,
10787 "rewards/accuracy_reward": 0.902160108089447,
10788 "rewards/format_reward": 1.0,
10789 "step": 770
10790 },
10791 {
10792 "clip_ratio": 0.0,
10793 "completion_length": 108.53255462646484,
10794 "epoch": 8.862068965517242,
10795 "grad_norm": 3.9234683438946254,
10796 "kl": 0.044677734375,
10797 "learning_rate": 1.1379310344827586e-07,
10798 "loss": 0.0019,
10799 "reward": 1.9176257848739624,
10800 "reward_std": 0.018359411507844925,
10801 "rewards/accuracy_reward": 0.9176258444786072,
10802 "rewards/format_reward": 1.0,
10803 "step": 771
10804 },
10805 {
10806 "clip_ratio": 0.0,
10807 "completion_length": 109.14974212646484,
10808 "epoch": 8.873563218390805,
10809 "grad_norm": 3.472231746273925,
10810 "kl": 0.044189453125,
10811 "learning_rate": 1.1264367816091953e-07,
10812 "loss": 0.0018,
10813 "reward": 1.9001665115356445,
10814 "reward_std": 0.018229342997074127,
10815 "rewards/accuracy_reward": 0.9001666903495789,
10816 "rewards/format_reward": 1.0,
10817 "step": 772
10818 },
10819 {
10820 "clip_ratio": 0.0,
10821 "completion_length": 107.37825775146484,
10822 "epoch": 8.885057471264368,
10823 "grad_norm": 3.1028518982061635,
10824 "kl": 0.046875,
10825 "learning_rate": 1.1149425287356322e-07,
10826 "loss": 0.0019,
10827 "reward": 1.911559820175171,
10828 "reward_std": 0.018318263813853264,
10829 "rewards/accuracy_reward": 0.9115597605705261,
10830 "rewards/format_reward": 1.0,
10831 "step": 773
10832 },
10833 {
10834 "clip_ratio": 0.0,
10835 "completion_length": 107.07487487792969,
10836 "epoch": 8.89655172413793,
10837 "grad_norm": 2.1968963906989276,
10838 "kl": 0.047607421875,
10839 "learning_rate": 1.103448275862069e-07,
10840 "loss": 0.002,
10841 "reward": 1.9014253616333008,
10842 "reward_std": 0.02046075090765953,
10843 "rewards/accuracy_reward": 0.9020764827728271,
10844 "rewards/format_reward": 0.9993489980697632,
10845 "step": 774
10846 },
10847 {
10848 "clip_ratio": 0.0,
10849 "completion_length": 106.416015625,
10850 "epoch": 8.908045977011493,
10851 "grad_norm": 1.899936745200757,
10852 "kl": 0.046630859375,
10853 "learning_rate": 1.0919540229885057e-07,
10854 "loss": 0.002,
10855 "reward": 1.8940951824188232,
10856 "reward_std": 0.017228955402970314,
10857 "rewards/accuracy_reward": 0.8940951228141785,
10858 "rewards/format_reward": 1.0,
10859 "step": 775
10860 },
10861 {
10862 "clip_ratio": 0.0,
10863 "completion_length": 106.01692962646484,
10864 "epoch": 8.919540229885058,
10865 "grad_norm": 2.350383361616655,
10866 "kl": 0.046630859375,
10867 "learning_rate": 1.0804597701149425e-07,
10868 "loss": 0.002,
10869 "reward": 1.8946069478988647,
10870 "reward_std": 0.01980327069759369,
10871 "rewards/accuracy_reward": 0.8946070075035095,
10872 "rewards/format_reward": 1.0,
10873 "step": 776
10874 },
10875 {
10876 "clip_ratio": 0.0,
10877 "completion_length": 107.88411712646484,
10878 "epoch": 8.931034482758621,
10879 "grad_norm": 5.641269463386258,
10880 "kl": 0.04296875,
10881 "learning_rate": 1.0689655172413794e-07,
10882 "loss": 0.0018,
10883 "reward": 1.8930943012237549,
10884 "reward_std": 0.020226947963237762,
10885 "rewards/accuracy_reward": 0.8930944204330444,
10886 "rewards/format_reward": 1.0,
10887 "step": 777
10888 },
10889 {
10890 "clip_ratio": 0.0,
10891 "completion_length": 106.46224212646484,
10892 "epoch": 8.942528735632184,
10893 "grad_norm": 1.590527535893143,
10894 "kl": 0.04541015625,
10895 "learning_rate": 1.057471264367816e-07,
10896 "loss": 0.0019,
10897 "reward": 1.9104032516479492,
10898 "reward_std": 0.01798829436302185,
10899 "rewards/accuracy_reward": 0.9104033708572388,
10900 "rewards/format_reward": 1.0,
10901 "step": 778
10902 },
10903 {
10904 "clip_ratio": 0.0,
10905 "completion_length": 106.85612487792969,
10906 "epoch": 8.954022988505747,
10907 "grad_norm": 2.6107268057573365,
10908 "kl": 0.048095703125,
10909 "learning_rate": 1.0459770114942529e-07,
10910 "loss": 0.002,
10911 "reward": 1.9010858535766602,
10912 "reward_std": 0.020739298313856125,
10913 "rewards/accuracy_reward": 0.9010860323905945,
10914 "rewards/format_reward": 1.0,
10915 "step": 779
10916 },
10917 {
10918 "clip_ratio": 0.0,
10919 "completion_length": 105.03385925292969,
10920 "epoch": 8.96551724137931,
10921 "grad_norm": 1.6092109607995282,
10922 "kl": 0.053466796875,
10923 "learning_rate": 1.0344827586206897e-07,
10924 "loss": 0.0023,
10925 "reward": 1.900530219078064,
10926 "reward_std": 0.01735313981771469,
10927 "rewards/accuracy_reward": 0.900530219078064,
10928 "rewards/format_reward": 1.0,
10929 "step": 780
10930 },
10931 {
10932 "clip_ratio": 0.0,
10933 "completion_length": 106.65950775146484,
10934 "epoch": 8.977011494252874,
10935 "grad_norm": 1.4883968181068261,
10936 "kl": 0.049560546875,
10937 "learning_rate": 1.0229885057471264e-07,
10938 "loss": 0.002,
10939 "reward": 1.9065711498260498,
10940 "reward_std": 0.016935113817453384,
10941 "rewards/accuracy_reward": 0.9065712690353394,
10942 "rewards/format_reward": 1.0,
10943 "step": 781
10944 },
10945 {
10946 "clip_ratio": 0.0,
10947 "completion_length": 104.84635925292969,
10948 "epoch": 8.988505747126437,
10949 "grad_norm": 1.6246833714697817,
10950 "kl": 0.04931640625,
10951 "learning_rate": 1.0114942528735632e-07,
10952 "loss": 0.002,
10953 "reward": 1.91986083984375,
10954 "reward_std": 0.015720397233963013,
10955 "rewards/accuracy_reward": 0.91986083984375,
10956 "rewards/format_reward": 1.0,
10957 "step": 782
10958 },
10959 {
10960 "clip_ratio": 0.0,
10961 "completion_length": 99.25140380859375,
10962 "epoch": 9.0,
10963 "grad_norm": 2.248799852714355,
10964 "kl": 0.041259765625,
10965 "learning_rate": 1e-07,
10966 "loss": 0.0017,
10967 "reward": 1.899660587310791,
10968 "reward_std": 0.019224824383854866,
10969 "rewards/accuracy_reward": 0.8996607661247253,
10970 "rewards/format_reward": 1.0,
10971 "step": 783
10972 },
10973 {
10974 "clip_ratio": 0.0,
10975 "completion_length": 108.44075775146484,
10976 "epoch": 9.011494252873563,
10977 "grad_norm": 1.8721483816760687,
10978 "kl": 0.051513671875,
10979 "learning_rate": 9.885057471264367e-08,
10980 "loss": 0.0021,
10981 "reward": 1.8844293355941772,
10982 "reward_std": 0.025172477588057518,
10983 "rewards/accuracy_reward": 0.8857313990592957,
10984 "rewards/format_reward": 0.9986979365348816,
10985 "step": 784
10986 },
10987 {
10988 "clip_ratio": 0.0,
10989 "completion_length": 107.6640625,
10990 "epoch": 9.022988505747126,
10991 "grad_norm": 1.8506016770380465,
10992 "kl": 0.04150390625,
10993 "learning_rate": 9.770114942528736e-08,
10994 "loss": 0.0017,
10995 "reward": 1.911178469657898,
10996 "reward_std": 0.021104078739881516,
10997 "rewards/accuracy_reward": 0.911178469657898,
10998 "rewards/format_reward": 1.0,
10999 "step": 785
11000 },
11001 {
11002 "clip_ratio": 0.0,
11003 "completion_length": 109.47526550292969,
11004 "epoch": 9.03448275862069,
11005 "grad_norm": 2.4400240165615634,
11006 "kl": 0.04248046875,
11007 "learning_rate": 9.655172413793103e-08,
11008 "loss": 0.0018,
11009 "reward": 1.9191011190414429,
11010 "reward_std": 0.016874711960554123,
11011 "rewards/accuracy_reward": 0.9191012382507324,
11012 "rewards/format_reward": 1.0,
11013 "step": 786
11014 },
11015 {
11016 "clip_ratio": 0.0,
11017 "completion_length": 106.58268737792969,
11018 "epoch": 9.045977011494253,
11019 "grad_norm": 3.3949879642064476,
11020 "kl": 0.04150390625,
11021 "learning_rate": 9.540229885057471e-08,
11022 "loss": 0.0017,
11023 "reward": 1.8661808967590332,
11024 "reward_std": 0.02081323228776455,
11025 "rewards/accuracy_reward": 0.8661808967590332,
11026 "rewards/format_reward": 1.0,
11027 "step": 787
11028 },
11029 {
11030 "clip_ratio": 0.0,
11031 "completion_length": 107.15234375,
11032 "epoch": 9.057471264367816,
11033 "grad_norm": 1.407910897768254,
11034 "kl": 0.051025390625,
11035 "learning_rate": 9.425287356321839e-08,
11036 "loss": 0.0021,
11037 "reward": 1.8680272102355957,
11038 "reward_std": 0.02056352235376835,
11039 "rewards/accuracy_reward": 0.8680272102355957,
11040 "rewards/format_reward": 1.0,
11041 "step": 788
11042 },
11043 {
11044 "clip_ratio": 0.0,
11045 "completion_length": 108.87956237792969,
11046 "epoch": 9.068965517241379,
11047 "grad_norm": 2.6980150332453334,
11048 "kl": 0.04443359375,
11049 "learning_rate": 9.310344827586207e-08,
11050 "loss": 0.0018,
11051 "reward": 1.885411262512207,
11052 "reward_std": 0.021717345342040062,
11053 "rewards/accuracy_reward": 0.885411262512207,
11054 "rewards/format_reward": 1.0,
11055 "step": 789
11056 },
11057 {
11058 "clip_ratio": 0.0,
11059 "completion_length": 106.94010925292969,
11060 "epoch": 9.080459770114942,
11061 "grad_norm": 1.6808051261990247,
11062 "kl": 0.05615234375,
11063 "learning_rate": 9.195402298850574e-08,
11064 "loss": 0.0023,
11065 "reward": 1.901023268699646,
11066 "reward_std": 0.021362271159887314,
11067 "rewards/accuracy_reward": 0.9010233879089355,
11068 "rewards/format_reward": 1.0,
11069 "step": 790
11070 },
11071 {
11072 "clip_ratio": 0.0,
11073 "completion_length": 106.22721862792969,
11074 "epoch": 9.091954022988507,
11075 "grad_norm": 1.9651267648591069,
11076 "kl": 0.048828125,
11077 "learning_rate": 9.080459770114942e-08,
11078 "loss": 0.002,
11079 "reward": 1.8997548818588257,
11080 "reward_std": 0.018945304676890373,
11081 "rewards/accuracy_reward": 0.8997548818588257,
11082 "rewards/format_reward": 1.0,
11083 "step": 791
11084 },
11085 {
11086 "clip_ratio": 0.0,
11087 "completion_length": 107.98046875,
11088 "epoch": 9.10344827586207,
11089 "grad_norm": 1.5615310862319665,
11090 "kl": 0.0458984375,
11091 "learning_rate": 8.96551724137931e-08,
11092 "loss": 0.0019,
11093 "reward": 1.9096624851226807,
11094 "reward_std": 0.018772246316075325,
11095 "rewards/accuracy_reward": 0.9096624851226807,
11096 "rewards/format_reward": 1.0,
11097 "step": 792
11098 },
11099 {
11100 "clip_ratio": 0.0,
11101 "completion_length": 107.50456237792969,
11102 "epoch": 9.114942528735632,
11103 "grad_norm": 1.8984567348891395,
11104 "kl": 0.053466796875,
11105 "learning_rate": 8.850574712643679e-08,
11106 "loss": 0.0022,
11107 "reward": 1.8971011638641357,
11108 "reward_std": 0.02134164236485958,
11109 "rewards/accuracy_reward": 0.8971012830734253,
11110 "rewards/format_reward": 1.0,
11111 "step": 793
11112 },
11113 {
11114 "clip_ratio": 0.0,
11115 "completion_length": 107.59765625,
11116 "epoch": 9.126436781609195,
11117 "grad_norm": 1.57123496870828,
11118 "kl": 0.05126953125,
11119 "learning_rate": 8.735632183908045e-08,
11120 "loss": 0.0021,
11121 "reward": 1.8993648290634155,
11122 "reward_std": 0.02058519423007965,
11123 "rewards/accuracy_reward": 0.8993649482727051,
11124 "rewards/format_reward": 1.0,
11125 "step": 794
11126 },
11127 {
11128 "clip_ratio": 0.0,
11129 "completion_length": 109.01692962646484,
11130 "epoch": 9.137931034482758,
11131 "grad_norm": 2.1980038174190373,
11132 "kl": 0.038818359375,
11133 "learning_rate": 8.620689655172414e-08,
11134 "loss": 0.0016,
11135 "reward": 1.8727396726608276,
11136 "reward_std": 0.021485071629285812,
11137 "rewards/accuracy_reward": 0.8727396726608276,
11138 "rewards/format_reward": 1.0,
11139 "step": 795
11140 },
11141 {
11142 "clip_ratio": 0.0,
11143 "completion_length": 109.57682800292969,
11144 "epoch": 9.149425287356323,
11145 "grad_norm": 1.5298307460022529,
11146 "kl": 0.044921875,
11147 "learning_rate": 8.505747126436782e-08,
11148 "loss": 0.0019,
11149 "reward": 1.9137659072875977,
11150 "reward_std": 0.01766360178589821,
11151 "rewards/accuracy_reward": 0.9137659072875977,
11152 "rewards/format_reward": 1.0,
11153 "step": 796
11154 },
11155 {
11156 "clip_ratio": 0.0,
11157 "completion_length": 109.77474212646484,
11158 "epoch": 9.160919540229886,
11159 "grad_norm": 1.801682202262651,
11160 "kl": 0.04296875,
11161 "learning_rate": 8.39080459770115e-08,
11162 "loss": 0.0018,
11163 "reward": 1.8823485374450684,
11164 "reward_std": 0.023980390280485153,
11165 "rewards/accuracy_reward": 0.8829997777938843,
11166 "rewards/format_reward": 0.9993489980697632,
11167 "step": 797
11168 },
11169 {
11170 "clip_ratio": 0.0,
11171 "completion_length": 107.94596862792969,
11172 "epoch": 9.172413793103448,
11173 "grad_norm": 2.2238602851803284,
11174 "kl": 0.041015625,
11175 "learning_rate": 8.275862068965517e-08,
11176 "loss": 0.0017,
11177 "reward": 1.8901784420013428,
11178 "reward_std": 0.021750375628471375,
11179 "rewards/accuracy_reward": 0.8908295631408691,
11180 "rewards/format_reward": 0.9993489980697632,
11181 "step": 798
11182 },
11183 {
11184 "clip_ratio": 0.0,
11185 "completion_length": 108.58464050292969,
11186 "epoch": 9.183908045977011,
11187 "grad_norm": 3.448825022079825,
11188 "kl": 0.042724609375,
11189 "learning_rate": 8.160919540229885e-08,
11190 "loss": 0.0018,
11191 "reward": 1.902247428894043,
11192 "reward_std": 0.020935572683811188,
11193 "rewards/accuracy_reward": 0.9028984904289246,
11194 "rewards/format_reward": 0.9993489980697632,
11195 "step": 799
11196 },
11197 {
11198 "clip_ratio": 0.0,
11199 "completion_length": 109.63216400146484,
11200 "epoch": 9.195402298850574,
11201 "grad_norm": 1.7683408378496788,
11202 "kl": 0.041259765625,
11203 "learning_rate": 8.045977011494252e-08,
11204 "loss": 0.0017,
11205 "reward": 1.8915985822677612,
11206 "reward_std": 0.02043468877673149,
11207 "rewards/accuracy_reward": 0.891598641872406,
11208 "rewards/format_reward": 1.0,
11209 "step": 800
11210 },
11211 {
11212 "clip_ratio": 0.0,
11213 "completion_length": 106.3046875,
11214 "epoch": 9.206896551724139,
11215 "grad_norm": 3.4479135032415082,
11216 "kl": 0.0419921875,
11217 "learning_rate": 7.931034482758621e-08,
11218 "loss": 0.0018,
11219 "reward": 1.8980789184570312,
11220 "reward_std": 0.020334240049123764,
11221 "rewards/accuracy_reward": 0.8980789184570312,
11222 "rewards/format_reward": 1.0,
11223 "step": 801
11224 },
11225 {
11226 "clip_ratio": 0.0,
11227 "completion_length": 106.453125,
11228 "epoch": 9.218390804597702,
11229 "grad_norm": 2.8315233473682286,
11230 "kl": 0.0439453125,
11231 "learning_rate": 7.816091954022988e-08,
11232 "loss": 0.0018,
11233 "reward": 1.9106957912445068,
11234 "reward_std": 0.01965712383389473,
11235 "rewards/accuracy_reward": 0.9106957912445068,
11236 "rewards/format_reward": 1.0,
11237 "step": 802
11238 },
11239 {
11240 "clip_ratio": 0.0,
11241 "completion_length": 108.0078125,
11242 "epoch": 9.229885057471265,
11243 "grad_norm": 8.809973822334465,
11244 "kl": 0.04443359375,
11245 "learning_rate": 7.701149425287357e-08,
11246 "loss": 0.0018,
11247 "reward": 1.8973687887191772,
11248 "reward_std": 0.02227511629462242,
11249 "rewards/accuracy_reward": 0.8973686695098877,
11250 "rewards/format_reward": 1.0,
11251 "step": 803
11252 },
11253 {
11254 "clip_ratio": 0.0,
11255 "completion_length": 108.857421875,
11256 "epoch": 9.241379310344827,
11257 "grad_norm": 2.290511339554376,
11258 "kl": 0.048095703125,
11259 "learning_rate": 7.586206896551724e-08,
11260 "loss": 0.002,
11261 "reward": 1.926220417022705,
11262 "reward_std": 0.020036086440086365,
11263 "rewards/accuracy_reward": 0.9268714785575867,
11264 "rewards/format_reward": 0.9993489980697632,
11265 "step": 804
11266 },
11267 {
11268 "clip_ratio": 0.0,
11269 "completion_length": 106.24153900146484,
11270 "epoch": 9.25287356321839,
11271 "grad_norm": 10.259694200109895,
11272 "kl": 0.04541015625,
11273 "learning_rate": 7.471264367816092e-08,
11274 "loss": 0.0019,
11275 "reward": 1.9228681325912476,
11276 "reward_std": 0.02166038751602173,
11277 "rewards/accuracy_reward": 0.9228681325912476,
11278 "rewards/format_reward": 1.0,
11279 "step": 805
11280 },
11281 {
11282 "clip_ratio": 0.0,
11283 "completion_length": 107.47786712646484,
11284 "epoch": 9.264367816091955,
11285 "grad_norm": 1.888552839892748,
11286 "kl": 0.040771484375,
11287 "learning_rate": 7.35632183908046e-08,
11288 "loss": 0.0017,
11289 "reward": 1.905164122581482,
11290 "reward_std": 0.017032243311405182,
11291 "rewards/accuracy_reward": 0.9051642417907715,
11292 "rewards/format_reward": 1.0,
11293 "step": 806
11294 },
11295 {
11296 "clip_ratio": 0.0,
11297 "completion_length": 106.90560150146484,
11298 "epoch": 9.275862068965518,
11299 "grad_norm": 1.5511138521125212,
11300 "kl": 0.052001953125,
11301 "learning_rate": 7.241379310344827e-08,
11302 "loss": 0.0021,
11303 "reward": 1.912177562713623,
11304 "reward_std": 0.018603047356009483,
11305 "rewards/accuracy_reward": 0.912177562713623,
11306 "rewards/format_reward": 1.0,
11307 "step": 807
11308 },
11309 {
11310 "clip_ratio": 0.0,
11311 "completion_length": 108.09114837646484,
11312 "epoch": 9.28735632183908,
11313 "grad_norm": 2.2223963738104775,
11314 "kl": 0.04443359375,
11315 "learning_rate": 7.126436781609195e-08,
11316 "loss": 0.0018,
11317 "reward": 1.9001119136810303,
11318 "reward_std": 0.023462196812033653,
11319 "rewards/accuracy_reward": 0.9001118540763855,
11320 "rewards/format_reward": 1.0,
11321 "step": 808
11322 },
11323 {
11324 "clip_ratio": 0.0,
11325 "completion_length": 107.89388275146484,
11326 "epoch": 9.298850574712644,
11327 "grad_norm": 6.250219527341955,
11328 "kl": 0.046142578125,
11329 "learning_rate": 7.011494252873564e-08,
11330 "loss": 0.0019,
11331 "reward": 1.9060745239257812,
11332 "reward_std": 0.021167151629924774,
11333 "rewards/accuracy_reward": 0.9067255258560181,
11334 "rewards/format_reward": 0.9993489980697632,
11335 "step": 809
11336 },
11337 {
11338 "clip_ratio": 0.0,
11339 "completion_length": 107.06120300292969,
11340 "epoch": 9.310344827586206,
11341 "grad_norm": 3.0663601968471044,
11342 "kl": 0.04248046875,
11343 "learning_rate": 6.89655172413793e-08,
11344 "loss": 0.0018,
11345 "reward": 1.904802680015564,
11346 "reward_std": 0.02016662247478962,
11347 "rewards/accuracy_reward": 0.9054538011550903,
11348 "rewards/format_reward": 0.9993489980697632,
11349 "step": 810
11350 },
11351 {
11352 "clip_ratio": 0.0,
11353 "completion_length": 107.279296875,
11354 "epoch": 9.32183908045977,
11355 "grad_norm": 2.1266705913854707,
11356 "kl": 0.06591796875,
11357 "learning_rate": 6.781609195402299e-08,
11358 "loss": 0.0027,
11359 "reward": 1.8952583074569702,
11360 "reward_std": 0.023519501090049744,
11361 "rewards/accuracy_reward": 0.895909309387207,
11362 "rewards/format_reward": 0.9993489980697632,
11363 "step": 811
11364 },
11365 {
11366 "clip_ratio": 0.0,
11367 "completion_length": 107.70573425292969,
11368 "epoch": 9.333333333333334,
11369 "grad_norm": 1.2920255105728329,
11370 "kl": 0.041259765625,
11371 "learning_rate": 6.666666666666667e-08,
11372 "loss": 0.0017,
11373 "reward": 1.9121431112289429,
11374 "reward_std": 0.01818576082587242,
11375 "rewards/accuracy_reward": 0.9121431112289429,
11376 "rewards/format_reward": 1.0,
11377 "step": 812
11378 },
11379 {
11380 "clip_ratio": 0.0,
11381 "completion_length": 106.04948425292969,
11382 "epoch": 9.344827586206897,
11383 "grad_norm": 1.9698975522109214,
11384 "kl": 0.044189453125,
11385 "learning_rate": 6.551724137931034e-08,
11386 "loss": 0.0018,
11387 "reward": 1.9061200618743896,
11388 "reward_std": 0.019229721277952194,
11389 "rewards/accuracy_reward": 0.9067711234092712,
11390 "rewards/format_reward": 0.9993489980697632,
11391 "step": 813
11392 },
11393 {
11394 "clip_ratio": 0.0,
11395 "completion_length": 106.99479675292969,
11396 "epoch": 9.35632183908046,
11397 "grad_norm": 1.5466544483838331,
11398 "kl": 0.044677734375,
11399 "learning_rate": 6.436781609195402e-08,
11400 "loss": 0.0019,
11401 "reward": 1.8965463638305664,
11402 "reward_std": 0.026135286316275597,
11403 "rewards/accuracy_reward": 0.8978484869003296,
11404 "rewards/format_reward": 0.9986979365348816,
11405 "step": 814
11406 },
11407 {
11408 "clip_ratio": 0.0,
11409 "completion_length": 108.06771087646484,
11410 "epoch": 9.367816091954023,
11411 "grad_norm": 2.9008920163167327,
11412 "kl": 0.04345703125,
11413 "learning_rate": 6.321839080459771e-08,
11414 "loss": 0.0018,
11415 "reward": 1.8903131484985352,
11416 "reward_std": 0.020935822278261185,
11417 "rewards/accuracy_reward": 0.8903131484985352,
11418 "rewards/format_reward": 1.0,
11419 "step": 815
11420 },
11421 {
11422 "clip_ratio": 0.0,
11423 "completion_length": 107.97396087646484,
11424 "epoch": 9.379310344827585,
11425 "grad_norm": 2.2134917020578797,
11426 "kl": 0.043212890625,
11427 "learning_rate": 6.206896551724137e-08,
11428 "loss": 0.0018,
11429 "reward": 1.894513487815857,
11430 "reward_std": 0.019658654928207397,
11431 "rewards/accuracy_reward": 0.8945134878158569,
11432 "rewards/format_reward": 1.0,
11433 "step": 816
11434 },
11435 {
11436 "clip_ratio": 0.0,
11437 "completion_length": 107.47135925292969,
11438 "epoch": 9.39080459770115,
11439 "grad_norm": 2.0451132122580216,
11440 "kl": 0.044189453125,
11441 "learning_rate": 6.091954022988505e-08,
11442 "loss": 0.0019,
11443 "reward": 1.9205522537231445,
11444 "reward_std": 0.022365611046552658,
11445 "rewards/accuracy_reward": 0.9205522537231445,
11446 "rewards/format_reward": 1.0,
11447 "step": 817
11448 },
11449 {
11450 "clip_ratio": 0.0,
11451 "completion_length": 107.357421875,
11452 "epoch": 9.402298850574713,
11453 "grad_norm": 1.5413854537310678,
11454 "kl": 0.04345703125,
11455 "learning_rate": 5.977011494252873e-08,
11456 "loss": 0.0018,
11457 "reward": 1.906002402305603,
11458 "reward_std": 0.020225465297698975,
11459 "rewards/accuracy_reward": 0.9060024619102478,
11460 "rewards/format_reward": 1.0,
11461 "step": 818
11462 },
11463 {
11464 "clip_ratio": 0.0,
11465 "completion_length": 107.248046875,
11466 "epoch": 9.413793103448276,
11467 "grad_norm": 1.5942001444003233,
11468 "kl": 0.040771484375,
11469 "learning_rate": 5.862068965517241e-08,
11470 "loss": 0.0017,
11471 "reward": 1.9052398204803467,
11472 "reward_std": 0.018013250082731247,
11473 "rewards/accuracy_reward": 0.9052395820617676,
11474 "rewards/format_reward": 1.0,
11475 "step": 819
11476 },
11477 {
11478 "clip_ratio": 0.0,
11479 "completion_length": 107.83528900146484,
11480 "epoch": 9.425287356321839,
11481 "grad_norm": 2.321727658388936,
11482 "kl": 0.04248046875,
11483 "learning_rate": 5.747126436781609e-08,
11484 "loss": 0.0018,
11485 "reward": 1.9112927913665771,
11486 "reward_std": 0.020977940410375595,
11487 "rewards/accuracy_reward": 0.911943793296814,
11488 "rewards/format_reward": 0.9993489980697632,
11489 "step": 820
11490 },
11491 {
11492 "clip_ratio": 0.0,
11493 "completion_length": 106.5625,
11494 "epoch": 9.436781609195402,
11495 "grad_norm": 3.718818284558716,
11496 "kl": 0.0458984375,
11497 "learning_rate": 5.6321839080459764e-08,
11498 "loss": 0.002,
11499 "reward": 1.9173364639282227,
11500 "reward_std": 0.01930234208703041,
11501 "rewards/accuracy_reward": 0.9173363447189331,
11502 "rewards/format_reward": 1.0,
11503 "step": 821
11504 },
11505 {
11506 "clip_ratio": 0.0,
11507 "completion_length": 108.20833587646484,
11508 "epoch": 9.448275862068966,
11509 "grad_norm": 1.7447738271778865,
11510 "kl": 0.03857421875,
11511 "learning_rate": 5.517241379310345e-08,
11512 "loss": 0.0016,
11513 "reward": 1.905761480331421,
11514 "reward_std": 0.017823033034801483,
11515 "rewards/accuracy_reward": 0.9057614207267761,
11516 "rewards/format_reward": 1.0,
11517 "step": 822
11518 },
11519 {
11520 "clip_ratio": 0.0,
11521 "completion_length": 109.22005462646484,
11522 "epoch": 9.459770114942529,
11523 "grad_norm": 5.556438363038334,
11524 "kl": 0.044921875,
11525 "learning_rate": 5.402298850574712e-08,
11526 "loss": 0.0019,
11527 "reward": 1.912406325340271,
11528 "reward_std": 0.017947331070899963,
11529 "rewards/accuracy_reward": 0.912406325340271,
11530 "rewards/format_reward": 1.0,
11531 "step": 823
11532 },
11533 {
11534 "clip_ratio": 0.0,
11535 "completion_length": 106.21354675292969,
11536 "epoch": 9.471264367816092,
11537 "grad_norm": 1.564280665225921,
11538 "kl": 0.04296875,
11539 "learning_rate": 5.28735632183908e-08,
11540 "loss": 0.0018,
11541 "reward": 1.9196040630340576,
11542 "reward_std": 0.018538126721978188,
11543 "rewards/accuracy_reward": 0.9196040034294128,
11544 "rewards/format_reward": 1.0,
11545 "step": 824
11546 },
11547 {
11548 "clip_ratio": 0.0,
11549 "completion_length": 107.55729675292969,
11550 "epoch": 9.482758620689655,
11551 "grad_norm": 2.4144959847961758,
11552 "kl": 0.0439453125,
11553 "learning_rate": 5.172413793103448e-08,
11554 "loss": 0.0019,
11555 "reward": 1.9196009635925293,
11556 "reward_std": 0.016086673364043236,
11557 "rewards/accuracy_reward": 0.9196010828018188,
11558 "rewards/format_reward": 1.0,
11559 "step": 825
11560 },
11561 {
11562 "clip_ratio": 0.0,
11563 "completion_length": 105.38021087646484,
11564 "epoch": 9.494252873563218,
11565 "grad_norm": 4.197005794788358,
11566 "kl": 0.04345703125,
11567 "learning_rate": 5.057471264367816e-08,
11568 "loss": 0.0019,
11569 "reward": 1.898109793663025,
11570 "reward_std": 0.021654874086380005,
11571 "rewards/accuracy_reward": 0.8987607955932617,
11572 "rewards/format_reward": 0.9993489980697632,
11573 "step": 826
11574 },
11575 {
11576 "clip_ratio": 0.0,
11577 "completion_length": 105.59310150146484,
11578 "epoch": 9.505747126436782,
11579 "grad_norm": 1.8296996407650712,
11580 "kl": 0.04345703125,
11581 "learning_rate": 4.9425287356321836e-08,
11582 "loss": 0.0018,
11583 "reward": 1.9078954458236694,
11584 "reward_std": 0.018013805150985718,
11585 "rewards/accuracy_reward": 0.907895565032959,
11586 "rewards/format_reward": 1.0,
11587 "step": 827
11588 },
11589 {
11590 "clip_ratio": 0.0,
11591 "completion_length": 107.60677337646484,
11592 "epoch": 9.517241379310345,
11593 "grad_norm": 1.6814305912226317,
11594 "kl": 0.041259765625,
11595 "learning_rate": 4.827586206896551e-08,
11596 "loss": 0.0017,
11597 "reward": 1.8963592052459717,
11598 "reward_std": 0.022151313722133636,
11599 "rewards/accuracy_reward": 0.8963589668273926,
11600 "rewards/format_reward": 1.0,
11601 "step": 828
11602 },
11603 {
11604 "clip_ratio": 0.0,
11605 "completion_length": 107.73567962646484,
11606 "epoch": 9.528735632183908,
11607 "grad_norm": 1.7924163077233033,
11608 "kl": 0.04296875,
11609 "learning_rate": 4.7126436781609196e-08,
11610 "loss": 0.0018,
11611 "reward": 1.9191563129425049,
11612 "reward_std": 0.01933225616812706,
11613 "rewards/accuracy_reward": 0.9198073744773865,
11614 "rewards/format_reward": 0.9993489980697632,
11615 "step": 829
11616 },
11617 {
11618 "clip_ratio": 0.0,
11619 "completion_length": 106.1171875,
11620 "epoch": 9.540229885057471,
11621 "grad_norm": 2.5340920833447895,
11622 "kl": 0.0439453125,
11623 "learning_rate": 4.597701149425287e-08,
11624 "loss": 0.0018,
11625 "reward": 1.89328932762146,
11626 "reward_std": 0.01949036866426468,
11627 "rewards/accuracy_reward": 0.8939403295516968,
11628 "rewards/format_reward": 0.9993489980697632,
11629 "step": 830
11630 },
11631 {
11632 "clip_ratio": 0.0,
11633 "completion_length": 105.53255462646484,
11634 "epoch": 9.551724137931034,
11635 "grad_norm": 2.483443690558432,
11636 "kl": 0.052490234375,
11637 "learning_rate": 4.482758620689655e-08,
11638 "loss": 0.0022,
11639 "reward": 1.8969571590423584,
11640 "reward_std": 0.017558250576257706,
11641 "rewards/accuracy_reward": 0.8969571590423584,
11642 "rewards/format_reward": 1.0,
11643 "step": 831
11644 },
11645 {
11646 "clip_ratio": 0.0,
11647 "completion_length": 105.533203125,
11648 "epoch": 9.563218390804598,
11649 "grad_norm": 1.291578178291959,
11650 "kl": 0.045166015625,
11651 "learning_rate": 4.3678160919540225e-08,
11652 "loss": 0.0019,
11653 "reward": 1.8948633670806885,
11654 "reward_std": 0.021366355940699577,
11655 "rewards/accuracy_reward": 0.8948633074760437,
11656 "rewards/format_reward": 1.0,
11657 "step": 832
11658 },
11659 {
11660 "clip_ratio": 0.0,
11661 "completion_length": 106.82487487792969,
11662 "epoch": 9.574712643678161,
11663 "grad_norm": 2.7537847562643374,
11664 "kl": 0.0419921875,
11665 "learning_rate": 4.252873563218391e-08,
11666 "loss": 0.0017,
11667 "reward": 1.9019429683685303,
11668 "reward_std": 0.018435677513480186,
11669 "rewards/accuracy_reward": 0.9019430875778198,
11670 "rewards/format_reward": 1.0,
11671 "step": 833
11672 },
11673 {
11674 "clip_ratio": 0.0,
11675 "completion_length": 105.77799987792969,
11676 "epoch": 9.586206896551724,
11677 "grad_norm": 2.2181518752294624,
11678 "kl": 0.04443359375,
11679 "learning_rate": 4.1379310344827585e-08,
11680 "loss": 0.0018,
11681 "reward": 1.8903684616088867,
11682 "reward_std": 0.018676333129405975,
11683 "rewards/accuracy_reward": 0.8903685808181763,
11684 "rewards/format_reward": 1.0,
11685 "step": 834
11686 },
11687 {
11688 "clip_ratio": 0.0,
11689 "completion_length": 107.46224212646484,
11690 "epoch": 9.597701149425287,
11691 "grad_norm": 1.756566376667342,
11692 "kl": 0.046875,
11693 "learning_rate": 4.022988505747126e-08,
11694 "loss": 0.0019,
11695 "reward": 1.8650926351547241,
11696 "reward_std": 0.02378205582499504,
11697 "rewards/accuracy_reward": 0.8657435178756714,
11698 "rewards/format_reward": 0.9993489980697632,
11699 "step": 835
11700 },
11701 {
11702 "clip_ratio": 0.0,
11703 "completion_length": 106.02474212646484,
11704 "epoch": 9.60919540229885,
11705 "grad_norm": 1.99335590706468,
11706 "kl": 0.04345703125,
11707 "learning_rate": 3.908045977011494e-08,
11708 "loss": 0.0018,
11709 "reward": 1.8908905982971191,
11710 "reward_std": 0.01960897445678711,
11711 "rewards/accuracy_reward": 0.8915416598320007,
11712 "rewards/format_reward": 0.9993489980697632,
11713 "step": 836
11714 },
11715 {
11716 "clip_ratio": 0.0,
11717 "completion_length": 106.8046875,
11718 "epoch": 9.620689655172415,
11719 "grad_norm": 4.785304870304732,
11720 "kl": 0.04345703125,
11721 "learning_rate": 3.793103448275862e-08,
11722 "loss": 0.0018,
11723 "reward": 1.909947395324707,
11724 "reward_std": 0.0192283745855093,
11725 "rewards/accuracy_reward": 0.9099475741386414,
11726 "rewards/format_reward": 1.0,
11727 "step": 837
11728 },
11729 {
11730 "clip_ratio": 0.0,
11731 "completion_length": 107.50456237792969,
11732 "epoch": 9.632183908045977,
11733 "grad_norm": 1.6423863350528298,
11734 "kl": 0.04736328125,
11735 "learning_rate": 3.67816091954023e-08,
11736 "loss": 0.0019,
11737 "reward": 1.9007571935653687,
11738 "reward_std": 0.017737818881869316,
11739 "rewards/accuracy_reward": 0.9007573127746582,
11740 "rewards/format_reward": 1.0,
11741 "step": 838
11742 },
11743 {
11744 "clip_ratio": 0.0,
11745 "completion_length": 107.982421875,
11746 "epoch": 9.64367816091954,
11747 "grad_norm": 2.049656890092837,
11748 "kl": 0.043701171875,
11749 "learning_rate": 3.5632183908045974e-08,
11750 "loss": 0.0018,
11751 "reward": 1.9175758361816406,
11752 "reward_std": 0.01626306213438511,
11753 "rewards/accuracy_reward": 0.9175758361816406,
11754 "rewards/format_reward": 1.0,
11755 "step": 839
11756 },
11757 {
11758 "clip_ratio": 0.0,
11759 "completion_length": 106.10417175292969,
11760 "epoch": 9.655172413793103,
11761 "grad_norm": 1.8848940408267045,
11762 "kl": 0.045654296875,
11763 "learning_rate": 3.448275862068965e-08,
11764 "loss": 0.0019,
11765 "reward": 1.9053196907043457,
11766 "reward_std": 0.022480791434645653,
11767 "rewards/accuracy_reward": 0.9059707522392273,
11768 "rewards/format_reward": 0.9993489980697632,
11769 "step": 840
11770 },
11771 {
11772 "clip_ratio": 0.0,
11773 "completion_length": 106.12890625,
11774 "epoch": 9.666666666666666,
11775 "grad_norm": 3.234936396417619,
11776 "kl": 0.04736328125,
11777 "learning_rate": 3.3333333333333334e-08,
11778 "loss": 0.0019,
11779 "reward": 1.905747413635254,
11780 "reward_std": 0.02039037086069584,
11781 "rewards/accuracy_reward": 0.9057474136352539,
11782 "rewards/format_reward": 1.0,
11783 "step": 841
11784 },
11785 {
11786 "clip_ratio": 0.0,
11787 "completion_length": 107.443359375,
11788 "epoch": 9.678160919540229,
11789 "grad_norm": 1.7322466630483822,
11790 "kl": 0.04443359375,
11791 "learning_rate": 3.218390804597701e-08,
11792 "loss": 0.0018,
11793 "reward": 1.906328558921814,
11794 "reward_std": 0.022260412573814392,
11795 "rewards/accuracy_reward": 0.9076308012008667,
11796 "rewards/format_reward": 0.9986979365348816,
11797 "step": 842
11798 },
11799 {
11800 "clip_ratio": 0.0,
11801 "completion_length": 105.857421875,
11802 "epoch": 9.689655172413794,
11803 "grad_norm": 1.7097166944224358,
11804 "kl": 0.05224609375,
11805 "learning_rate": 3.103448275862069e-08,
11806 "loss": 0.0022,
11807 "reward": 1.9105417728424072,
11808 "reward_std": 0.017679734155535698,
11809 "rewards/accuracy_reward": 0.9105417728424072,
11810 "rewards/format_reward": 1.0,
11811 "step": 843
11812 },
11813 {
11814 "clip_ratio": 0.0,
11815 "completion_length": 104.515625,
11816 "epoch": 9.701149425287356,
11817 "grad_norm": 1.884287958847617,
11818 "kl": 0.047119140625,
11819 "learning_rate": 2.9885057471264364e-08,
11820 "loss": 0.002,
11821 "reward": 1.9169318675994873,
11822 "reward_std": 0.020296216011047363,
11823 "rewards/accuracy_reward": 0.9182338714599609,
11824 "rewards/format_reward": 0.9986979365348816,
11825 "step": 844
11826 },
11827 {
11828 "clip_ratio": 0.0,
11829 "completion_length": 106.80143737792969,
11830 "epoch": 9.71264367816092,
11831 "grad_norm": 1.9163910071570234,
11832 "kl": 0.0458984375,
11833 "learning_rate": 2.8735632183908043e-08,
11834 "loss": 0.002,
11835 "reward": 1.9143104553222656,
11836 "reward_std": 0.019828440621495247,
11837 "rewards/accuracy_reward": 0.9149616956710815,
11838 "rewards/format_reward": 0.9993489980697632,
11839 "step": 845
11840 },
11841 {
11842 "clip_ratio": 0.0,
11843 "completion_length": 106.53190612792969,
11844 "epoch": 9.724137931034482,
11845 "grad_norm": 1.73873304639946,
11846 "kl": 0.0439453125,
11847 "learning_rate": 2.7586206896551723e-08,
11848 "loss": 0.0019,
11849 "reward": 1.9067649841308594,
11850 "reward_std": 0.018557554110884666,
11851 "rewards/accuracy_reward": 0.9067651033401489,
11852 "rewards/format_reward": 1.0,
11853 "step": 846
11854 },
11855 {
11856 "clip_ratio": 0.0,
11857 "completion_length": 104.27604675292969,
11858 "epoch": 9.735632183908045,
11859 "grad_norm": 2.3263503442259807,
11860 "kl": 0.04345703125,
11861 "learning_rate": 2.64367816091954e-08,
11862 "loss": 0.0018,
11863 "reward": 1.9176081418991089,
11864 "reward_std": 0.018732137978076935,
11865 "rewards/accuracy_reward": 0.9176082015037537,
11866 "rewards/format_reward": 1.0,
11867 "step": 847
11868 },
11869 {
11870 "clip_ratio": 0.0,
11871 "completion_length": 105.03125,
11872 "epoch": 9.74712643678161,
11873 "grad_norm": 1.495014646664977,
11874 "kl": 0.04931640625,
11875 "learning_rate": 2.528735632183908e-08,
11876 "loss": 0.0021,
11877 "reward": 1.914261817932129,
11878 "reward_std": 0.01746547594666481,
11879 "rewards/accuracy_reward": 0.9142619371414185,
11880 "rewards/format_reward": 1.0,
11881 "step": 848
11882 },
11883 {
11884 "clip_ratio": 0.0,
11885 "completion_length": 107.43099212646484,
11886 "epoch": 9.758620689655173,
11887 "grad_norm": 2.129213121110771,
11888 "kl": 0.044189453125,
11889 "learning_rate": 2.4137931034482756e-08,
11890 "loss": 0.0018,
11891 "reward": 1.9081742763519287,
11892 "reward_std": 0.02037705108523369,
11893 "rewards/accuracy_reward": 0.9088253378868103,
11894 "rewards/format_reward": 0.9993489980697632,
11895 "step": 849
11896 },
11897 {
11898 "clip_ratio": 0.0,
11899 "completion_length": 104.64453125,
11900 "epoch": 9.770114942528735,
11901 "grad_norm": 1.5696963468743126,
11902 "kl": 0.046142578125,
11903 "learning_rate": 2.2988505747126436e-08,
11904 "loss": 0.0019,
11905 "reward": 1.9174790382385254,
11906 "reward_std": 0.017753083258867264,
11907 "rewards/accuracy_reward": 0.9174790382385254,
11908 "rewards/format_reward": 1.0,
11909 "step": 850
11910 },
11911 {
11912 "clip_ratio": 0.0,
11913 "completion_length": 106.01302337646484,
11914 "epoch": 9.781609195402298,
11915 "grad_norm": 2.3977935164505286,
11916 "kl": 0.050537109375,
11917 "learning_rate": 2.1839080459770113e-08,
11918 "loss": 0.0021,
11919 "reward": 1.8931522369384766,
11920 "reward_std": 0.019253626465797424,
11921 "rewards/accuracy_reward": 0.8931522369384766,
11922 "rewards/format_reward": 1.0,
11923 "step": 851
11924 },
11925 {
11926 "clip_ratio": 0.0,
11927 "completion_length": 104.55339050292969,
11928 "epoch": 9.793103448275861,
11929 "grad_norm": 2.3423979754131383,
11930 "kl": 0.04541015625,
11931 "learning_rate": 2.0689655172413793e-08,
11932 "loss": 0.0019,
11933 "reward": 1.9183381795883179,
11934 "reward_std": 0.019186435267329216,
11935 "rewards/accuracy_reward": 0.9183380603790283,
11936 "rewards/format_reward": 1.0,
11937 "step": 852
11938 },
11939 {
11940 "clip_ratio": 0.0,
11941 "completion_length": 106.85417175292969,
11942 "epoch": 9.804597701149426,
11943 "grad_norm": 2.073174318294833,
11944 "kl": 0.046630859375,
11945 "learning_rate": 1.954022988505747e-08,
11946 "loss": 0.0019,
11947 "reward": 1.890181303024292,
11948 "reward_std": 0.018408436328172684,
11949 "rewards/accuracy_reward": 0.8901811838150024,
11950 "rewards/format_reward": 1.0,
11951 "step": 853
11952 },
11953 {
11954 "clip_ratio": 0.0,
11955 "completion_length": 106.06640625,
11956 "epoch": 9.816091954022989,
11957 "grad_norm": 1.9299645375268184,
11958 "kl": 0.05078125,
11959 "learning_rate": 1.839080459770115e-08,
11960 "loss": 0.0021,
11961 "reward": 1.9052647352218628,
11962 "reward_std": 0.017054375261068344,
11963 "rewards/accuracy_reward": 0.9052648544311523,
11964 "rewards/format_reward": 1.0,
11965 "step": 854
11966 },
11967 {
11968 "clip_ratio": 0.0,
11969 "completion_length": 106.09896087646484,
11970 "epoch": 9.827586206896552,
11971 "grad_norm": 1.4804834876534272,
11972 "kl": 0.045166015625,
11973 "learning_rate": 1.7241379310344825e-08,
11974 "loss": 0.0019,
11975 "reward": 1.9028480052947998,
11976 "reward_std": 0.017569242045283318,
11977 "rewards/accuracy_reward": 0.9028478860855103,
11978 "rewards/format_reward": 1.0,
11979 "step": 855
11980 },
11981 {
11982 "clip_ratio": 0.0,
11983 "completion_length": 106.41862487792969,
11984 "epoch": 9.839080459770114,
11985 "grad_norm": 3.920206306053486,
11986 "kl": 0.049560546875,
11987 "learning_rate": 1.6091954022988505e-08,
11988 "loss": 0.0021,
11989 "reward": 1.9126800298690796,
11990 "reward_std": 0.018128346651792526,
11991 "rewards/accuracy_reward": 0.913331151008606,
11992 "rewards/format_reward": 0.9993489980697632,
11993 "step": 856
11994 },
11995 {
11996 "clip_ratio": 0.0,
11997 "completion_length": 105.01432800292969,
11998 "epoch": 9.850574712643677,
11999 "grad_norm": 3.684344553078877,
12000 "kl": 0.046875,
12001 "learning_rate": 1.4942528735632182e-08,
12002 "loss": 0.0019,
12003 "reward": 1.8925762176513672,
12004 "reward_std": 0.021131232380867004,
12005 "rewards/accuracy_reward": 0.8925762176513672,
12006 "rewards/format_reward": 1.0,
12007 "step": 857
12008 },
12009 {
12010 "clip_ratio": 0.0,
12011 "completion_length": 106.326171875,
12012 "epoch": 9.862068965517242,
12013 "grad_norm": 1.3668989587295126,
12014 "kl": 0.04833984375,
12015 "learning_rate": 1.3793103448275862e-08,
12016 "loss": 0.002,
12017 "reward": 1.8880128860473633,
12018 "reward_std": 0.020463991910219193,
12019 "rewards/accuracy_reward": 0.8880130648612976,
12020 "rewards/format_reward": 1.0,
12021 "step": 858
12022 },
12023 {
12024 "clip_ratio": 0.0,
12025 "completion_length": 106.76692962646484,
12026 "epoch": 9.873563218390805,
12027 "grad_norm": 2.38568885867465,
12028 "kl": 0.045166015625,
12029 "learning_rate": 1.264367816091954e-08,
12030 "loss": 0.0019,
12031 "reward": 1.9086008071899414,
12032 "reward_std": 0.020096510648727417,
12033 "rewards/accuracy_reward": 0.9086008071899414,
12034 "rewards/format_reward": 1.0,
12035 "step": 859
12036 },
12037 {
12038 "clip_ratio": 0.0,
12039 "completion_length": 106.86198425292969,
12040 "epoch": 9.885057471264368,
12041 "grad_norm": 4.216482284140485,
12042 "kl": 0.1513671875,
12043 "learning_rate": 1.1494252873563218e-08,
12044 "loss": 0.0061,
12045 "reward": 1.9235996007919312,
12046 "reward_std": 0.019205166026949883,
12047 "rewards/accuracy_reward": 0.9242507815361023,
12048 "rewards/format_reward": 0.9993489980697632,
12049 "step": 860
12050 },
12051 {
12052 "clip_ratio": 0.0,
12053 "completion_length": 105.77409362792969,
12054 "epoch": 9.89655172413793,
12055 "grad_norm": 4.8514292391747675,
12056 "kl": 0.0498046875,
12057 "learning_rate": 1.0344827586206896e-08,
12058 "loss": 0.0021,
12059 "reward": 1.9049177169799805,
12060 "reward_std": 0.017066650092601776,
12061 "rewards/accuracy_reward": 0.90491783618927,
12062 "rewards/format_reward": 1.0,
12063 "step": 861
12064 },
12065 {
12066 "clip_ratio": 0.0,
12067 "completion_length": 104.80599212646484,
12068 "epoch": 9.908045977011493,
12069 "grad_norm": 1.450621393155795,
12070 "kl": 0.043212890625,
12071 "learning_rate": 9.195402298850574e-09,
12072 "loss": 0.0019,
12073 "reward": 1.9170316457748413,
12074 "reward_std": 0.016973568126559258,
12075 "rewards/accuracy_reward": 0.9170317649841309,
12076 "rewards/format_reward": 1.0,
12077 "step": 862
12078 },
12079 {
12080 "clip_ratio": 0.0,
12081 "completion_length": 106.71614837646484,
12082 "epoch": 9.919540229885058,
12083 "grad_norm": 2.5531007269256762,
12084 "kl": 0.04931640625,
12085 "learning_rate": 8.045977011494253e-09,
12086 "loss": 0.002,
12087 "reward": 1.9102344512939453,
12088 "reward_std": 0.0186156053096056,
12089 "rewards/accuracy_reward": 0.9102343916893005,
12090 "rewards/format_reward": 1.0,
12091 "step": 863
12092 },
12093 {
12094 "clip_ratio": 0.0,
12095 "completion_length": 107.51432800292969,
12096 "epoch": 9.931034482758621,
12097 "grad_norm": 1.7598054384403226,
12098 "kl": 0.044921875,
12099 "learning_rate": 6.896551724137931e-09,
12100 "loss": 0.0019,
12101 "reward": 1.8904626369476318,
12102 "reward_std": 0.022165637463331223,
12103 "rewards/accuracy_reward": 0.8911136388778687,
12104 "rewards/format_reward": 0.9993489980697632,
12105 "step": 864
12106 },
12107 {
12108 "clip_ratio": 0.0,
12109 "completion_length": 107.57292175292969,
12110 "epoch": 9.942528735632184,
12111 "grad_norm": 341994.5350692805,
12112 "kl": 3088.0,
12113 "learning_rate": 5.747126436781609e-09,
12114 "loss": 123.6946,
12115 "reward": 1.9070371389389038,
12116 "reward_std": 0.020475659519433975,
12117 "rewards/accuracy_reward": 0.9076882600784302,
12118 "rewards/format_reward": 0.9993489980697632,
12119 "step": 865
12120 },
12121 {
12122 "clip_ratio": 0.0,
12123 "completion_length": 104.62956237792969,
12124 "epoch": 9.954022988505747,
12125 "grad_norm": 18.068040151493758,
12126 "kl": 0.052001953125,
12127 "learning_rate": 4.597701149425287e-09,
12128 "loss": 0.0022,
12129 "reward": 1.910335898399353,
12130 "reward_std": 0.018777839839458466,
12131 "rewards/accuracy_reward": 0.9103360176086426,
12132 "rewards/format_reward": 1.0,
12133 "step": 866
12134 },
12135 {
12136 "clip_ratio": 0.0,
12137 "completion_length": 106.146484375,
12138 "epoch": 9.96551724137931,
12139 "grad_norm": 4.0533200337142565,
12140 "kl": 0.046142578125,
12141 "learning_rate": 3.4482758620689654e-09,
12142 "loss": 0.0019,
12143 "reward": 1.9074934720993042,
12144 "reward_std": 0.02056286484003067,
12145 "rewards/accuracy_reward": 0.9074934720993042,
12146 "rewards/format_reward": 1.0,
12147 "step": 867
12148 },
12149 {
12150 "clip_ratio": 0.0,
12151 "completion_length": 105.169921875,
12152 "epoch": 9.977011494252874,
12153 "grad_norm": 2.988025803486446,
12154 "kl": 0.04833984375,
12155 "learning_rate": 2.2988505747126436e-09,
12156 "loss": 0.002,
12157 "reward": 1.9082448482513428,
12158 "reward_std": 0.017992818728089333,
12159 "rewards/accuracy_reward": 0.908244788646698,
12160 "rewards/format_reward": 1.0,
12161 "step": 868
12162 },
12163 {
12164 "clip_ratio": 0.0,
12165 "completion_length": 105.54622650146484,
12166 "epoch": 9.988505747126437,
12167 "grad_norm": 1.9804111140556449,
12168 "kl": 0.044677734375,
12169 "learning_rate": 1.1494252873563218e-09,
12170 "loss": 0.0018,
12171 "reward": 1.898494005203247,
12172 "reward_std": 0.01871921308338642,
12173 "rewards/accuracy_reward": 0.8984940052032471,
12174 "rewards/format_reward": 1.0,
12175 "step": 869
12176 },
12177 {
12178 "clip_ratio": 0.0,
12179 "completion_length": 98.9199447631836,
12180 "epoch": 10.0,
12181 "grad_norm": 3.2087576817168033,
12182 "kl": 0.041015625,
12183 "learning_rate": 0.0,
12184 "loss": 0.0017,
12185 "reward": 1.9117414951324463,
12186 "reward_std": 0.016582123935222626,
12187 "rewards/accuracy_reward": 0.9117417335510254,
12188 "rewards/format_reward": 1.0,
12189 "step": 870
12190 }
12191 ],
12192 "logging_steps": 1.0,
12193 "max_steps": 870,
12194 "num_input_tokens_seen": 0,
12195 "num_train_epochs": 10,
12196 "save_steps": 50,
12197 "stateful_callbacks": {
12198 "TrainerControl": {
12199 "args": {
12200 "should_epoch_stop": false,
12201 "should_evaluate": false,
12202 "should_log": false,
12203 "should_save": true,
12204 "should_training_stop": true
12205 },
12206 "attributes": {}
12207 }
12208 },
12209 "total_flos": 0.0,
12210 "train_batch_size": 48,
12211 "trial_name": null,
12212 "trial_params": null
12213 }
12214