trainer_state.json
23.5 KB · 1179 lines · json Raw
1 {
2 "best_metric": 0.03933868557214737,
3 "best_model_checkpoint": "./rorshark_outputs/checkpoint-1840",
4 "epoch": 5.0,
5 "eval_steps": 500,
6 "global_step": 1840,
7 "is_hyper_param_search": false,
8 "is_local_process_zero": true,
9 "is_world_process_zero": true,
10 "log_history": [
11 {
12 "epoch": 0.03,
13 "learning_rate": 1.989130434782609e-05,
14 "loss": 0.5675,
15 "step": 10
16 },
17 {
18 "epoch": 0.05,
19 "learning_rate": 1.9782608695652176e-05,
20 "loss": 0.4112,
21 "step": 20
22 },
23 {
24 "epoch": 0.08,
25 "learning_rate": 1.9673913043478263e-05,
26 "loss": 0.3002,
27 "step": 30
28 },
29 {
30 "epoch": 0.11,
31 "learning_rate": 1.956521739130435e-05,
32 "loss": 0.3774,
33 "step": 40
34 },
35 {
36 "epoch": 0.14,
37 "learning_rate": 1.9456521739130436e-05,
38 "loss": 0.3295,
39 "step": 50
40 },
41 {
42 "epoch": 0.16,
43 "learning_rate": 1.9347826086956523e-05,
44 "loss": 0.3067,
45 "step": 60
46 },
47 {
48 "epoch": 0.19,
49 "learning_rate": 1.923913043478261e-05,
50 "loss": 0.2457,
51 "step": 70
52 },
53 {
54 "epoch": 0.22,
55 "learning_rate": 1.9130434782608697e-05,
56 "loss": 0.3318,
57 "step": 80
58 },
59 {
60 "epoch": 0.24,
61 "learning_rate": 1.9021739130434784e-05,
62 "loss": 0.1932,
63 "step": 90
64 },
65 {
66 "epoch": 0.27,
67 "learning_rate": 1.891304347826087e-05,
68 "loss": 0.1948,
69 "step": 100
70 },
71 {
72 "epoch": 0.3,
73 "learning_rate": 1.8804347826086958e-05,
74 "loss": 0.2475,
75 "step": 110
76 },
77 {
78 "epoch": 0.33,
79 "learning_rate": 1.8695652173913045e-05,
80 "loss": 0.1432,
81 "step": 120
82 },
83 {
84 "epoch": 0.35,
85 "learning_rate": 1.8586956521739132e-05,
86 "loss": 0.2069,
87 "step": 130
88 },
89 {
90 "epoch": 0.38,
91 "learning_rate": 1.847826086956522e-05,
92 "loss": 0.1986,
93 "step": 140
94 },
95 {
96 "epoch": 0.41,
97 "learning_rate": 1.8369565217391306e-05,
98 "loss": 0.2156,
99 "step": 150
100 },
101 {
102 "epoch": 0.43,
103 "learning_rate": 1.8260869565217393e-05,
104 "loss": 0.1187,
105 "step": 160
106 },
107 {
108 "epoch": 0.46,
109 "learning_rate": 1.815217391304348e-05,
110 "loss": 0.1192,
111 "step": 170
112 },
113 {
114 "epoch": 0.49,
115 "learning_rate": 1.8043478260869567e-05,
116 "loss": 0.1748,
117 "step": 180
118 },
119 {
120 "epoch": 0.52,
121 "learning_rate": 1.7934782608695654e-05,
122 "loss": 0.0779,
123 "step": 190
124 },
125 {
126 "epoch": 0.54,
127 "learning_rate": 1.782608695652174e-05,
128 "loss": 0.1075,
129 "step": 200
130 },
131 {
132 "epoch": 0.57,
133 "learning_rate": 1.7717391304347828e-05,
134 "loss": 0.1298,
135 "step": 210
136 },
137 {
138 "epoch": 0.6,
139 "learning_rate": 1.7608695652173915e-05,
140 "loss": 0.0728,
141 "step": 220
142 },
143 {
144 "epoch": 0.62,
145 "learning_rate": 1.7500000000000002e-05,
146 "loss": 0.1189,
147 "step": 230
148 },
149 {
150 "epoch": 0.65,
151 "learning_rate": 1.739130434782609e-05,
152 "loss": 0.1102,
153 "step": 240
154 },
155 {
156 "epoch": 0.68,
157 "learning_rate": 1.7282608695652176e-05,
158 "loss": 0.1183,
159 "step": 250
160 },
161 {
162 "epoch": 0.71,
163 "learning_rate": 1.7173913043478263e-05,
164 "loss": 0.3006,
165 "step": 260
166 },
167 {
168 "epoch": 0.73,
169 "learning_rate": 1.706521739130435e-05,
170 "loss": 0.1408,
171 "step": 270
172 },
173 {
174 "epoch": 0.76,
175 "learning_rate": 1.6956521739130437e-05,
176 "loss": 0.141,
177 "step": 280
178 },
179 {
180 "epoch": 0.79,
181 "learning_rate": 1.6847826086956524e-05,
182 "loss": 0.1208,
183 "step": 290
184 },
185 {
186 "epoch": 0.82,
187 "learning_rate": 1.673913043478261e-05,
188 "loss": 0.1004,
189 "step": 300
190 },
191 {
192 "epoch": 0.84,
193 "learning_rate": 1.6630434782608698e-05,
194 "loss": 0.206,
195 "step": 310
196 },
197 {
198 "epoch": 0.87,
199 "learning_rate": 1.6521739130434785e-05,
200 "loss": 0.12,
201 "step": 320
202 },
203 {
204 "epoch": 0.9,
205 "learning_rate": 1.641304347826087e-05,
206 "loss": 0.0705,
207 "step": 330
208 },
209 {
210 "epoch": 0.92,
211 "learning_rate": 1.630434782608696e-05,
212 "loss": 0.1018,
213 "step": 340
214 },
215 {
216 "epoch": 0.95,
217 "learning_rate": 1.6195652173913045e-05,
218 "loss": 0.1501,
219 "step": 350
220 },
221 {
222 "epoch": 0.98,
223 "learning_rate": 1.6086956521739132e-05,
224 "loss": 0.0597,
225 "step": 360
226 },
227 {
228 "epoch": 1.0,
229 "eval_accuracy": 0.9865125240847784,
230 "eval_loss": 0.05456383526325226,
231 "eval_runtime": 6.5116,
232 "eval_samples_per_second": 79.704,
233 "eval_steps_per_second": 9.982,
234 "step": 368
235 },
236 {
237 "epoch": 1.01,
238 "learning_rate": 1.597826086956522e-05,
239 "loss": 0.0878,
240 "step": 370
241 },
242 {
243 "epoch": 1.03,
244 "learning_rate": 1.5869565217391306e-05,
245 "loss": 0.1651,
246 "step": 380
247 },
248 {
249 "epoch": 1.06,
250 "learning_rate": 1.576086956521739e-05,
251 "loss": 0.0645,
252 "step": 390
253 },
254 {
255 "epoch": 1.09,
256 "learning_rate": 1.565217391304348e-05,
257 "loss": 0.1085,
258 "step": 400
259 },
260 {
261 "epoch": 1.11,
262 "learning_rate": 1.5543478260869567e-05,
263 "loss": 0.0967,
264 "step": 410
265 },
266 {
267 "epoch": 1.14,
268 "learning_rate": 1.5434782608695654e-05,
269 "loss": 0.1178,
270 "step": 420
271 },
272 {
273 "epoch": 1.17,
274 "learning_rate": 1.532608695652174e-05,
275 "loss": 0.0605,
276 "step": 430
277 },
278 {
279 "epoch": 1.2,
280 "learning_rate": 1.5217391304347828e-05,
281 "loss": 0.1394,
282 "step": 440
283 },
284 {
285 "epoch": 1.22,
286 "learning_rate": 1.5108695652173915e-05,
287 "loss": 0.1113,
288 "step": 450
289 },
290 {
291 "epoch": 1.25,
292 "learning_rate": 1.5000000000000002e-05,
293 "loss": 0.0225,
294 "step": 460
295 },
296 {
297 "epoch": 1.28,
298 "learning_rate": 1.4891304347826087e-05,
299 "loss": 0.1861,
300 "step": 470
301 },
302 {
303 "epoch": 1.3,
304 "learning_rate": 1.4782608695652174e-05,
305 "loss": 0.0879,
306 "step": 480
307 },
308 {
309 "epoch": 1.33,
310 "learning_rate": 1.4673913043478263e-05,
311 "loss": 0.094,
312 "step": 490
313 },
314 {
315 "epoch": 1.36,
316 "learning_rate": 1.456521739130435e-05,
317 "loss": 0.1837,
318 "step": 500
319 },
320 {
321 "epoch": 1.39,
322 "learning_rate": 1.4456521739130435e-05,
323 "loss": 0.057,
324 "step": 510
325 },
326 {
327 "epoch": 1.41,
328 "learning_rate": 1.4347826086956522e-05,
329 "loss": 0.0504,
330 "step": 520
331 },
332 {
333 "epoch": 1.44,
334 "learning_rate": 1.423913043478261e-05,
335 "loss": 0.03,
336 "step": 530
337 },
338 {
339 "epoch": 1.47,
340 "learning_rate": 1.4130434782608698e-05,
341 "loss": 0.0637,
342 "step": 540
343 },
344 {
345 "epoch": 1.49,
346 "learning_rate": 1.4021739130434783e-05,
347 "loss": 0.1572,
348 "step": 550
349 },
350 {
351 "epoch": 1.52,
352 "learning_rate": 1.391304347826087e-05,
353 "loss": 0.2074,
354 "step": 560
355 },
356 {
357 "epoch": 1.55,
358 "learning_rate": 1.3804347826086957e-05,
359 "loss": 0.1031,
360 "step": 570
361 },
362 {
363 "epoch": 1.58,
364 "learning_rate": 1.3695652173913046e-05,
365 "loss": 0.075,
366 "step": 580
367 },
368 {
369 "epoch": 1.6,
370 "learning_rate": 1.3586956521739133e-05,
371 "loss": 0.0854,
372 "step": 590
373 },
374 {
375 "epoch": 1.63,
376 "learning_rate": 1.3478260869565218e-05,
377 "loss": 0.0897,
378 "step": 600
379 },
380 {
381 "epoch": 1.66,
382 "learning_rate": 1.3369565217391305e-05,
383 "loss": 0.1017,
384 "step": 610
385 },
386 {
387 "epoch": 1.68,
388 "learning_rate": 1.3260869565217392e-05,
389 "loss": 0.132,
390 "step": 620
391 },
392 {
393 "epoch": 1.71,
394 "learning_rate": 1.315217391304348e-05,
395 "loss": 0.0471,
396 "step": 630
397 },
398 {
399 "epoch": 1.74,
400 "learning_rate": 1.3043478260869566e-05,
401 "loss": 0.0707,
402 "step": 640
403 },
404 {
405 "epoch": 1.77,
406 "learning_rate": 1.2934782608695653e-05,
407 "loss": 0.0506,
408 "step": 650
409 },
410 {
411 "epoch": 1.79,
412 "learning_rate": 1.282608695652174e-05,
413 "loss": 0.1308,
414 "step": 660
415 },
416 {
417 "epoch": 1.82,
418 "learning_rate": 1.2717391304347828e-05,
419 "loss": 0.1188,
420 "step": 670
421 },
422 {
423 "epoch": 1.85,
424 "learning_rate": 1.2608695652173915e-05,
425 "loss": 0.1021,
426 "step": 680
427 },
428 {
429 "epoch": 1.88,
430 "learning_rate": 1.25e-05,
431 "loss": 0.1199,
432 "step": 690
433 },
434 {
435 "epoch": 1.9,
436 "learning_rate": 1.2391304347826088e-05,
437 "loss": 0.1068,
438 "step": 700
439 },
440 {
441 "epoch": 1.93,
442 "learning_rate": 1.2282608695652175e-05,
443 "loss": 0.0535,
444 "step": 710
445 },
446 {
447 "epoch": 1.96,
448 "learning_rate": 1.2173913043478263e-05,
449 "loss": 0.0723,
450 "step": 720
451 },
452 {
453 "epoch": 1.98,
454 "learning_rate": 1.2065217391304348e-05,
455 "loss": 0.2009,
456 "step": 730
457 },
458 {
459 "epoch": 2.0,
460 "eval_accuracy": 0.9865125240847784,
461 "eval_loss": 0.05307452380657196,
462 "eval_runtime": 6.4841,
463 "eval_samples_per_second": 80.043,
464 "eval_steps_per_second": 10.025,
465 "step": 736
466 },
467 {
468 "epoch": 2.01,
469 "learning_rate": 1.1956521739130435e-05,
470 "loss": 0.0156,
471 "step": 740
472 },
473 {
474 "epoch": 2.04,
475 "learning_rate": 1.1847826086956522e-05,
476 "loss": 0.169,
477 "step": 750
478 },
479 {
480 "epoch": 2.07,
481 "learning_rate": 1.1739130434782611e-05,
482 "loss": 0.0866,
483 "step": 760
484 },
485 {
486 "epoch": 2.09,
487 "learning_rate": 1.1630434782608698e-05,
488 "loss": 0.0973,
489 "step": 770
490 },
491 {
492 "epoch": 2.12,
493 "learning_rate": 1.1521739130434783e-05,
494 "loss": 0.0427,
495 "step": 780
496 },
497 {
498 "epoch": 2.15,
499 "learning_rate": 1.141304347826087e-05,
500 "loss": 0.1296,
501 "step": 790
502 },
503 {
504 "epoch": 2.17,
505 "learning_rate": 1.1304347826086957e-05,
506 "loss": 0.0265,
507 "step": 800
508 },
509 {
510 "epoch": 2.2,
511 "learning_rate": 1.1195652173913046e-05,
512 "loss": 0.1574,
513 "step": 810
514 },
515 {
516 "epoch": 2.23,
517 "learning_rate": 1.1086956521739131e-05,
518 "loss": 0.0655,
519 "step": 820
520 },
521 {
522 "epoch": 2.26,
523 "learning_rate": 1.0978260869565218e-05,
524 "loss": 0.0785,
525 "step": 830
526 },
527 {
528 "epoch": 2.28,
529 "learning_rate": 1.0869565217391305e-05,
530 "loss": 0.1273,
531 "step": 840
532 },
533 {
534 "epoch": 2.31,
535 "learning_rate": 1.076086956521739e-05,
536 "loss": 0.0374,
537 "step": 850
538 },
539 {
540 "epoch": 2.34,
541 "learning_rate": 1.0652173913043479e-05,
542 "loss": 0.2576,
543 "step": 860
544 },
545 {
546 "epoch": 2.36,
547 "learning_rate": 1.0543478260869566e-05,
548 "loss": 0.0417,
549 "step": 870
550 },
551 {
552 "epoch": 2.39,
553 "learning_rate": 1.0434782608695653e-05,
554 "loss": 0.115,
555 "step": 880
556 },
557 {
558 "epoch": 2.42,
559 "learning_rate": 1.032608695652174e-05,
560 "loss": 0.105,
561 "step": 890
562 },
563 {
564 "epoch": 2.45,
565 "learning_rate": 1.0217391304347829e-05,
566 "loss": 0.1704,
567 "step": 900
568 },
569 {
570 "epoch": 2.47,
571 "learning_rate": 1.0108695652173914e-05,
572 "loss": 0.0442,
573 "step": 910
574 },
575 {
576 "epoch": 2.5,
577 "learning_rate": 1e-05,
578 "loss": 0.079,
579 "step": 920
580 },
581 {
582 "epoch": 2.53,
583 "learning_rate": 9.891304347826088e-06,
584 "loss": 0.0214,
585 "step": 930
586 },
587 {
588 "epoch": 2.55,
589 "learning_rate": 9.782608695652175e-06,
590 "loss": 0.112,
591 "step": 940
592 },
593 {
594 "epoch": 2.58,
595 "learning_rate": 9.673913043478262e-06,
596 "loss": 0.0467,
597 "step": 950
598 },
599 {
600 "epoch": 2.61,
601 "learning_rate": 9.565217391304349e-06,
602 "loss": 0.0944,
603 "step": 960
604 },
605 {
606 "epoch": 2.64,
607 "learning_rate": 9.456521739130436e-06,
608 "loss": 0.0195,
609 "step": 970
610 },
611 {
612 "epoch": 2.66,
613 "learning_rate": 9.347826086956523e-06,
614 "loss": 0.1084,
615 "step": 980
616 },
617 {
618 "epoch": 2.69,
619 "learning_rate": 9.23913043478261e-06,
620 "loss": 0.0598,
621 "step": 990
622 },
623 {
624 "epoch": 2.72,
625 "learning_rate": 9.130434782608697e-06,
626 "loss": 0.0563,
627 "step": 1000
628 },
629 {
630 "epoch": 2.74,
631 "learning_rate": 9.021739130434784e-06,
632 "loss": 0.1212,
633 "step": 1010
634 },
635 {
636 "epoch": 2.77,
637 "learning_rate": 8.91304347826087e-06,
638 "loss": 0.103,
639 "step": 1020
640 },
641 {
642 "epoch": 2.8,
643 "learning_rate": 8.804347826086957e-06,
644 "loss": 0.0708,
645 "step": 1030
646 },
647 {
648 "epoch": 2.83,
649 "learning_rate": 8.695652173913044e-06,
650 "loss": 0.0639,
651 "step": 1040
652 },
653 {
654 "epoch": 2.85,
655 "learning_rate": 8.586956521739131e-06,
656 "loss": 0.0153,
657 "step": 1050
658 },
659 {
660 "epoch": 2.88,
661 "learning_rate": 8.478260869565218e-06,
662 "loss": 0.028,
663 "step": 1060
664 },
665 {
666 "epoch": 2.91,
667 "learning_rate": 8.369565217391305e-06,
668 "loss": 0.029,
669 "step": 1070
670 },
671 {
672 "epoch": 2.93,
673 "learning_rate": 8.260869565217392e-06,
674 "loss": 0.0915,
675 "step": 1080
676 },
677 {
678 "epoch": 2.96,
679 "learning_rate": 8.15217391304348e-06,
680 "loss": 0.0186,
681 "step": 1090
682 },
683 {
684 "epoch": 2.99,
685 "learning_rate": 8.043478260869566e-06,
686 "loss": 0.0114,
687 "step": 1100
688 },
689 {
690 "epoch": 3.0,
691 "eval_accuracy": 0.9903660886319846,
692 "eval_loss": 0.04182479530572891,
693 "eval_runtime": 6.3668,
694 "eval_samples_per_second": 81.517,
695 "eval_steps_per_second": 10.209,
696 "step": 1104
697 },
698 {
699 "epoch": 3.02,
700 "learning_rate": 7.934782608695653e-06,
701 "loss": 0.2106,
702 "step": 1110
703 },
704 {
705 "epoch": 3.04,
706 "learning_rate": 7.82608695652174e-06,
707 "loss": 0.0515,
708 "step": 1120
709 },
710 {
711 "epoch": 3.07,
712 "learning_rate": 7.717391304347827e-06,
713 "loss": 0.0406,
714 "step": 1130
715 },
716 {
717 "epoch": 3.1,
718 "learning_rate": 7.608695652173914e-06,
719 "loss": 0.0355,
720 "step": 1140
721 },
722 {
723 "epoch": 3.12,
724 "learning_rate": 7.500000000000001e-06,
725 "loss": 0.1842,
726 "step": 1150
727 },
728 {
729 "epoch": 3.15,
730 "learning_rate": 7.391304347826087e-06,
731 "loss": 0.0545,
732 "step": 1160
733 },
734 {
735 "epoch": 3.18,
736 "learning_rate": 7.282608695652175e-06,
737 "loss": 0.1349,
738 "step": 1170
739 },
740 {
741 "epoch": 3.21,
742 "learning_rate": 7.173913043478261e-06,
743 "loss": 0.0104,
744 "step": 1180
745 },
746 {
747 "epoch": 3.23,
748 "learning_rate": 7.065217391304349e-06,
749 "loss": 0.1324,
750 "step": 1190
751 },
752 {
753 "epoch": 3.26,
754 "learning_rate": 6.956521739130435e-06,
755 "loss": 0.0934,
756 "step": 1200
757 },
758 {
759 "epoch": 3.29,
760 "learning_rate": 6.847826086956523e-06,
761 "loss": 0.0966,
762 "step": 1210
763 },
764 {
765 "epoch": 3.32,
766 "learning_rate": 6.739130434782609e-06,
767 "loss": 0.0588,
768 "step": 1220
769 },
770 {
771 "epoch": 3.34,
772 "learning_rate": 6.630434782608696e-06,
773 "loss": 0.0802,
774 "step": 1230
775 },
776 {
777 "epoch": 3.37,
778 "learning_rate": 6.521739130434783e-06,
779 "loss": 0.0576,
780 "step": 1240
781 },
782 {
783 "epoch": 3.4,
784 "learning_rate": 6.41304347826087e-06,
785 "loss": 0.0419,
786 "step": 1250
787 },
788 {
789 "epoch": 3.42,
790 "learning_rate": 6.304347826086958e-06,
791 "loss": 0.0481,
792 "step": 1260
793 },
794 {
795 "epoch": 3.45,
796 "learning_rate": 6.195652173913044e-06,
797 "loss": 0.0861,
798 "step": 1270
799 },
800 {
801 "epoch": 3.48,
802 "learning_rate": 6.086956521739132e-06,
803 "loss": 0.1023,
804 "step": 1280
805 },
806 {
807 "epoch": 3.51,
808 "learning_rate": 5.978260869565218e-06,
809 "loss": 0.0584,
810 "step": 1290
811 },
812 {
813 "epoch": 3.53,
814 "learning_rate": 5.8695652173913055e-06,
815 "loss": 0.1282,
816 "step": 1300
817 },
818 {
819 "epoch": 3.56,
820 "learning_rate": 5.760869565217392e-06,
821 "loss": 0.0277,
822 "step": 1310
823 },
824 {
825 "epoch": 3.59,
826 "learning_rate": 5.652173913043479e-06,
827 "loss": 0.1837,
828 "step": 1320
829 },
830 {
831 "epoch": 3.61,
832 "learning_rate": 5.543478260869566e-06,
833 "loss": 0.0264,
834 "step": 1330
835 },
836 {
837 "epoch": 3.64,
838 "learning_rate": 5.4347826086956525e-06,
839 "loss": 0.1224,
840 "step": 1340
841 },
842 {
843 "epoch": 3.67,
844 "learning_rate": 5.3260869565217395e-06,
845 "loss": 0.0434,
846 "step": 1350
847 },
848 {
849 "epoch": 3.7,
850 "learning_rate": 5.2173913043478265e-06,
851 "loss": 0.1337,
852 "step": 1360
853 },
854 {
855 "epoch": 3.72,
856 "learning_rate": 5.108695652173914e-06,
857 "loss": 0.0071,
858 "step": 1370
859 },
860 {
861 "epoch": 3.75,
862 "learning_rate": 5e-06,
863 "loss": 0.0568,
864 "step": 1380
865 },
866 {
867 "epoch": 3.78,
868 "learning_rate": 4.891304347826087e-06,
869 "loss": 0.043,
870 "step": 1390
871 },
872 {
873 "epoch": 3.8,
874 "learning_rate": 4.782608695652174e-06,
875 "loss": 0.0719,
876 "step": 1400
877 },
878 {
879 "epoch": 3.83,
880 "learning_rate": 4.673913043478261e-06,
881 "loss": 0.1128,
882 "step": 1410
883 },
884 {
885 "epoch": 3.86,
886 "learning_rate": 4.565217391304348e-06,
887 "loss": 0.0477,
888 "step": 1420
889 },
890 {
891 "epoch": 3.89,
892 "learning_rate": 4.456521739130435e-06,
893 "loss": 0.0791,
894 "step": 1430
895 },
896 {
897 "epoch": 3.91,
898 "learning_rate": 4.347826086956522e-06,
899 "loss": 0.037,
900 "step": 1440
901 },
902 {
903 "epoch": 3.94,
904 "learning_rate": 4.239130434782609e-06,
905 "loss": 0.1466,
906 "step": 1450
907 },
908 {
909 "epoch": 3.97,
910 "learning_rate": 4.130434782608696e-06,
911 "loss": 0.0467,
912 "step": 1460
913 },
914 {
915 "epoch": 3.99,
916 "learning_rate": 4.021739130434783e-06,
917 "loss": 0.0998,
918 "step": 1470
919 },
920 {
921 "epoch": 4.0,
922 "eval_accuracy": 0.9903660886319846,
923 "eval_loss": 0.04251210391521454,
924 "eval_runtime": 6.5268,
925 "eval_samples_per_second": 79.518,
926 "eval_steps_per_second": 9.959,
927 "step": 1472
928 },
929 {
930 "epoch": 4.02,
931 "learning_rate": 3.91304347826087e-06,
932 "loss": 0.1285,
933 "step": 1480
934 },
935 {
936 "epoch": 4.05,
937 "learning_rate": 3.804347826086957e-06,
938 "loss": 0.1634,
939 "step": 1490
940 },
941 {
942 "epoch": 4.08,
943 "learning_rate": 3.6956521739130436e-06,
944 "loss": 0.0462,
945 "step": 1500
946 },
947 {
948 "epoch": 4.1,
949 "learning_rate": 3.5869565217391305e-06,
950 "loss": 0.0846,
951 "step": 1510
952 },
953 {
954 "epoch": 4.13,
955 "learning_rate": 3.4782608695652175e-06,
956 "loss": 0.1239,
957 "step": 1520
958 },
959 {
960 "epoch": 4.16,
961 "learning_rate": 3.3695652173913045e-06,
962 "loss": 0.1818,
963 "step": 1530
964 },
965 {
966 "epoch": 4.18,
967 "learning_rate": 3.2608695652173914e-06,
968 "loss": 0.021,
969 "step": 1540
970 },
971 {
972 "epoch": 4.21,
973 "learning_rate": 3.152173913043479e-06,
974 "loss": 0.0741,
975 "step": 1550
976 },
977 {
978 "epoch": 4.24,
979 "learning_rate": 3.043478260869566e-06,
980 "loss": 0.182,
981 "step": 1560
982 },
983 {
984 "epoch": 4.27,
985 "learning_rate": 2.9347826086956528e-06,
986 "loss": 0.0433,
987 "step": 1570
988 },
989 {
990 "epoch": 4.29,
991 "learning_rate": 2.8260869565217393e-06,
992 "loss": 0.0437,
993 "step": 1580
994 },
995 {
996 "epoch": 4.32,
997 "learning_rate": 2.7173913043478263e-06,
998 "loss": 0.0382,
999 "step": 1590
1000 },
1001 {
1002 "epoch": 4.35,
1003 "learning_rate": 2.6086956521739132e-06,
1004 "loss": 0.046,
1005 "step": 1600
1006 },
1007 {
1008 "epoch": 4.38,
1009 "learning_rate": 2.5e-06,
1010 "loss": 0.0213,
1011 "step": 1610
1012 },
1013 {
1014 "epoch": 4.4,
1015 "learning_rate": 2.391304347826087e-06,
1016 "loss": 0.0186,
1017 "step": 1620
1018 },
1019 {
1020 "epoch": 4.43,
1021 "learning_rate": 2.282608695652174e-06,
1022 "loss": 0.0671,
1023 "step": 1630
1024 },
1025 {
1026 "epoch": 4.46,
1027 "learning_rate": 2.173913043478261e-06,
1028 "loss": 0.0908,
1029 "step": 1640
1030 },
1031 {
1032 "epoch": 4.48,
1033 "learning_rate": 2.065217391304348e-06,
1034 "loss": 0.0697,
1035 "step": 1650
1036 },
1037 {
1038 "epoch": 4.51,
1039 "learning_rate": 1.956521739130435e-06,
1040 "loss": 0.0637,
1041 "step": 1660
1042 },
1043 {
1044 "epoch": 4.54,
1045 "learning_rate": 1.8478260869565218e-06,
1046 "loss": 0.0819,
1047 "step": 1670
1048 },
1049 {
1050 "epoch": 4.57,
1051 "learning_rate": 1.7391304347826088e-06,
1052 "loss": 0.0623,
1053 "step": 1680
1054 },
1055 {
1056 "epoch": 4.59,
1057 "learning_rate": 1.6304347826086957e-06,
1058 "loss": 0.0114,
1059 "step": 1690
1060 },
1061 {
1062 "epoch": 4.62,
1063 "learning_rate": 1.521739130434783e-06,
1064 "loss": 0.0342,
1065 "step": 1700
1066 },
1067 {
1068 "epoch": 4.65,
1069 "learning_rate": 1.4130434782608697e-06,
1070 "loss": 0.0859,
1071 "step": 1710
1072 },
1073 {
1074 "epoch": 4.67,
1075 "learning_rate": 1.3043478260869566e-06,
1076 "loss": 0.0462,
1077 "step": 1720
1078 },
1079 {
1080 "epoch": 4.7,
1081 "learning_rate": 1.1956521739130436e-06,
1082 "loss": 0.1022,
1083 "step": 1730
1084 },
1085 {
1086 "epoch": 4.73,
1087 "learning_rate": 1.0869565217391306e-06,
1088 "loss": 0.0571,
1089 "step": 1740
1090 },
1091 {
1092 "epoch": 4.76,
1093 "learning_rate": 9.782608695652175e-07,
1094 "loss": 0.0108,
1095 "step": 1750
1096 },
1097 {
1098 "epoch": 4.78,
1099 "learning_rate": 8.695652173913044e-07,
1100 "loss": 0.0893,
1101 "step": 1760
1102 },
1103 {
1104 "epoch": 4.81,
1105 "learning_rate": 7.608695652173914e-07,
1106 "loss": 0.0214,
1107 "step": 1770
1108 },
1109 {
1110 "epoch": 4.84,
1111 "learning_rate": 6.521739130434783e-07,
1112 "loss": 0.0416,
1113 "step": 1780
1114 },
1115 {
1116 "epoch": 4.86,
1117 "learning_rate": 5.434782608695653e-07,
1118 "loss": 0.1022,
1119 "step": 1790
1120 },
1121 {
1122 "epoch": 4.89,
1123 "learning_rate": 4.347826086956522e-07,
1124 "loss": 0.0628,
1125 "step": 1800
1126 },
1127 {
1128 "epoch": 4.92,
1129 "learning_rate": 3.2608695652173915e-07,
1130 "loss": 0.0691,
1131 "step": 1810
1132 },
1133 {
1134 "epoch": 4.95,
1135 "learning_rate": 2.173913043478261e-07,
1136 "loss": 0.0371,
1137 "step": 1820
1138 },
1139 {
1140 "epoch": 4.97,
1141 "learning_rate": 1.0869565217391305e-07,
1142 "loss": 0.0714,
1143 "step": 1830
1144 },
1145 {
1146 "epoch": 5.0,
1147 "learning_rate": 0.0,
1148 "loss": 0.1244,
1149 "step": 1840
1150 },
1151 {
1152 "epoch": 5.0,
1153 "eval_accuracy": 0.9922928709055877,
1154 "eval_loss": 0.03933868557214737,
1155 "eval_runtime": 6.3674,
1156 "eval_samples_per_second": 81.509,
1157 "eval_steps_per_second": 10.208,
1158 "step": 1840
1159 },
1160 {
1161 "epoch": 5.0,
1162 "step": 1840,
1163 "total_flos": 1.1387447873864294e+18,
1164 "train_loss": 0.10440107471431079,
1165 "train_runtime": 430.0921,
1166 "train_samples_per_second": 34.167,
1167 "train_steps_per_second": 4.278
1168 }
1169 ],
1170 "logging_steps": 10,
1171 "max_steps": 1840,
1172 "num_input_tokens_seen": 0,
1173 "num_train_epochs": 5,
1174 "save_steps": 500,
1175 "total_flos": 1.1387447873864294e+18,
1176 "trial_name": null,
1177 "trial_params": null
1178 }
1179