trainer_state.json
29.8 KB · 1494 lines · json Raw
1 {
2 "best_metric": null,
3 "best_model_checkpoint": null,
4 "epoch": 5.0,
5 "global_step": 122720,
6 "is_hyper_param_search": false,
7 "is_local_process_zero": true,
8 "is_world_process_zero": true,
9 "log_history": [
10 {
11 "epoch": 0.08,
12 "learning_rate": 1.9456757931334205e-05,
13 "loss": 0.8613,
14 "step": 500
15 },
16 {
17 "epoch": 0.16,
18 "learning_rate": 1.8913515862668405e-05,
19 "loss": 0.7137,
20 "step": 1000
21 },
22 {
23 "epoch": 0.24,
24 "learning_rate": 1.837027379400261e-05,
25 "loss": 0.664,
26 "step": 1500
27 },
28 {
29 "epoch": 0.33,
30 "learning_rate": 1.7827031725336812e-05,
31 "loss": 0.6249,
32 "step": 2000
33 },
34 {
35 "epoch": 0.41,
36 "learning_rate": 1.7283789656671015e-05,
37 "loss": 0.6086,
38 "step": 2500
39 },
40 {
41 "epoch": 0.49,
42 "learning_rate": 1.6740547588005215e-05,
43 "loss": 0.5861,
44 "step": 3000
45 },
46 {
47 "epoch": 0.57,
48 "learning_rate": 1.619730551933942e-05,
49 "loss": 0.5828,
50 "step": 3500
51 },
52 {
53 "epoch": 0.65,
54 "learning_rate": 1.5654063450673622e-05,
55 "loss": 0.5786,
56 "step": 4000
57 },
58 {
59 "epoch": 0.73,
60 "learning_rate": 1.5110821382007822e-05,
61 "loss": 0.5533,
62 "step": 4500
63 },
64 {
65 "epoch": 0.81,
66 "learning_rate": 1.4567579313342026e-05,
67 "loss": 0.582,
68 "step": 5000
69 },
70 {
71 "epoch": 0.9,
72 "learning_rate": 1.402433724467623e-05,
73 "loss": 0.5415,
74 "step": 5500
75 },
76 {
77 "epoch": 0.98,
78 "learning_rate": 1.3481095176010431e-05,
79 "loss": 0.5371,
80 "step": 6000
81 },
82 {
83 "epoch": 0.26,
84 "learning_rate": 1.8940677966101697e-05,
85 "loss": 0.5397,
86 "step": 6500
87 },
88 {
89 "epoch": 0.29,
90 "learning_rate": 1.8859191655801828e-05,
91 "loss": 0.5201,
92 "step": 7000
93 },
94 {
95 "epoch": 0.31,
96 "learning_rate": 1.8777705345501956e-05,
97 "loss": 0.5129,
98 "step": 7500
99 },
100 {
101 "epoch": 0.33,
102 "learning_rate": 1.8696219035202087e-05,
103 "loss": 0.5152,
104 "step": 8000
105 },
106 {
107 "epoch": 0.35,
108 "learning_rate": 1.861473272490222e-05,
109 "loss": 0.5041,
110 "step": 8500
111 },
112 {
113 "epoch": 0.37,
114 "learning_rate": 1.853324641460235e-05,
115 "loss": 0.5183,
116 "step": 9000
117 },
118 {
119 "epoch": 0.39,
120 "learning_rate": 1.8451760104302477e-05,
121 "loss": 0.5183,
122 "step": 9500
123 },
124 {
125 "epoch": 0.41,
126 "learning_rate": 1.837027379400261e-05,
127 "loss": 0.5126,
128 "step": 10000
129 },
130 {
131 "epoch": 0.43,
132 "learning_rate": 1.828878748370274e-05,
133 "loss": 0.5179,
134 "step": 10500
135 },
136 {
137 "epoch": 0.45,
138 "learning_rate": 1.820730117340287e-05,
139 "loss": 0.4975,
140 "step": 11000
141 },
142 {
143 "epoch": 0.47,
144 "learning_rate": 1.8125814863103e-05,
145 "loss": 0.5099,
146 "step": 11500
147 },
148 {
149 "epoch": 0.49,
150 "learning_rate": 1.804432855280313e-05,
151 "loss": 0.4978,
152 "step": 12000
153 },
154 {
155 "epoch": 0.51,
156 "learning_rate": 1.796284224250326e-05,
157 "loss": 0.4882,
158 "step": 12500
159 },
160 {
161 "epoch": 0.53,
162 "learning_rate": 1.788135593220339e-05,
163 "loss": 0.4891,
164 "step": 13000
165 },
166 {
167 "epoch": 0.55,
168 "learning_rate": 1.7799869621903524e-05,
169 "loss": 0.4992,
170 "step": 13500
171 },
172 {
173 "epoch": 0.57,
174 "learning_rate": 1.771838331160365e-05,
175 "loss": 0.4922,
176 "step": 14000
177 },
178 {
179 "epoch": 0.59,
180 "learning_rate": 1.7636897001303783e-05,
181 "loss": 0.4901,
182 "step": 14500
183 },
184 {
185 "epoch": 0.61,
186 "learning_rate": 1.7555410691003914e-05,
187 "loss": 0.4972,
188 "step": 15000
189 },
190 {
191 "epoch": 0.63,
192 "learning_rate": 1.7473924380704045e-05,
193 "loss": 0.476,
194 "step": 15500
195 },
196 {
197 "epoch": 0.65,
198 "learning_rate": 1.7392438070404173e-05,
199 "loss": 0.4918,
200 "step": 16000
201 },
202 {
203 "epoch": 0.67,
204 "learning_rate": 1.7310951760104304e-05,
205 "loss": 0.4856,
206 "step": 16500
207 },
208 {
209 "epoch": 0.69,
210 "learning_rate": 1.7229465449804435e-05,
211 "loss": 0.4855,
212 "step": 17000
213 },
214 {
215 "epoch": 0.71,
216 "learning_rate": 1.7147979139504566e-05,
217 "loss": 0.485,
218 "step": 17500
219 },
220 {
221 "epoch": 0.73,
222 "learning_rate": 1.7066492829204694e-05,
223 "loss": 0.4855,
224 "step": 18000
225 },
226 {
227 "epoch": 0.75,
228 "learning_rate": 1.6985006518904825e-05,
229 "loss": 0.4817,
230 "step": 18500
231 },
232 {
233 "epoch": 0.77,
234 "learning_rate": 1.6903520208604957e-05,
235 "loss": 0.5006,
236 "step": 19000
237 },
238 {
239 "epoch": 0.79,
240 "learning_rate": 1.6822033898305084e-05,
241 "loss": 0.4814,
242 "step": 19500
243 },
244 {
245 "epoch": 0.81,
246 "learning_rate": 1.6740547588005215e-05,
247 "loss": 0.4944,
248 "step": 20000
249 },
250 {
251 "epoch": 0.84,
252 "learning_rate": 1.6659061277705347e-05,
253 "loss": 0.4805,
254 "step": 20500
255 },
256 {
257 "epoch": 0.86,
258 "learning_rate": 1.6577574967405478e-05,
259 "loss": 0.4927,
260 "step": 21000
261 },
262 {
263 "epoch": 0.88,
264 "learning_rate": 1.6496088657105606e-05,
265 "loss": 0.4699,
266 "step": 21500
267 },
268 {
269 "epoch": 0.9,
270 "learning_rate": 1.6414602346805737e-05,
271 "loss": 0.4839,
272 "step": 22000
273 },
274 {
275 "epoch": 0.92,
276 "learning_rate": 1.6333116036505868e-05,
277 "loss": 0.4795,
278 "step": 22500
279 },
280 {
281 "epoch": 0.94,
282 "learning_rate": 1.6251629726206e-05,
283 "loss": 0.4888,
284 "step": 23000
285 },
286 {
287 "epoch": 0.96,
288 "learning_rate": 1.617014341590613e-05,
289 "loss": 0.4744,
290 "step": 23500
291 },
292 {
293 "epoch": 0.98,
294 "learning_rate": 1.608865710560626e-05,
295 "loss": 0.473,
296 "step": 24000
297 },
298 {
299 "epoch": 1.0,
300 "learning_rate": 1.600717079530639e-05,
301 "loss": 0.4739,
302 "step": 24500
303 },
304 {
305 "epoch": 1.02,
306 "learning_rate": 1.592568448500652e-05,
307 "loss": 0.3848,
308 "step": 25000
309 },
310 {
311 "epoch": 1.04,
312 "learning_rate": 1.5844198174706652e-05,
313 "loss": 0.3872,
314 "step": 25500
315 },
316 {
317 "epoch": 1.06,
318 "learning_rate": 1.576271186440678e-05,
319 "loss": 0.3878,
320 "step": 26000
321 },
322 {
323 "epoch": 1.08,
324 "learning_rate": 1.568122555410691e-05,
325 "loss": 0.3813,
326 "step": 26500
327 },
328 {
329 "epoch": 1.1,
330 "learning_rate": 1.5599739243807042e-05,
331 "loss": 0.3873,
332 "step": 27000
333 },
334 {
335 "epoch": 1.12,
336 "learning_rate": 1.5518252933507173e-05,
337 "loss": 0.3923,
338 "step": 27500
339 },
340 {
341 "epoch": 1.14,
342 "learning_rate": 1.54367666232073e-05,
343 "loss": 0.3901,
344 "step": 28000
345 },
346 {
347 "epoch": 1.16,
348 "learning_rate": 1.5355280312907432e-05,
349 "loss": 0.3889,
350 "step": 28500
351 },
352 {
353 "epoch": 1.18,
354 "learning_rate": 1.5273794002607563e-05,
355 "loss": 0.38,
356 "step": 29000
357 },
358 {
359 "epoch": 1.2,
360 "learning_rate": 1.5192307692307693e-05,
361 "loss": 0.388,
362 "step": 29500
363 },
364 {
365 "epoch": 1.22,
366 "learning_rate": 1.5110821382007822e-05,
367 "loss": 0.3993,
368 "step": 30000
369 },
370 {
371 "epoch": 1.24,
372 "learning_rate": 1.5029335071707954e-05,
373 "loss": 0.3911,
374 "step": 30500
375 },
376 {
377 "epoch": 1.26,
378 "learning_rate": 1.4947848761408083e-05,
379 "loss": 0.393,
380 "step": 31000
381 },
382 {
383 "epoch": 1.28,
384 "learning_rate": 1.4866362451108216e-05,
385 "loss": 0.3804,
386 "step": 31500
387 },
388 {
389 "epoch": 1.3,
390 "learning_rate": 1.4784876140808346e-05,
391 "loss": 0.3983,
392 "step": 32000
393 },
394 {
395 "epoch": 1.32,
396 "learning_rate": 1.4703389830508477e-05,
397 "loss": 0.3978,
398 "step": 32500
399 },
400 {
401 "epoch": 1.34,
402 "learning_rate": 1.4621903520208606e-05,
403 "loss": 0.4031,
404 "step": 33000
405 },
406 {
407 "epoch": 1.36,
408 "learning_rate": 1.4540417209908737e-05,
409 "loss": 0.3871,
410 "step": 33500
411 },
412 {
413 "epoch": 1.39,
414 "learning_rate": 1.4458930899608867e-05,
415 "loss": 0.4003,
416 "step": 34000
417 },
418 {
419 "epoch": 1.41,
420 "learning_rate": 1.4377444589308998e-05,
421 "loss": 0.3789,
422 "step": 34500
423 },
424 {
425 "epoch": 1.43,
426 "learning_rate": 1.4295958279009128e-05,
427 "loss": 0.3932,
428 "step": 35000
429 },
430 {
431 "epoch": 1.45,
432 "learning_rate": 1.4214471968709259e-05,
433 "loss": 0.3867,
434 "step": 35500
435 },
436 {
437 "epoch": 1.47,
438 "learning_rate": 1.4132985658409388e-05,
439 "loss": 0.391,
440 "step": 36000
441 },
442 {
443 "epoch": 1.49,
444 "learning_rate": 1.4051499348109518e-05,
445 "loss": 0.3852,
446 "step": 36500
447 },
448 {
449 "epoch": 1.51,
450 "learning_rate": 1.3970013037809649e-05,
451 "loss": 0.3865,
452 "step": 37000
453 },
454 {
455 "epoch": 1.53,
456 "learning_rate": 1.3888526727509778e-05,
457 "loss": 0.3932,
458 "step": 37500
459 },
460 {
461 "epoch": 1.55,
462 "learning_rate": 1.380704041720991e-05,
463 "loss": 0.3957,
464 "step": 38000
465 },
466 {
467 "epoch": 1.57,
468 "learning_rate": 1.372555410691004e-05,
469 "loss": 0.3908,
470 "step": 38500
471 },
472 {
473 "epoch": 1.59,
474 "learning_rate": 1.364406779661017e-05,
475 "loss": 0.3984,
476 "step": 39000
477 },
478 {
479 "epoch": 1.61,
480 "learning_rate": 1.35625814863103e-05,
481 "loss": 0.394,
482 "step": 39500
483 },
484 {
485 "epoch": 1.63,
486 "learning_rate": 1.3481095176010431e-05,
487 "loss": 0.3937,
488 "step": 40000
489 },
490 {
491 "epoch": 1.65,
492 "learning_rate": 1.3399608865710562e-05,
493 "loss": 0.4022,
494 "step": 40500
495 },
496 {
497 "epoch": 1.67,
498 "learning_rate": 1.3318122555410693e-05,
499 "loss": 0.3872,
500 "step": 41000
501 },
502 {
503 "epoch": 1.69,
504 "learning_rate": 1.3236636245110823e-05,
505 "loss": 0.3815,
506 "step": 41500
507 },
508 {
509 "epoch": 1.71,
510 "learning_rate": 1.3155149934810954e-05,
511 "loss": 0.3853,
512 "step": 42000
513 },
514 {
515 "epoch": 1.73,
516 "learning_rate": 1.3073663624511084e-05,
517 "loss": 0.3913,
518 "step": 42500
519 },
520 {
521 "epoch": 1.75,
522 "learning_rate": 1.2992177314211213e-05,
523 "loss": 0.3913,
524 "step": 43000
525 },
526 {
527 "epoch": 1.77,
528 "learning_rate": 1.2910691003911344e-05,
529 "loss": 0.3931,
530 "step": 43500
531 },
532 {
533 "epoch": 1.79,
534 "learning_rate": 1.2829204693611474e-05,
535 "loss": 0.3929,
536 "step": 44000
537 },
538 {
539 "epoch": 1.81,
540 "learning_rate": 1.2747718383311605e-05,
541 "loss": 0.3772,
542 "step": 44500
543 },
544 {
545 "epoch": 1.83,
546 "learning_rate": 1.2666232073011735e-05,
547 "loss": 0.3892,
548 "step": 45000
549 },
550 {
551 "epoch": 1.85,
552 "learning_rate": 1.2584745762711866e-05,
553 "loss": 0.3939,
554 "step": 45500
555 },
556 {
557 "epoch": 1.87,
558 "learning_rate": 1.2503259452411995e-05,
559 "loss": 0.3979,
560 "step": 46000
561 },
562 {
563 "epoch": 1.89,
564 "learning_rate": 1.2421773142112126e-05,
565 "loss": 0.3779,
566 "step": 46500
567 },
568 {
569 "epoch": 1.91,
570 "learning_rate": 1.2340286831812256e-05,
571 "loss": 0.3921,
572 "step": 47000
573 },
574 {
575 "epoch": 1.94,
576 "learning_rate": 1.2258800521512385e-05,
577 "loss": 0.3848,
578 "step": 47500
579 },
580 {
581 "epoch": 1.96,
582 "learning_rate": 1.2177314211212517e-05,
583 "loss": 0.3868,
584 "step": 48000
585 },
586 {
587 "epoch": 1.98,
588 "learning_rate": 1.2095827900912646e-05,
589 "loss": 0.3853,
590 "step": 48500
591 },
592 {
593 "epoch": 2.0,
594 "learning_rate": 1.2014341590612777e-05,
595 "loss": 0.3865,
596 "step": 49000
597 },
598 {
599 "epoch": 2.02,
600 "learning_rate": 1.193285528031291e-05,
601 "loss": 0.2994,
602 "step": 49500
603 },
604 {
605 "epoch": 2.04,
606 "learning_rate": 1.185136897001304e-05,
607 "loss": 0.2864,
608 "step": 50000
609 },
610 {
611 "epoch": 2.06,
612 "learning_rate": 1.176988265971317e-05,
613 "loss": 0.2874,
614 "step": 50500
615 },
616 {
617 "epoch": 2.08,
618 "learning_rate": 1.16883963494133e-05,
619 "loss": 0.2832,
620 "step": 51000
621 },
622 {
623 "epoch": 2.1,
624 "learning_rate": 1.160691003911343e-05,
625 "loss": 0.27,
626 "step": 51500
627 },
628 {
629 "epoch": 2.12,
630 "learning_rate": 1.1525423728813561e-05,
631 "loss": 0.2907,
632 "step": 52000
633 },
634 {
635 "epoch": 2.14,
636 "learning_rate": 1.144393741851369e-05,
637 "loss": 0.2772,
638 "step": 52500
639 },
640 {
641 "epoch": 2.16,
642 "learning_rate": 1.1362451108213822e-05,
643 "loss": 0.2823,
644 "step": 53000
645 },
646 {
647 "epoch": 2.18,
648 "learning_rate": 1.1280964797913951e-05,
649 "loss": 0.2755,
650 "step": 53500
651 },
652 {
653 "epoch": 2.2,
654 "learning_rate": 1.1199478487614082e-05,
655 "loss": 0.2722,
656 "step": 54000
657 },
658 {
659 "epoch": 2.22,
660 "learning_rate": 1.1117992177314212e-05,
661 "loss": 0.2835,
662 "step": 54500
663 },
664 {
665 "epoch": 2.24,
666 "learning_rate": 1.1036505867014341e-05,
667 "loss": 0.2825,
668 "step": 55000
669 },
670 {
671 "epoch": 2.26,
672 "learning_rate": 1.0955019556714473e-05,
673 "loss": 0.2781,
674 "step": 55500
675 },
676 {
677 "epoch": 2.28,
678 "learning_rate": 1.0873533246414602e-05,
679 "loss": 0.2866,
680 "step": 56000
681 },
682 {
683 "epoch": 2.3,
684 "learning_rate": 1.0792046936114733e-05,
685 "loss": 0.2839,
686 "step": 56500
687 },
688 {
689 "epoch": 2.32,
690 "learning_rate": 1.0710560625814863e-05,
691 "loss": 0.2915,
692 "step": 57000
693 },
694 {
695 "epoch": 2.34,
696 "learning_rate": 1.0629074315514994e-05,
697 "loss": 0.2876,
698 "step": 57500
699 },
700 {
701 "epoch": 2.36,
702 "learning_rate": 1.0547588005215125e-05,
703 "loss": 0.2768,
704 "step": 58000
705 },
706 {
707 "epoch": 2.38,
708 "learning_rate": 1.0466101694915256e-05,
709 "loss": 0.2825,
710 "step": 58500
711 },
712 {
713 "epoch": 2.4,
714 "learning_rate": 1.0384615384615386e-05,
715 "loss": 0.264,
716 "step": 59000
717 },
718 {
719 "epoch": 2.42,
720 "learning_rate": 1.0303129074315517e-05,
721 "loss": 0.2975,
722 "step": 59500
723 },
724 {
725 "epoch": 2.44,
726 "learning_rate": 1.0221642764015647e-05,
727 "loss": 0.2829,
728 "step": 60000
729 },
730 {
731 "epoch": 2.46,
732 "learning_rate": 1.0140156453715778e-05,
733 "loss": 0.2979,
734 "step": 60500
735 },
736 {
737 "epoch": 2.49,
738 "learning_rate": 1.0058670143415907e-05,
739 "loss": 0.2833,
740 "step": 61000
741 },
742 {
743 "epoch": 2.51,
744 "learning_rate": 9.977183833116037e-06,
745 "loss": 0.2856,
746 "step": 61500
747 },
748 {
749 "epoch": 2.53,
750 "learning_rate": 9.895697522816168e-06,
751 "loss": 0.291,
752 "step": 62000
753 },
754 {
755 "epoch": 2.55,
756 "learning_rate": 9.814211212516298e-06,
757 "loss": 0.2841,
758 "step": 62500
759 },
760 {
761 "epoch": 2.57,
762 "learning_rate": 9.732724902216429e-06,
763 "loss": 0.2806,
764 "step": 63000
765 },
766 {
767 "epoch": 2.59,
768 "learning_rate": 9.651238591916558e-06,
769 "loss": 0.2952,
770 "step": 63500
771 },
772 {
773 "epoch": 2.61,
774 "learning_rate": 9.56975228161669e-06,
775 "loss": 0.2876,
776 "step": 64000
777 },
778 {
779 "epoch": 2.63,
780 "learning_rate": 9.488265971316819e-06,
781 "loss": 0.285,
782 "step": 64500
783 },
784 {
785 "epoch": 2.65,
786 "learning_rate": 9.40677966101695e-06,
787 "loss": 0.28,
788 "step": 65000
789 },
790 {
791 "epoch": 2.67,
792 "learning_rate": 9.325293350717081e-06,
793 "loss": 0.2823,
794 "step": 65500
795 },
796 {
797 "epoch": 2.69,
798 "learning_rate": 9.24380704041721e-06,
799 "loss": 0.2918,
800 "step": 66000
801 },
802 {
803 "epoch": 2.71,
804 "learning_rate": 9.162320730117342e-06,
805 "loss": 0.2919,
806 "step": 66500
807 },
808 {
809 "epoch": 2.73,
810 "learning_rate": 9.080834419817471e-06,
811 "loss": 0.2888,
812 "step": 67000
813 },
814 {
815 "epoch": 2.75,
816 "learning_rate": 8.999348109517601e-06,
817 "loss": 0.2677,
818 "step": 67500
819 },
820 {
821 "epoch": 2.77,
822 "learning_rate": 8.917861799217732e-06,
823 "loss": 0.2879,
824 "step": 68000
825 },
826 {
827 "epoch": 2.79,
828 "learning_rate": 8.836375488917862e-06,
829 "loss": 0.2856,
830 "step": 68500
831 },
832 {
833 "epoch": 2.81,
834 "learning_rate": 8.754889178617993e-06,
835 "loss": 0.2923,
836 "step": 69000
837 },
838 {
839 "epoch": 2.83,
840 "learning_rate": 8.673402868318124e-06,
841 "loss": 0.2949,
842 "step": 69500
843 },
844 {
845 "epoch": 2.85,
846 "learning_rate": 8.591916558018254e-06,
847 "loss": 0.2826,
848 "step": 70000
849 },
850 {
851 "epoch": 2.87,
852 "learning_rate": 8.510430247718385e-06,
853 "loss": 0.2716,
854 "step": 70500
855 },
856 {
857 "epoch": 2.89,
858 "learning_rate": 8.428943937418514e-06,
859 "loss": 0.2858,
860 "step": 71000
861 },
862 {
863 "epoch": 2.91,
864 "learning_rate": 8.347457627118645e-06,
865 "loss": 0.284,
866 "step": 71500
867 },
868 {
869 "epoch": 2.93,
870 "learning_rate": 8.265971316818775e-06,
871 "loss": 0.2869,
872 "step": 72000
873 },
874 {
875 "epoch": 2.95,
876 "learning_rate": 8.184485006518904e-06,
877 "loss": 0.2937,
878 "step": 72500
879 },
880 {
881 "epoch": 2.97,
882 "learning_rate": 8.102998696219036e-06,
883 "loss": 0.2837,
884 "step": 73000
885 },
886 {
887 "epoch": 2.99,
888 "learning_rate": 8.021512385919165e-06,
889 "loss": 0.2783,
890 "step": 73500
891 },
892 {
893 "epoch": 3.01,
894 "learning_rate": 7.940026075619296e-06,
895 "loss": 0.2272,
896 "step": 74000
897 },
898 {
899 "epoch": 3.04,
900 "learning_rate": 7.858539765319428e-06,
901 "loss": 0.1973,
902 "step": 74500
903 },
904 {
905 "epoch": 3.06,
906 "learning_rate": 7.777053455019557e-06,
907 "loss": 0.2154,
908 "step": 75000
909 },
910 {
911 "epoch": 3.08,
912 "learning_rate": 7.695567144719688e-06,
913 "loss": 0.1972,
914 "step": 75500
915 },
916 {
917 "epoch": 3.1,
918 "learning_rate": 7.614080834419818e-06,
919 "loss": 0.213,
920 "step": 76000
921 },
922 {
923 "epoch": 3.12,
924 "learning_rate": 7.532594524119948e-06,
925 "loss": 0.1933,
926 "step": 76500
927 },
928 {
929 "epoch": 3.14,
930 "learning_rate": 7.451108213820078e-06,
931 "loss": 0.2167,
932 "step": 77000
933 },
934 {
935 "epoch": 3.16,
936 "learning_rate": 7.369621903520209e-06,
937 "loss": 0.1957,
938 "step": 77500
939 },
940 {
941 "epoch": 3.18,
942 "learning_rate": 7.288135593220339e-06,
943 "loss": 0.2187,
944 "step": 78000
945 },
946 {
947 "epoch": 3.2,
948 "learning_rate": 7.20664928292047e-06,
949 "loss": 0.2148,
950 "step": 78500
951 },
952 {
953 "epoch": 3.22,
954 "learning_rate": 7.125162972620601e-06,
955 "loss": 0.2187,
956 "step": 79000
957 },
958 {
959 "epoch": 3.24,
960 "learning_rate": 7.043676662320731e-06,
961 "loss": 0.2128,
962 "step": 79500
963 },
964 {
965 "epoch": 3.26,
966 "learning_rate": 6.962190352020861e-06,
967 "loss": 0.207,
968 "step": 80000
969 },
970 {
971 "epoch": 3.28,
972 "learning_rate": 6.880704041720992e-06,
973 "loss": 0.2116,
974 "step": 80500
975 },
976 {
977 "epoch": 3.3,
978 "learning_rate": 6.799217731421122e-06,
979 "loss": 0.2049,
980 "step": 81000
981 },
982 {
983 "epoch": 3.32,
984 "learning_rate": 6.7177314211212515e-06,
985 "loss": 0.2031,
986 "step": 81500
987 },
988 {
989 "epoch": 3.34,
990 "learning_rate": 6.636245110821382e-06,
991 "loss": 0.216,
992 "step": 82000
993 },
994 {
995 "epoch": 3.36,
996 "learning_rate": 6.554758800521513e-06,
997 "loss": 0.2082,
998 "step": 82500
999 },
1000 {
1001 "epoch": 3.38,
1002 "learning_rate": 6.473272490221643e-06,
1003 "loss": 0.2197,
1004 "step": 83000
1005 },
1006 {
1007 "epoch": 3.4,
1008 "learning_rate": 6.391786179921774e-06,
1009 "loss": 0.2109,
1010 "step": 83500
1011 },
1012 {
1013 "epoch": 3.42,
1014 "learning_rate": 6.310299869621904e-06,
1015 "loss": 0.2187,
1016 "step": 84000
1017 },
1018 {
1019 "epoch": 3.44,
1020 "learning_rate": 6.2288135593220344e-06,
1021 "loss": 0.2206,
1022 "step": 84500
1023 },
1024 {
1025 "epoch": 3.46,
1026 "learning_rate": 6.147327249022165e-06,
1027 "loss": 0.2038,
1028 "step": 85000
1029 },
1030 {
1031 "epoch": 3.48,
1032 "learning_rate": 6.065840938722295e-06,
1033 "loss": 0.2197,
1034 "step": 85500
1035 },
1036 {
1037 "epoch": 3.5,
1038 "learning_rate": 5.9843546284224255e-06,
1039 "loss": 0.2222,
1040 "step": 86000
1041 },
1042 {
1043 "epoch": 3.52,
1044 "learning_rate": 5.902868318122556e-06,
1045 "loss": 0.2109,
1046 "step": 86500
1047 },
1048 {
1049 "epoch": 3.54,
1050 "learning_rate": 5.821382007822687e-06,
1051 "loss": 0.2128,
1052 "step": 87000
1053 },
1054 {
1055 "epoch": 3.57,
1056 "learning_rate": 5.739895697522817e-06,
1057 "loss": 0.236,
1058 "step": 87500
1059 },
1060 {
1061 "epoch": 3.59,
1062 "learning_rate": 5.658409387222948e-06,
1063 "loss": 0.1958,
1064 "step": 88000
1065 },
1066 {
1067 "epoch": 3.61,
1068 "learning_rate": 5.576923076923077e-06,
1069 "loss": 0.2258,
1070 "step": 88500
1071 },
1072 {
1073 "epoch": 3.63,
1074 "learning_rate": 5.4954367666232076e-06,
1075 "loss": 0.2214,
1076 "step": 89000
1077 },
1078 {
1079 "epoch": 3.65,
1080 "learning_rate": 5.413950456323338e-06,
1081 "loss": 0.2241,
1082 "step": 89500
1083 },
1084 {
1085 "epoch": 3.67,
1086 "learning_rate": 5.332464146023468e-06,
1087 "loss": 0.2093,
1088 "step": 90000
1089 },
1090 {
1091 "epoch": 3.69,
1092 "learning_rate": 5.250977835723599e-06,
1093 "loss": 0.2215,
1094 "step": 90500
1095 },
1096 {
1097 "epoch": 3.71,
1098 "learning_rate": 5.169491525423729e-06,
1099 "loss": 0.2118,
1100 "step": 91000
1101 },
1102 {
1103 "epoch": 3.73,
1104 "learning_rate": 5.08800521512386e-06,
1105 "loss": 0.2216,
1106 "step": 91500
1107 },
1108 {
1109 "epoch": 3.75,
1110 "learning_rate": 5.0065189048239905e-06,
1111 "loss": 0.2036,
1112 "step": 92000
1113 },
1114 {
1115 "epoch": 3.77,
1116 "learning_rate": 4.92503259452412e-06,
1117 "loss": 0.2186,
1118 "step": 92500
1119 },
1120 {
1121 "epoch": 3.79,
1122 "learning_rate": 4.843546284224251e-06,
1123 "loss": 0.2201,
1124 "step": 93000
1125 },
1126 {
1127 "epoch": 3.81,
1128 "learning_rate": 4.7620599739243815e-06,
1129 "loss": 0.2036,
1130 "step": 93500
1131 },
1132 {
1133 "epoch": 3.83,
1134 "learning_rate": 4.680573663624511e-06,
1135 "loss": 0.2319,
1136 "step": 94000
1137 },
1138 {
1139 "epoch": 3.85,
1140 "learning_rate": 4.599087353324641e-06,
1141 "loss": 0.2126,
1142 "step": 94500
1143 },
1144 {
1145 "epoch": 3.87,
1146 "learning_rate": 4.5176010430247726e-06,
1147 "loss": 0.2246,
1148 "step": 95000
1149 },
1150 {
1151 "epoch": 3.89,
1152 "learning_rate": 4.436114732724903e-06,
1153 "loss": 0.2269,
1154 "step": 95500
1155 },
1156 {
1157 "epoch": 3.91,
1158 "learning_rate": 4.354628422425033e-06,
1159 "loss": 0.2074,
1160 "step": 96000
1161 },
1162 {
1163 "epoch": 3.93,
1164 "learning_rate": 4.273142112125163e-06,
1165 "loss": 0.2219,
1166 "step": 96500
1167 },
1168 {
1169 "epoch": 3.95,
1170 "learning_rate": 4.191655801825294e-06,
1171 "loss": 0.1998,
1172 "step": 97000
1173 },
1174 {
1175 "epoch": 3.97,
1176 "learning_rate": 4.110169491525424e-06,
1177 "loss": 0.2124,
1178 "step": 97500
1179 },
1180 {
1181 "epoch": 3.99,
1182 "learning_rate": 4.028683181225555e-06,
1183 "loss": 0.2225,
1184 "step": 98000
1185 },
1186 {
1187 "epoch": 4.01,
1188 "learning_rate": 3.947196870925685e-06,
1189 "loss": 0.1845,
1190 "step": 98500
1191 },
1192 {
1193 "epoch": 4.03,
1194 "learning_rate": 3.865710560625815e-06,
1195 "loss": 0.156,
1196 "step": 99000
1197 },
1198 {
1199 "epoch": 4.05,
1200 "learning_rate": 3.7842242503259457e-06,
1201 "loss": 0.1695,
1202 "step": 99500
1203 },
1204 {
1205 "epoch": 4.07,
1206 "learning_rate": 3.702737940026076e-06,
1207 "loss": 0.1729,
1208 "step": 100000
1209 },
1210 {
1211 "epoch": 4.09,
1212 "learning_rate": 3.6212516297262064e-06,
1213 "loss": 0.159,
1214 "step": 100500
1215 },
1216 {
1217 "epoch": 4.12,
1218 "learning_rate": 3.5397653194263363e-06,
1219 "loss": 0.1735,
1220 "step": 101000
1221 },
1222 {
1223 "epoch": 4.14,
1224 "learning_rate": 3.4582790091264675e-06,
1225 "loss": 0.1683,
1226 "step": 101500
1227 },
1228 {
1229 "epoch": 4.16,
1230 "learning_rate": 3.3767926988265974e-06,
1231 "loss": 0.1734,
1232 "step": 102000
1233 },
1234 {
1235 "epoch": 4.18,
1236 "learning_rate": 3.2953063885267278e-06,
1237 "loss": 0.1575,
1238 "step": 102500
1239 },
1240 {
1241 "epoch": 4.2,
1242 "learning_rate": 3.213820078226858e-06,
1243 "loss": 0.1643,
1244 "step": 103000
1245 },
1246 {
1247 "epoch": 4.22,
1248 "learning_rate": 3.1323337679269885e-06,
1249 "loss": 0.1626,
1250 "step": 103500
1251 },
1252 {
1253 "epoch": 4.24,
1254 "learning_rate": 3.0508474576271192e-06,
1255 "loss": 0.1631,
1256 "step": 104000
1257 },
1258 {
1259 "epoch": 4.26,
1260 "learning_rate": 2.969361147327249e-06,
1261 "loss": 0.1731,
1262 "step": 104500
1263 },
1264 {
1265 "epoch": 4.28,
1266 "learning_rate": 2.8878748370273795e-06,
1267 "loss": 0.1729,
1268 "step": 105000
1269 },
1270 {
1271 "epoch": 4.3,
1272 "learning_rate": 2.80638852672751e-06,
1273 "loss": 0.1658,
1274 "step": 105500
1275 },
1276 {
1277 "epoch": 4.32,
1278 "learning_rate": 2.7249022164276406e-06,
1279 "loss": 0.1641,
1280 "step": 106000
1281 },
1282 {
1283 "epoch": 4.34,
1284 "learning_rate": 2.643415906127771e-06,
1285 "loss": 0.1716,
1286 "step": 106500
1287 },
1288 {
1289 "epoch": 4.36,
1290 "learning_rate": 2.5619295958279013e-06,
1291 "loss": 0.1765,
1292 "step": 107000
1293 },
1294 {
1295 "epoch": 4.38,
1296 "learning_rate": 2.4804432855280312e-06,
1297 "loss": 0.1713,
1298 "step": 107500
1299 },
1300 {
1301 "epoch": 4.4,
1302 "learning_rate": 2.398956975228162e-06,
1303 "loss": 0.1716,
1304 "step": 108000
1305 },
1306 {
1307 "epoch": 4.42,
1308 "learning_rate": 2.3174706649282924e-06,
1309 "loss": 0.1754,
1310 "step": 108500
1311 },
1312 {
1313 "epoch": 4.44,
1314 "learning_rate": 2.2359843546284227e-06,
1315 "loss": 0.1551,
1316 "step": 109000
1317 },
1318 {
1319 "epoch": 4.46,
1320 "learning_rate": 2.154498044328553e-06,
1321 "loss": 0.1704,
1322 "step": 109500
1323 },
1324 {
1325 "epoch": 4.48,
1326 "learning_rate": 2.0730117340286834e-06,
1327 "loss": 0.1744,
1328 "step": 110000
1329 },
1330 {
1331 "epoch": 4.5,
1332 "learning_rate": 1.9915254237288137e-06,
1333 "loss": 0.1608,
1334 "step": 110500
1335 },
1336 {
1337 "epoch": 4.52,
1338 "learning_rate": 1.910039113428944e-06,
1339 "loss": 0.1731,
1340 "step": 111000
1341 },
1342 {
1343 "epoch": 4.54,
1344 "learning_rate": 1.8285528031290744e-06,
1345 "loss": 0.1635,
1346 "step": 111500
1347 },
1348 {
1349 "epoch": 4.56,
1350 "learning_rate": 1.7470664928292048e-06,
1351 "loss": 0.1539,
1352 "step": 112000
1353 },
1354 {
1355 "epoch": 4.58,
1356 "learning_rate": 1.6655801825293353e-06,
1357 "loss": 0.1608,
1358 "step": 112500
1359 },
1360 {
1361 "epoch": 4.6,
1362 "learning_rate": 1.5840938722294655e-06,
1363 "loss": 0.1732,
1364 "step": 113000
1365 },
1366 {
1367 "epoch": 4.62,
1368 "learning_rate": 1.502607561929596e-06,
1369 "loss": 0.1554,
1370 "step": 113500
1371 },
1372 {
1373 "epoch": 4.64,
1374 "learning_rate": 1.4211212516297262e-06,
1375 "loss": 0.1719,
1376 "step": 114000
1377 },
1378 {
1379 "epoch": 4.67,
1380 "learning_rate": 1.3396349413298567e-06,
1381 "loss": 0.1605,
1382 "step": 114500
1383 },
1384 {
1385 "epoch": 4.69,
1386 "learning_rate": 1.258148631029987e-06,
1387 "loss": 0.1698,
1388 "step": 115000
1389 },
1390 {
1391 "epoch": 4.71,
1392 "learning_rate": 1.1766623207301174e-06,
1393 "loss": 0.1686,
1394 "step": 115500
1395 },
1396 {
1397 "epoch": 4.73,
1398 "learning_rate": 1.0951760104302478e-06,
1399 "loss": 0.1651,
1400 "step": 116000
1401 },
1402 {
1403 "epoch": 4.75,
1404 "learning_rate": 1.0136897001303781e-06,
1405 "loss": 0.1647,
1406 "step": 116500
1407 },
1408 {
1409 "epoch": 4.77,
1410 "learning_rate": 9.322033898305086e-07,
1411 "loss": 0.1735,
1412 "step": 117000
1413 },
1414 {
1415 "epoch": 4.79,
1416 "learning_rate": 8.507170795306389e-07,
1417 "loss": 0.1578,
1418 "step": 117500
1419 },
1420 {
1421 "epoch": 4.81,
1422 "learning_rate": 7.692307692307694e-07,
1423 "loss": 0.1729,
1424 "step": 118000
1425 },
1426 {
1427 "epoch": 4.83,
1428 "learning_rate": 6.877444589308997e-07,
1429 "loss": 0.1682,
1430 "step": 118500
1431 },
1432 {
1433 "epoch": 4.85,
1434 "learning_rate": 6.0625814863103e-07,
1435 "loss": 0.1585,
1436 "step": 119000
1437 },
1438 {
1439 "epoch": 4.87,
1440 "learning_rate": 5.247718383311604e-07,
1441 "loss": 0.1605,
1442 "step": 119500
1443 },
1444 {
1445 "epoch": 4.89,
1446 "learning_rate": 4.432855280312908e-07,
1447 "loss": 0.1629,
1448 "step": 120000
1449 },
1450 {
1451 "epoch": 4.91,
1452 "learning_rate": 3.6179921773142114e-07,
1453 "loss": 0.1504,
1454 "step": 120500
1455 },
1456 {
1457 "epoch": 4.93,
1458 "learning_rate": 2.803129074315515e-07,
1459 "loss": 0.1706,
1460 "step": 121000
1461 },
1462 {
1463 "epoch": 4.95,
1464 "learning_rate": 1.988265971316819e-07,
1465 "loss": 0.16,
1466 "step": 121500
1467 },
1468 {
1469 "epoch": 4.97,
1470 "learning_rate": 1.1734028683181226e-07,
1471 "loss": 0.1641,
1472 "step": 122000
1473 },
1474 {
1475 "epoch": 4.99,
1476 "learning_rate": 3.585397653194264e-08,
1477 "loss": 0.1709,
1478 "step": 122500
1479 },
1480 {
1481 "epoch": 5.0,
1482 "step": 122720,
1483 "total_flos": 98499428530398720,
1484 "train_runtime": 14248.9446,
1485 "train_samples_per_second": 8.613
1486 }
1487 ],
1488 "max_steps": 122720,
1489 "num_train_epochs": 5,
1490 "total_flos": 98499428530398720,
1491 "trial_name": null,
1492 "trial_params": null
1493 }
1494