trainer_state.json
| 1 | { |
| 2 | "best_metric": 0.03933868557214737, |
| 3 | "best_model_checkpoint": "./rorshark_outputs/checkpoint-1840", |
| 4 | "epoch": 5.0, |
| 5 | "eval_steps": 500, |
| 6 | "global_step": 1840, |
| 7 | "is_hyper_param_search": false, |
| 8 | "is_local_process_zero": true, |
| 9 | "is_world_process_zero": true, |
| 10 | "log_history": [ |
| 11 | { |
| 12 | "epoch": 0.03, |
| 13 | "learning_rate": 1.989130434782609e-05, |
| 14 | "loss": 0.5675, |
| 15 | "step": 10 |
| 16 | }, |
| 17 | { |
| 18 | "epoch": 0.05, |
| 19 | "learning_rate": 1.9782608695652176e-05, |
| 20 | "loss": 0.4112, |
| 21 | "step": 20 |
| 22 | }, |
| 23 | { |
| 24 | "epoch": 0.08, |
| 25 | "learning_rate": 1.9673913043478263e-05, |
| 26 | "loss": 0.3002, |
| 27 | "step": 30 |
| 28 | }, |
| 29 | { |
| 30 | "epoch": 0.11, |
| 31 | "learning_rate": 1.956521739130435e-05, |
| 32 | "loss": 0.3774, |
| 33 | "step": 40 |
| 34 | }, |
| 35 | { |
| 36 | "epoch": 0.14, |
| 37 | "learning_rate": 1.9456521739130436e-05, |
| 38 | "loss": 0.3295, |
| 39 | "step": 50 |
| 40 | }, |
| 41 | { |
| 42 | "epoch": 0.16, |
| 43 | "learning_rate": 1.9347826086956523e-05, |
| 44 | "loss": 0.3067, |
| 45 | "step": 60 |
| 46 | }, |
| 47 | { |
| 48 | "epoch": 0.19, |
| 49 | "learning_rate": 1.923913043478261e-05, |
| 50 | "loss": 0.2457, |
| 51 | "step": 70 |
| 52 | }, |
| 53 | { |
| 54 | "epoch": 0.22, |
| 55 | "learning_rate": 1.9130434782608697e-05, |
| 56 | "loss": 0.3318, |
| 57 | "step": 80 |
| 58 | }, |
| 59 | { |
| 60 | "epoch": 0.24, |
| 61 | "learning_rate": 1.9021739130434784e-05, |
| 62 | "loss": 0.1932, |
| 63 | "step": 90 |
| 64 | }, |
| 65 | { |
| 66 | "epoch": 0.27, |
| 67 | "learning_rate": 1.891304347826087e-05, |
| 68 | "loss": 0.1948, |
| 69 | "step": 100 |
| 70 | }, |
| 71 | { |
| 72 | "epoch": 0.3, |
| 73 | "learning_rate": 1.8804347826086958e-05, |
| 74 | "loss": 0.2475, |
| 75 | "step": 110 |
| 76 | }, |
| 77 | { |
| 78 | "epoch": 0.33, |
| 79 | "learning_rate": 1.8695652173913045e-05, |
| 80 | "loss": 0.1432, |
| 81 | "step": 120 |
| 82 | }, |
| 83 | { |
| 84 | "epoch": 0.35, |
| 85 | "learning_rate": 1.8586956521739132e-05, |
| 86 | "loss": 0.2069, |
| 87 | "step": 130 |
| 88 | }, |
| 89 | { |
| 90 | "epoch": 0.38, |
| 91 | "learning_rate": 1.847826086956522e-05, |
| 92 | "loss": 0.1986, |
| 93 | "step": 140 |
| 94 | }, |
| 95 | { |
| 96 | "epoch": 0.41, |
| 97 | "learning_rate": 1.8369565217391306e-05, |
| 98 | "loss": 0.2156, |
| 99 | "step": 150 |
| 100 | }, |
| 101 | { |
| 102 | "epoch": 0.43, |
| 103 | "learning_rate": 1.8260869565217393e-05, |
| 104 | "loss": 0.1187, |
| 105 | "step": 160 |
| 106 | }, |
| 107 | { |
| 108 | "epoch": 0.46, |
| 109 | "learning_rate": 1.815217391304348e-05, |
| 110 | "loss": 0.1192, |
| 111 | "step": 170 |
| 112 | }, |
| 113 | { |
| 114 | "epoch": 0.49, |
| 115 | "learning_rate": 1.8043478260869567e-05, |
| 116 | "loss": 0.1748, |
| 117 | "step": 180 |
| 118 | }, |
| 119 | { |
| 120 | "epoch": 0.52, |
| 121 | "learning_rate": 1.7934782608695654e-05, |
| 122 | "loss": 0.0779, |
| 123 | "step": 190 |
| 124 | }, |
| 125 | { |
| 126 | "epoch": 0.54, |
| 127 | "learning_rate": 1.782608695652174e-05, |
| 128 | "loss": 0.1075, |
| 129 | "step": 200 |
| 130 | }, |
| 131 | { |
| 132 | "epoch": 0.57, |
| 133 | "learning_rate": 1.7717391304347828e-05, |
| 134 | "loss": 0.1298, |
| 135 | "step": 210 |
| 136 | }, |
| 137 | { |
| 138 | "epoch": 0.6, |
| 139 | "learning_rate": 1.7608695652173915e-05, |
| 140 | "loss": 0.0728, |
| 141 | "step": 220 |
| 142 | }, |
| 143 | { |
| 144 | "epoch": 0.62, |
| 145 | "learning_rate": 1.7500000000000002e-05, |
| 146 | "loss": 0.1189, |
| 147 | "step": 230 |
| 148 | }, |
| 149 | { |
| 150 | "epoch": 0.65, |
| 151 | "learning_rate": 1.739130434782609e-05, |
| 152 | "loss": 0.1102, |
| 153 | "step": 240 |
| 154 | }, |
| 155 | { |
| 156 | "epoch": 0.68, |
| 157 | "learning_rate": 1.7282608695652176e-05, |
| 158 | "loss": 0.1183, |
| 159 | "step": 250 |
| 160 | }, |
| 161 | { |
| 162 | "epoch": 0.71, |
| 163 | "learning_rate": 1.7173913043478263e-05, |
| 164 | "loss": 0.3006, |
| 165 | "step": 260 |
| 166 | }, |
| 167 | { |
| 168 | "epoch": 0.73, |
| 169 | "learning_rate": 1.706521739130435e-05, |
| 170 | "loss": 0.1408, |
| 171 | "step": 270 |
| 172 | }, |
| 173 | { |
| 174 | "epoch": 0.76, |
| 175 | "learning_rate": 1.6956521739130437e-05, |
| 176 | "loss": 0.141, |
| 177 | "step": 280 |
| 178 | }, |
| 179 | { |
| 180 | "epoch": 0.79, |
| 181 | "learning_rate": 1.6847826086956524e-05, |
| 182 | "loss": 0.1208, |
| 183 | "step": 290 |
| 184 | }, |
| 185 | { |
| 186 | "epoch": 0.82, |
| 187 | "learning_rate": 1.673913043478261e-05, |
| 188 | "loss": 0.1004, |
| 189 | "step": 300 |
| 190 | }, |
| 191 | { |
| 192 | "epoch": 0.84, |
| 193 | "learning_rate": 1.6630434782608698e-05, |
| 194 | "loss": 0.206, |
| 195 | "step": 310 |
| 196 | }, |
| 197 | { |
| 198 | "epoch": 0.87, |
| 199 | "learning_rate": 1.6521739130434785e-05, |
| 200 | "loss": 0.12, |
| 201 | "step": 320 |
| 202 | }, |
| 203 | { |
| 204 | "epoch": 0.9, |
| 205 | "learning_rate": 1.641304347826087e-05, |
| 206 | "loss": 0.0705, |
| 207 | "step": 330 |
| 208 | }, |
| 209 | { |
| 210 | "epoch": 0.92, |
| 211 | "learning_rate": 1.630434782608696e-05, |
| 212 | "loss": 0.1018, |
| 213 | "step": 340 |
| 214 | }, |
| 215 | { |
| 216 | "epoch": 0.95, |
| 217 | "learning_rate": 1.6195652173913045e-05, |
| 218 | "loss": 0.1501, |
| 219 | "step": 350 |
| 220 | }, |
| 221 | { |
| 222 | "epoch": 0.98, |
| 223 | "learning_rate": 1.6086956521739132e-05, |
| 224 | "loss": 0.0597, |
| 225 | "step": 360 |
| 226 | }, |
| 227 | { |
| 228 | "epoch": 1.0, |
| 229 | "eval_accuracy": 0.9865125240847784, |
| 230 | "eval_loss": 0.05456383526325226, |
| 231 | "eval_runtime": 6.5116, |
| 232 | "eval_samples_per_second": 79.704, |
| 233 | "eval_steps_per_second": 9.982, |
| 234 | "step": 368 |
| 235 | }, |
| 236 | { |
| 237 | "epoch": 1.01, |
| 238 | "learning_rate": 1.597826086956522e-05, |
| 239 | "loss": 0.0878, |
| 240 | "step": 370 |
| 241 | }, |
| 242 | { |
| 243 | "epoch": 1.03, |
| 244 | "learning_rate": 1.5869565217391306e-05, |
| 245 | "loss": 0.1651, |
| 246 | "step": 380 |
| 247 | }, |
| 248 | { |
| 249 | "epoch": 1.06, |
| 250 | "learning_rate": 1.576086956521739e-05, |
| 251 | "loss": 0.0645, |
| 252 | "step": 390 |
| 253 | }, |
| 254 | { |
| 255 | "epoch": 1.09, |
| 256 | "learning_rate": 1.565217391304348e-05, |
| 257 | "loss": 0.1085, |
| 258 | "step": 400 |
| 259 | }, |
| 260 | { |
| 261 | "epoch": 1.11, |
| 262 | "learning_rate": 1.5543478260869567e-05, |
| 263 | "loss": 0.0967, |
| 264 | "step": 410 |
| 265 | }, |
| 266 | { |
| 267 | "epoch": 1.14, |
| 268 | "learning_rate": 1.5434782608695654e-05, |
| 269 | "loss": 0.1178, |
| 270 | "step": 420 |
| 271 | }, |
| 272 | { |
| 273 | "epoch": 1.17, |
| 274 | "learning_rate": 1.532608695652174e-05, |
| 275 | "loss": 0.0605, |
| 276 | "step": 430 |
| 277 | }, |
| 278 | { |
| 279 | "epoch": 1.2, |
| 280 | "learning_rate": 1.5217391304347828e-05, |
| 281 | "loss": 0.1394, |
| 282 | "step": 440 |
| 283 | }, |
| 284 | { |
| 285 | "epoch": 1.22, |
| 286 | "learning_rate": 1.5108695652173915e-05, |
| 287 | "loss": 0.1113, |
| 288 | "step": 450 |
| 289 | }, |
| 290 | { |
| 291 | "epoch": 1.25, |
| 292 | "learning_rate": 1.5000000000000002e-05, |
| 293 | "loss": 0.0225, |
| 294 | "step": 460 |
| 295 | }, |
| 296 | { |
| 297 | "epoch": 1.28, |
| 298 | "learning_rate": 1.4891304347826087e-05, |
| 299 | "loss": 0.1861, |
| 300 | "step": 470 |
| 301 | }, |
| 302 | { |
| 303 | "epoch": 1.3, |
| 304 | "learning_rate": 1.4782608695652174e-05, |
| 305 | "loss": 0.0879, |
| 306 | "step": 480 |
| 307 | }, |
| 308 | { |
| 309 | "epoch": 1.33, |
| 310 | "learning_rate": 1.4673913043478263e-05, |
| 311 | "loss": 0.094, |
| 312 | "step": 490 |
| 313 | }, |
| 314 | { |
| 315 | "epoch": 1.36, |
| 316 | "learning_rate": 1.456521739130435e-05, |
| 317 | "loss": 0.1837, |
| 318 | "step": 500 |
| 319 | }, |
| 320 | { |
| 321 | "epoch": 1.39, |
| 322 | "learning_rate": 1.4456521739130435e-05, |
| 323 | "loss": 0.057, |
| 324 | "step": 510 |
| 325 | }, |
| 326 | { |
| 327 | "epoch": 1.41, |
| 328 | "learning_rate": 1.4347826086956522e-05, |
| 329 | "loss": 0.0504, |
| 330 | "step": 520 |
| 331 | }, |
| 332 | { |
| 333 | "epoch": 1.44, |
| 334 | "learning_rate": 1.423913043478261e-05, |
| 335 | "loss": 0.03, |
| 336 | "step": 530 |
| 337 | }, |
| 338 | { |
| 339 | "epoch": 1.47, |
| 340 | "learning_rate": 1.4130434782608698e-05, |
| 341 | "loss": 0.0637, |
| 342 | "step": 540 |
| 343 | }, |
| 344 | { |
| 345 | "epoch": 1.49, |
| 346 | "learning_rate": 1.4021739130434783e-05, |
| 347 | "loss": 0.1572, |
| 348 | "step": 550 |
| 349 | }, |
| 350 | { |
| 351 | "epoch": 1.52, |
| 352 | "learning_rate": 1.391304347826087e-05, |
| 353 | "loss": 0.2074, |
| 354 | "step": 560 |
| 355 | }, |
| 356 | { |
| 357 | "epoch": 1.55, |
| 358 | "learning_rate": 1.3804347826086957e-05, |
| 359 | "loss": 0.1031, |
| 360 | "step": 570 |
| 361 | }, |
| 362 | { |
| 363 | "epoch": 1.58, |
| 364 | "learning_rate": 1.3695652173913046e-05, |
| 365 | "loss": 0.075, |
| 366 | "step": 580 |
| 367 | }, |
| 368 | { |
| 369 | "epoch": 1.6, |
| 370 | "learning_rate": 1.3586956521739133e-05, |
| 371 | "loss": 0.0854, |
| 372 | "step": 590 |
| 373 | }, |
| 374 | { |
| 375 | "epoch": 1.63, |
| 376 | "learning_rate": 1.3478260869565218e-05, |
| 377 | "loss": 0.0897, |
| 378 | "step": 600 |
| 379 | }, |
| 380 | { |
| 381 | "epoch": 1.66, |
| 382 | "learning_rate": 1.3369565217391305e-05, |
| 383 | "loss": 0.1017, |
| 384 | "step": 610 |
| 385 | }, |
| 386 | { |
| 387 | "epoch": 1.68, |
| 388 | "learning_rate": 1.3260869565217392e-05, |
| 389 | "loss": 0.132, |
| 390 | "step": 620 |
| 391 | }, |
| 392 | { |
| 393 | "epoch": 1.71, |
| 394 | "learning_rate": 1.315217391304348e-05, |
| 395 | "loss": 0.0471, |
| 396 | "step": 630 |
| 397 | }, |
| 398 | { |
| 399 | "epoch": 1.74, |
| 400 | "learning_rate": 1.3043478260869566e-05, |
| 401 | "loss": 0.0707, |
| 402 | "step": 640 |
| 403 | }, |
| 404 | { |
| 405 | "epoch": 1.77, |
| 406 | "learning_rate": 1.2934782608695653e-05, |
| 407 | "loss": 0.0506, |
| 408 | "step": 650 |
| 409 | }, |
| 410 | { |
| 411 | "epoch": 1.79, |
| 412 | "learning_rate": 1.282608695652174e-05, |
| 413 | "loss": 0.1308, |
| 414 | "step": 660 |
| 415 | }, |
| 416 | { |
| 417 | "epoch": 1.82, |
| 418 | "learning_rate": 1.2717391304347828e-05, |
| 419 | "loss": 0.1188, |
| 420 | "step": 670 |
| 421 | }, |
| 422 | { |
| 423 | "epoch": 1.85, |
| 424 | "learning_rate": 1.2608695652173915e-05, |
| 425 | "loss": 0.1021, |
| 426 | "step": 680 |
| 427 | }, |
| 428 | { |
| 429 | "epoch": 1.88, |
| 430 | "learning_rate": 1.25e-05, |
| 431 | "loss": 0.1199, |
| 432 | "step": 690 |
| 433 | }, |
| 434 | { |
| 435 | "epoch": 1.9, |
| 436 | "learning_rate": 1.2391304347826088e-05, |
| 437 | "loss": 0.1068, |
| 438 | "step": 700 |
| 439 | }, |
| 440 | { |
| 441 | "epoch": 1.93, |
| 442 | "learning_rate": 1.2282608695652175e-05, |
| 443 | "loss": 0.0535, |
| 444 | "step": 710 |
| 445 | }, |
| 446 | { |
| 447 | "epoch": 1.96, |
| 448 | "learning_rate": 1.2173913043478263e-05, |
| 449 | "loss": 0.0723, |
| 450 | "step": 720 |
| 451 | }, |
| 452 | { |
| 453 | "epoch": 1.98, |
| 454 | "learning_rate": 1.2065217391304348e-05, |
| 455 | "loss": 0.2009, |
| 456 | "step": 730 |
| 457 | }, |
| 458 | { |
| 459 | "epoch": 2.0, |
| 460 | "eval_accuracy": 0.9865125240847784, |
| 461 | "eval_loss": 0.05307452380657196, |
| 462 | "eval_runtime": 6.4841, |
| 463 | "eval_samples_per_second": 80.043, |
| 464 | "eval_steps_per_second": 10.025, |
| 465 | "step": 736 |
| 466 | }, |
| 467 | { |
| 468 | "epoch": 2.01, |
| 469 | "learning_rate": 1.1956521739130435e-05, |
| 470 | "loss": 0.0156, |
| 471 | "step": 740 |
| 472 | }, |
| 473 | { |
| 474 | "epoch": 2.04, |
| 475 | "learning_rate": 1.1847826086956522e-05, |
| 476 | "loss": 0.169, |
| 477 | "step": 750 |
| 478 | }, |
| 479 | { |
| 480 | "epoch": 2.07, |
| 481 | "learning_rate": 1.1739130434782611e-05, |
| 482 | "loss": 0.0866, |
| 483 | "step": 760 |
| 484 | }, |
| 485 | { |
| 486 | "epoch": 2.09, |
| 487 | "learning_rate": 1.1630434782608698e-05, |
| 488 | "loss": 0.0973, |
| 489 | "step": 770 |
| 490 | }, |
| 491 | { |
| 492 | "epoch": 2.12, |
| 493 | "learning_rate": 1.1521739130434783e-05, |
| 494 | "loss": 0.0427, |
| 495 | "step": 780 |
| 496 | }, |
| 497 | { |
| 498 | "epoch": 2.15, |
| 499 | "learning_rate": 1.141304347826087e-05, |
| 500 | "loss": 0.1296, |
| 501 | "step": 790 |
| 502 | }, |
| 503 | { |
| 504 | "epoch": 2.17, |
| 505 | "learning_rate": 1.1304347826086957e-05, |
| 506 | "loss": 0.0265, |
| 507 | "step": 800 |
| 508 | }, |
| 509 | { |
| 510 | "epoch": 2.2, |
| 511 | "learning_rate": 1.1195652173913046e-05, |
| 512 | "loss": 0.1574, |
| 513 | "step": 810 |
| 514 | }, |
| 515 | { |
| 516 | "epoch": 2.23, |
| 517 | "learning_rate": 1.1086956521739131e-05, |
| 518 | "loss": 0.0655, |
| 519 | "step": 820 |
| 520 | }, |
| 521 | { |
| 522 | "epoch": 2.26, |
| 523 | "learning_rate": 1.0978260869565218e-05, |
| 524 | "loss": 0.0785, |
| 525 | "step": 830 |
| 526 | }, |
| 527 | { |
| 528 | "epoch": 2.28, |
| 529 | "learning_rate": 1.0869565217391305e-05, |
| 530 | "loss": 0.1273, |
| 531 | "step": 840 |
| 532 | }, |
| 533 | { |
| 534 | "epoch": 2.31, |
| 535 | "learning_rate": 1.076086956521739e-05, |
| 536 | "loss": 0.0374, |
| 537 | "step": 850 |
| 538 | }, |
| 539 | { |
| 540 | "epoch": 2.34, |
| 541 | "learning_rate": 1.0652173913043479e-05, |
| 542 | "loss": 0.2576, |
| 543 | "step": 860 |
| 544 | }, |
| 545 | { |
| 546 | "epoch": 2.36, |
| 547 | "learning_rate": 1.0543478260869566e-05, |
| 548 | "loss": 0.0417, |
| 549 | "step": 870 |
| 550 | }, |
| 551 | { |
| 552 | "epoch": 2.39, |
| 553 | "learning_rate": 1.0434782608695653e-05, |
| 554 | "loss": 0.115, |
| 555 | "step": 880 |
| 556 | }, |
| 557 | { |
| 558 | "epoch": 2.42, |
| 559 | "learning_rate": 1.032608695652174e-05, |
| 560 | "loss": 0.105, |
| 561 | "step": 890 |
| 562 | }, |
| 563 | { |
| 564 | "epoch": 2.45, |
| 565 | "learning_rate": 1.0217391304347829e-05, |
| 566 | "loss": 0.1704, |
| 567 | "step": 900 |
| 568 | }, |
| 569 | { |
| 570 | "epoch": 2.47, |
| 571 | "learning_rate": 1.0108695652173914e-05, |
| 572 | "loss": 0.0442, |
| 573 | "step": 910 |
| 574 | }, |
| 575 | { |
| 576 | "epoch": 2.5, |
| 577 | "learning_rate": 1e-05, |
| 578 | "loss": 0.079, |
| 579 | "step": 920 |
| 580 | }, |
| 581 | { |
| 582 | "epoch": 2.53, |
| 583 | "learning_rate": 9.891304347826088e-06, |
| 584 | "loss": 0.0214, |
| 585 | "step": 930 |
| 586 | }, |
| 587 | { |
| 588 | "epoch": 2.55, |
| 589 | "learning_rate": 9.782608695652175e-06, |
| 590 | "loss": 0.112, |
| 591 | "step": 940 |
| 592 | }, |
| 593 | { |
| 594 | "epoch": 2.58, |
| 595 | "learning_rate": 9.673913043478262e-06, |
| 596 | "loss": 0.0467, |
| 597 | "step": 950 |
| 598 | }, |
| 599 | { |
| 600 | "epoch": 2.61, |
| 601 | "learning_rate": 9.565217391304349e-06, |
| 602 | "loss": 0.0944, |
| 603 | "step": 960 |
| 604 | }, |
| 605 | { |
| 606 | "epoch": 2.64, |
| 607 | "learning_rate": 9.456521739130436e-06, |
| 608 | "loss": 0.0195, |
| 609 | "step": 970 |
| 610 | }, |
| 611 | { |
| 612 | "epoch": 2.66, |
| 613 | "learning_rate": 9.347826086956523e-06, |
| 614 | "loss": 0.1084, |
| 615 | "step": 980 |
| 616 | }, |
| 617 | { |
| 618 | "epoch": 2.69, |
| 619 | "learning_rate": 9.23913043478261e-06, |
| 620 | "loss": 0.0598, |
| 621 | "step": 990 |
| 622 | }, |
| 623 | { |
| 624 | "epoch": 2.72, |
| 625 | "learning_rate": 9.130434782608697e-06, |
| 626 | "loss": 0.0563, |
| 627 | "step": 1000 |
| 628 | }, |
| 629 | { |
| 630 | "epoch": 2.74, |
| 631 | "learning_rate": 9.021739130434784e-06, |
| 632 | "loss": 0.1212, |
| 633 | "step": 1010 |
| 634 | }, |
| 635 | { |
| 636 | "epoch": 2.77, |
| 637 | "learning_rate": 8.91304347826087e-06, |
| 638 | "loss": 0.103, |
| 639 | "step": 1020 |
| 640 | }, |
| 641 | { |
| 642 | "epoch": 2.8, |
| 643 | "learning_rate": 8.804347826086957e-06, |
| 644 | "loss": 0.0708, |
| 645 | "step": 1030 |
| 646 | }, |
| 647 | { |
| 648 | "epoch": 2.83, |
| 649 | "learning_rate": 8.695652173913044e-06, |
| 650 | "loss": 0.0639, |
| 651 | "step": 1040 |
| 652 | }, |
| 653 | { |
| 654 | "epoch": 2.85, |
| 655 | "learning_rate": 8.586956521739131e-06, |
| 656 | "loss": 0.0153, |
| 657 | "step": 1050 |
| 658 | }, |
| 659 | { |
| 660 | "epoch": 2.88, |
| 661 | "learning_rate": 8.478260869565218e-06, |
| 662 | "loss": 0.028, |
| 663 | "step": 1060 |
| 664 | }, |
| 665 | { |
| 666 | "epoch": 2.91, |
| 667 | "learning_rate": 8.369565217391305e-06, |
| 668 | "loss": 0.029, |
| 669 | "step": 1070 |
| 670 | }, |
| 671 | { |
| 672 | "epoch": 2.93, |
| 673 | "learning_rate": 8.260869565217392e-06, |
| 674 | "loss": 0.0915, |
| 675 | "step": 1080 |
| 676 | }, |
| 677 | { |
| 678 | "epoch": 2.96, |
| 679 | "learning_rate": 8.15217391304348e-06, |
| 680 | "loss": 0.0186, |
| 681 | "step": 1090 |
| 682 | }, |
| 683 | { |
| 684 | "epoch": 2.99, |
| 685 | "learning_rate": 8.043478260869566e-06, |
| 686 | "loss": 0.0114, |
| 687 | "step": 1100 |
| 688 | }, |
| 689 | { |
| 690 | "epoch": 3.0, |
| 691 | "eval_accuracy": 0.9903660886319846, |
| 692 | "eval_loss": 0.04182479530572891, |
| 693 | "eval_runtime": 6.3668, |
| 694 | "eval_samples_per_second": 81.517, |
| 695 | "eval_steps_per_second": 10.209, |
| 696 | "step": 1104 |
| 697 | }, |
| 698 | { |
| 699 | "epoch": 3.02, |
| 700 | "learning_rate": 7.934782608695653e-06, |
| 701 | "loss": 0.2106, |
| 702 | "step": 1110 |
| 703 | }, |
| 704 | { |
| 705 | "epoch": 3.04, |
| 706 | "learning_rate": 7.82608695652174e-06, |
| 707 | "loss": 0.0515, |
| 708 | "step": 1120 |
| 709 | }, |
| 710 | { |
| 711 | "epoch": 3.07, |
| 712 | "learning_rate": 7.717391304347827e-06, |
| 713 | "loss": 0.0406, |
| 714 | "step": 1130 |
| 715 | }, |
| 716 | { |
| 717 | "epoch": 3.1, |
| 718 | "learning_rate": 7.608695652173914e-06, |
| 719 | "loss": 0.0355, |
| 720 | "step": 1140 |
| 721 | }, |
| 722 | { |
| 723 | "epoch": 3.12, |
| 724 | "learning_rate": 7.500000000000001e-06, |
| 725 | "loss": 0.1842, |
| 726 | "step": 1150 |
| 727 | }, |
| 728 | { |
| 729 | "epoch": 3.15, |
| 730 | "learning_rate": 7.391304347826087e-06, |
| 731 | "loss": 0.0545, |
| 732 | "step": 1160 |
| 733 | }, |
| 734 | { |
| 735 | "epoch": 3.18, |
| 736 | "learning_rate": 7.282608695652175e-06, |
| 737 | "loss": 0.1349, |
| 738 | "step": 1170 |
| 739 | }, |
| 740 | { |
| 741 | "epoch": 3.21, |
| 742 | "learning_rate": 7.173913043478261e-06, |
| 743 | "loss": 0.0104, |
| 744 | "step": 1180 |
| 745 | }, |
| 746 | { |
| 747 | "epoch": 3.23, |
| 748 | "learning_rate": 7.065217391304349e-06, |
| 749 | "loss": 0.1324, |
| 750 | "step": 1190 |
| 751 | }, |
| 752 | { |
| 753 | "epoch": 3.26, |
| 754 | "learning_rate": 6.956521739130435e-06, |
| 755 | "loss": 0.0934, |
| 756 | "step": 1200 |
| 757 | }, |
| 758 | { |
| 759 | "epoch": 3.29, |
| 760 | "learning_rate": 6.847826086956523e-06, |
| 761 | "loss": 0.0966, |
| 762 | "step": 1210 |
| 763 | }, |
| 764 | { |
| 765 | "epoch": 3.32, |
| 766 | "learning_rate": 6.739130434782609e-06, |
| 767 | "loss": 0.0588, |
| 768 | "step": 1220 |
| 769 | }, |
| 770 | { |
| 771 | "epoch": 3.34, |
| 772 | "learning_rate": 6.630434782608696e-06, |
| 773 | "loss": 0.0802, |
| 774 | "step": 1230 |
| 775 | }, |
| 776 | { |
| 777 | "epoch": 3.37, |
| 778 | "learning_rate": 6.521739130434783e-06, |
| 779 | "loss": 0.0576, |
| 780 | "step": 1240 |
| 781 | }, |
| 782 | { |
| 783 | "epoch": 3.4, |
| 784 | "learning_rate": 6.41304347826087e-06, |
| 785 | "loss": 0.0419, |
| 786 | "step": 1250 |
| 787 | }, |
| 788 | { |
| 789 | "epoch": 3.42, |
| 790 | "learning_rate": 6.304347826086958e-06, |
| 791 | "loss": 0.0481, |
| 792 | "step": 1260 |
| 793 | }, |
| 794 | { |
| 795 | "epoch": 3.45, |
| 796 | "learning_rate": 6.195652173913044e-06, |
| 797 | "loss": 0.0861, |
| 798 | "step": 1270 |
| 799 | }, |
| 800 | { |
| 801 | "epoch": 3.48, |
| 802 | "learning_rate": 6.086956521739132e-06, |
| 803 | "loss": 0.1023, |
| 804 | "step": 1280 |
| 805 | }, |
| 806 | { |
| 807 | "epoch": 3.51, |
| 808 | "learning_rate": 5.978260869565218e-06, |
| 809 | "loss": 0.0584, |
| 810 | "step": 1290 |
| 811 | }, |
| 812 | { |
| 813 | "epoch": 3.53, |
| 814 | "learning_rate": 5.8695652173913055e-06, |
| 815 | "loss": 0.1282, |
| 816 | "step": 1300 |
| 817 | }, |
| 818 | { |
| 819 | "epoch": 3.56, |
| 820 | "learning_rate": 5.760869565217392e-06, |
| 821 | "loss": 0.0277, |
| 822 | "step": 1310 |
| 823 | }, |
| 824 | { |
| 825 | "epoch": 3.59, |
| 826 | "learning_rate": 5.652173913043479e-06, |
| 827 | "loss": 0.1837, |
| 828 | "step": 1320 |
| 829 | }, |
| 830 | { |
| 831 | "epoch": 3.61, |
| 832 | "learning_rate": 5.543478260869566e-06, |
| 833 | "loss": 0.0264, |
| 834 | "step": 1330 |
| 835 | }, |
| 836 | { |
| 837 | "epoch": 3.64, |
| 838 | "learning_rate": 5.4347826086956525e-06, |
| 839 | "loss": 0.1224, |
| 840 | "step": 1340 |
| 841 | }, |
| 842 | { |
| 843 | "epoch": 3.67, |
| 844 | "learning_rate": 5.3260869565217395e-06, |
| 845 | "loss": 0.0434, |
| 846 | "step": 1350 |
| 847 | }, |
| 848 | { |
| 849 | "epoch": 3.7, |
| 850 | "learning_rate": 5.2173913043478265e-06, |
| 851 | "loss": 0.1337, |
| 852 | "step": 1360 |
| 853 | }, |
| 854 | { |
| 855 | "epoch": 3.72, |
| 856 | "learning_rate": 5.108695652173914e-06, |
| 857 | "loss": 0.0071, |
| 858 | "step": 1370 |
| 859 | }, |
| 860 | { |
| 861 | "epoch": 3.75, |
| 862 | "learning_rate": 5e-06, |
| 863 | "loss": 0.0568, |
| 864 | "step": 1380 |
| 865 | }, |
| 866 | { |
| 867 | "epoch": 3.78, |
| 868 | "learning_rate": 4.891304347826087e-06, |
| 869 | "loss": 0.043, |
| 870 | "step": 1390 |
| 871 | }, |
| 872 | { |
| 873 | "epoch": 3.8, |
| 874 | "learning_rate": 4.782608695652174e-06, |
| 875 | "loss": 0.0719, |
| 876 | "step": 1400 |
| 877 | }, |
| 878 | { |
| 879 | "epoch": 3.83, |
| 880 | "learning_rate": 4.673913043478261e-06, |
| 881 | "loss": 0.1128, |
| 882 | "step": 1410 |
| 883 | }, |
| 884 | { |
| 885 | "epoch": 3.86, |
| 886 | "learning_rate": 4.565217391304348e-06, |
| 887 | "loss": 0.0477, |
| 888 | "step": 1420 |
| 889 | }, |
| 890 | { |
| 891 | "epoch": 3.89, |
| 892 | "learning_rate": 4.456521739130435e-06, |
| 893 | "loss": 0.0791, |
| 894 | "step": 1430 |
| 895 | }, |
| 896 | { |
| 897 | "epoch": 3.91, |
| 898 | "learning_rate": 4.347826086956522e-06, |
| 899 | "loss": 0.037, |
| 900 | "step": 1440 |
| 901 | }, |
| 902 | { |
| 903 | "epoch": 3.94, |
| 904 | "learning_rate": 4.239130434782609e-06, |
| 905 | "loss": 0.1466, |
| 906 | "step": 1450 |
| 907 | }, |
| 908 | { |
| 909 | "epoch": 3.97, |
| 910 | "learning_rate": 4.130434782608696e-06, |
| 911 | "loss": 0.0467, |
| 912 | "step": 1460 |
| 913 | }, |
| 914 | { |
| 915 | "epoch": 3.99, |
| 916 | "learning_rate": 4.021739130434783e-06, |
| 917 | "loss": 0.0998, |
| 918 | "step": 1470 |
| 919 | }, |
| 920 | { |
| 921 | "epoch": 4.0, |
| 922 | "eval_accuracy": 0.9903660886319846, |
| 923 | "eval_loss": 0.04251210391521454, |
| 924 | "eval_runtime": 6.5268, |
| 925 | "eval_samples_per_second": 79.518, |
| 926 | "eval_steps_per_second": 9.959, |
| 927 | "step": 1472 |
| 928 | }, |
| 929 | { |
| 930 | "epoch": 4.02, |
| 931 | "learning_rate": 3.91304347826087e-06, |
| 932 | "loss": 0.1285, |
| 933 | "step": 1480 |
| 934 | }, |
| 935 | { |
| 936 | "epoch": 4.05, |
| 937 | "learning_rate": 3.804347826086957e-06, |
| 938 | "loss": 0.1634, |
| 939 | "step": 1490 |
| 940 | }, |
| 941 | { |
| 942 | "epoch": 4.08, |
| 943 | "learning_rate": 3.6956521739130436e-06, |
| 944 | "loss": 0.0462, |
| 945 | "step": 1500 |
| 946 | }, |
| 947 | { |
| 948 | "epoch": 4.1, |
| 949 | "learning_rate": 3.5869565217391305e-06, |
| 950 | "loss": 0.0846, |
| 951 | "step": 1510 |
| 952 | }, |
| 953 | { |
| 954 | "epoch": 4.13, |
| 955 | "learning_rate": 3.4782608695652175e-06, |
| 956 | "loss": 0.1239, |
| 957 | "step": 1520 |
| 958 | }, |
| 959 | { |
| 960 | "epoch": 4.16, |
| 961 | "learning_rate": 3.3695652173913045e-06, |
| 962 | "loss": 0.1818, |
| 963 | "step": 1530 |
| 964 | }, |
| 965 | { |
| 966 | "epoch": 4.18, |
| 967 | "learning_rate": 3.2608695652173914e-06, |
| 968 | "loss": 0.021, |
| 969 | "step": 1540 |
| 970 | }, |
| 971 | { |
| 972 | "epoch": 4.21, |
| 973 | "learning_rate": 3.152173913043479e-06, |
| 974 | "loss": 0.0741, |
| 975 | "step": 1550 |
| 976 | }, |
| 977 | { |
| 978 | "epoch": 4.24, |
| 979 | "learning_rate": 3.043478260869566e-06, |
| 980 | "loss": 0.182, |
| 981 | "step": 1560 |
| 982 | }, |
| 983 | { |
| 984 | "epoch": 4.27, |
| 985 | "learning_rate": 2.9347826086956528e-06, |
| 986 | "loss": 0.0433, |
| 987 | "step": 1570 |
| 988 | }, |
| 989 | { |
| 990 | "epoch": 4.29, |
| 991 | "learning_rate": 2.8260869565217393e-06, |
| 992 | "loss": 0.0437, |
| 993 | "step": 1580 |
| 994 | }, |
| 995 | { |
| 996 | "epoch": 4.32, |
| 997 | "learning_rate": 2.7173913043478263e-06, |
| 998 | "loss": 0.0382, |
| 999 | "step": 1590 |
| 1000 | }, |
| 1001 | { |
| 1002 | "epoch": 4.35, |
| 1003 | "learning_rate": 2.6086956521739132e-06, |
| 1004 | "loss": 0.046, |
| 1005 | "step": 1600 |
| 1006 | }, |
| 1007 | { |
| 1008 | "epoch": 4.38, |
| 1009 | "learning_rate": 2.5e-06, |
| 1010 | "loss": 0.0213, |
| 1011 | "step": 1610 |
| 1012 | }, |
| 1013 | { |
| 1014 | "epoch": 4.4, |
| 1015 | "learning_rate": 2.391304347826087e-06, |
| 1016 | "loss": 0.0186, |
| 1017 | "step": 1620 |
| 1018 | }, |
| 1019 | { |
| 1020 | "epoch": 4.43, |
| 1021 | "learning_rate": 2.282608695652174e-06, |
| 1022 | "loss": 0.0671, |
| 1023 | "step": 1630 |
| 1024 | }, |
| 1025 | { |
| 1026 | "epoch": 4.46, |
| 1027 | "learning_rate": 2.173913043478261e-06, |
| 1028 | "loss": 0.0908, |
| 1029 | "step": 1640 |
| 1030 | }, |
| 1031 | { |
| 1032 | "epoch": 4.48, |
| 1033 | "learning_rate": 2.065217391304348e-06, |
| 1034 | "loss": 0.0697, |
| 1035 | "step": 1650 |
| 1036 | }, |
| 1037 | { |
| 1038 | "epoch": 4.51, |
| 1039 | "learning_rate": 1.956521739130435e-06, |
| 1040 | "loss": 0.0637, |
| 1041 | "step": 1660 |
| 1042 | }, |
| 1043 | { |
| 1044 | "epoch": 4.54, |
| 1045 | "learning_rate": 1.8478260869565218e-06, |
| 1046 | "loss": 0.0819, |
| 1047 | "step": 1670 |
| 1048 | }, |
| 1049 | { |
| 1050 | "epoch": 4.57, |
| 1051 | "learning_rate": 1.7391304347826088e-06, |
| 1052 | "loss": 0.0623, |
| 1053 | "step": 1680 |
| 1054 | }, |
| 1055 | { |
| 1056 | "epoch": 4.59, |
| 1057 | "learning_rate": 1.6304347826086957e-06, |
| 1058 | "loss": 0.0114, |
| 1059 | "step": 1690 |
| 1060 | }, |
| 1061 | { |
| 1062 | "epoch": 4.62, |
| 1063 | "learning_rate": 1.521739130434783e-06, |
| 1064 | "loss": 0.0342, |
| 1065 | "step": 1700 |
| 1066 | }, |
| 1067 | { |
| 1068 | "epoch": 4.65, |
| 1069 | "learning_rate": 1.4130434782608697e-06, |
| 1070 | "loss": 0.0859, |
| 1071 | "step": 1710 |
| 1072 | }, |
| 1073 | { |
| 1074 | "epoch": 4.67, |
| 1075 | "learning_rate": 1.3043478260869566e-06, |
| 1076 | "loss": 0.0462, |
| 1077 | "step": 1720 |
| 1078 | }, |
| 1079 | { |
| 1080 | "epoch": 4.7, |
| 1081 | "learning_rate": 1.1956521739130436e-06, |
| 1082 | "loss": 0.1022, |
| 1083 | "step": 1730 |
| 1084 | }, |
| 1085 | { |
| 1086 | "epoch": 4.73, |
| 1087 | "learning_rate": 1.0869565217391306e-06, |
| 1088 | "loss": 0.0571, |
| 1089 | "step": 1740 |
| 1090 | }, |
| 1091 | { |
| 1092 | "epoch": 4.76, |
| 1093 | "learning_rate": 9.782608695652175e-07, |
| 1094 | "loss": 0.0108, |
| 1095 | "step": 1750 |
| 1096 | }, |
| 1097 | { |
| 1098 | "epoch": 4.78, |
| 1099 | "learning_rate": 8.695652173913044e-07, |
| 1100 | "loss": 0.0893, |
| 1101 | "step": 1760 |
| 1102 | }, |
| 1103 | { |
| 1104 | "epoch": 4.81, |
| 1105 | "learning_rate": 7.608695652173914e-07, |
| 1106 | "loss": 0.0214, |
| 1107 | "step": 1770 |
| 1108 | }, |
| 1109 | { |
| 1110 | "epoch": 4.84, |
| 1111 | "learning_rate": 6.521739130434783e-07, |
| 1112 | "loss": 0.0416, |
| 1113 | "step": 1780 |
| 1114 | }, |
| 1115 | { |
| 1116 | "epoch": 4.86, |
| 1117 | "learning_rate": 5.434782608695653e-07, |
| 1118 | "loss": 0.1022, |
| 1119 | "step": 1790 |
| 1120 | }, |
| 1121 | { |
| 1122 | "epoch": 4.89, |
| 1123 | "learning_rate": 4.347826086956522e-07, |
| 1124 | "loss": 0.0628, |
| 1125 | "step": 1800 |
| 1126 | }, |
| 1127 | { |
| 1128 | "epoch": 4.92, |
| 1129 | "learning_rate": 3.2608695652173915e-07, |
| 1130 | "loss": 0.0691, |
| 1131 | "step": 1810 |
| 1132 | }, |
| 1133 | { |
| 1134 | "epoch": 4.95, |
| 1135 | "learning_rate": 2.173913043478261e-07, |
| 1136 | "loss": 0.0371, |
| 1137 | "step": 1820 |
| 1138 | }, |
| 1139 | { |
| 1140 | "epoch": 4.97, |
| 1141 | "learning_rate": 1.0869565217391305e-07, |
| 1142 | "loss": 0.0714, |
| 1143 | "step": 1830 |
| 1144 | }, |
| 1145 | { |
| 1146 | "epoch": 5.0, |
| 1147 | "learning_rate": 0.0, |
| 1148 | "loss": 0.1244, |
| 1149 | "step": 1840 |
| 1150 | }, |
| 1151 | { |
| 1152 | "epoch": 5.0, |
| 1153 | "eval_accuracy": 0.9922928709055877, |
| 1154 | "eval_loss": 0.03933868557214737, |
| 1155 | "eval_runtime": 6.3674, |
| 1156 | "eval_samples_per_second": 81.509, |
| 1157 | "eval_steps_per_second": 10.208, |
| 1158 | "step": 1840 |
| 1159 | }, |
| 1160 | { |
| 1161 | "epoch": 5.0, |
| 1162 | "step": 1840, |
| 1163 | "total_flos": 1.1387447873864294e+18, |
| 1164 | "train_loss": 0.10440107471431079, |
| 1165 | "train_runtime": 430.0921, |
| 1166 | "train_samples_per_second": 34.167, |
| 1167 | "train_steps_per_second": 4.278 |
| 1168 | } |
| 1169 | ], |
| 1170 | "logging_steps": 10, |
| 1171 | "max_steps": 1840, |
| 1172 | "num_input_tokens_seen": 0, |
| 1173 | "num_train_epochs": 5, |
| 1174 | "save_steps": 500, |
| 1175 | "total_flos": 1.1387447873864294e+18, |
| 1176 | "trial_name": null, |
| 1177 | "trial_params": null |
| 1178 | } |
| 1179 | |