README.md
127.8 KB · 6973 lines · markdown Raw
1 ---
2 language:
3 - ab
4 - ace
5 - ady
6 - af
7 - alt
8 - am
9 - ami
10 - an
11 - ang
12 - anp
13 - ar
14 - arc
15 - ary
16 - arz
17 - as
18 - ast
19 - atj
20 - av
21 - avk
22 - awa
23 - ay
24 - az
25 - azb
26 - ba
27 - ban
28 - bar
29 - bbc
30 - bcl
31 - be
32 - bg
33 - bh
34 - bi
35 - bjn
36 - blk
37 - bm
38 - bn
39 - bo
40 - bpy
41 - br
42 - bs
43 - bug
44 - bxr
45 - ca
46 - cbk
47 - cdo
48 - ce
49 - ceb
50 - ch
51 - chr
52 - chy
53 - ckb
54 - co
55 - cr
56 - crh
57 - cs
58 - csb
59 - cu
60 - cv
61 - cy
62 - da
63 - dag
64 - de
65 - dga
66 - din
67 - diq
68 - dsb
69 - dty
70 - dv
71 - dz
72 - ee
73 - el
74 - eml
75 - en
76 - eo
77 - es
78 - et
79 - eu
80 - ext
81 - fa
82 - fat
83 - ff
84 - fi
85 - fj
86 - fo
87 - fon
88 - fr
89 - frp
90 - frr
91 - fur
92 - fy
93 - ga
94 - gag
95 - gan
96 - gcr
97 - gd
98 - gl
99 - glk
100 - gn
101 - gom
102 - gor
103 - got
104 - gpe
105 - gsw
106 - gu
107 - guc
108 - gur
109 - guw
110 - gv
111 - ha
112 - hak
113 - haw
114 - hbs
115 - he
116 - hi
117 - hif
118 - hr
119 - hsb
120 - ht
121 - hu
122 - hy
123 - hyw
124 - ia
125 - id
126 - ie
127 - ig
128 - ik
129 - ilo
130 - inh
131 - io
132 - is
133 - it
134 - iu
135 - ja
136 - jam
137 - jbo
138 - jv
139 - ka
140 - kaa
141 - kab
142 - kbd
143 - kbp
144 - kcg
145 - kg
146 - ki
147 - kk
148 - kl
149 - km
150 - kn
151 - ko
152 - koi
153 - krc
154 - ks
155 - ksh
156 - ku
157 - kv
158 - kw
159 - ky
160 - la
161 - lad
162 - lb
163 - lbe
164 - lez
165 - lfn
166 - lg
167 - li
168 - lij
169 - lld
170 - lmo
171 - ln
172 - lo
173 - lt
174 - ltg
175 - lv
176 - lzh
177 - mad
178 - mai
179 - map
180 - mdf
181 - mg
182 - mhr
183 - mi
184 - min
185 - mk
186 - ml
187 - mn
188 - mni
189 - mnw
190 - mr
191 - mrj
192 - ms
193 - mt
194 - mwl
195 - my
196 - myv
197 - mzn
198 - nah
199 - nan
200 - nap
201 - nds
202 - ne
203 - new
204 - nia
205 - nl
206 - nn
207 - 'no'
208 - nov
209 - nqo
210 - nrf
211 - nso
212 - nv
213 - ny
214 - oc
215 - olo
216 - om
217 - or
218 - os
219 - pa
220 - pag
221 - pam
222 - pap
223 - pcd
224 - pcm
225 - pdc
226 - pfl
227 - pi
228 - pih
229 - pl
230 - pms
231 - pnb
232 - pnt
233 - ps
234 - pt
235 - pwn
236 - qu
237 - rm
238 - rmy
239 - rn
240 - ro
241 - ru
242 - rue
243 - rup
244 - rw
245 - sa
246 - sah
247 - sat
248 - sc
249 - scn
250 - sco
251 - sd
252 - se
253 - sg
254 - sgs
255 - shi
256 - shn
257 - si
258 - sk
259 - skr
260 - sl
261 - sm
262 - smn
263 - sn
264 - so
265 - sq
266 - sr
267 - srn
268 - ss
269 - st
270 - stq
271 - su
272 - sv
273 - sw
274 - szl
275 - szy
276 - ta
277 - tay
278 - tcy
279 - te
280 - tet
281 - tg
282 - th
283 - ti
284 - tk
285 - tl
286 - tly
287 - tn
288 - to
289 - tpi
290 - tr
291 - trv
292 - ts
293 - tt
294 - tum
295 - tw
296 - ty
297 - tyv
298 - udm
299 - ug
300 - uk
301 - ur
302 - uz
303 - ve
304 - vec
305 - vep
306 - vi
307 - vls
308 - vo
309 - vro
310 - wa
311 - war
312 - wo
313 - wuu
314 - xal
315 - xh
316 - xmf
317 - yi
318 - yo
319 - yue
320 - za
321 - zea
322 - zgh
323 - zh
324 - zu
325 license:
326 - cc-by-sa-3.0
327 - gfdl
328 size_categories:
329 - n<1K
330 - 1K<n<10K
331 - 10K<n<100K
332 - 100K<n<1M
333 - 1M<n<10M
334 task_categories:
335 - text-generation
336 - fill-mask
337 task_ids:
338 - language-modeling
339 - masked-language-modeling
340 configs:
341 - config_name: 20231101.ab
342 data_files:
343 - split: train
344 path: 20231101.ab/train-*
345 - config_name: 20231101.ace
346 data_files:
347 - split: train
348 path: 20231101.ace/train-*
349 - config_name: 20231101.ady
350 data_files:
351 - split: train
352 path: 20231101.ady/train-*
353 - config_name: 20231101.af
354 data_files:
355 - split: train
356 path: 20231101.af/train-*
357 - config_name: 20231101.als
358 data_files:
359 - split: train
360 path: 20231101.als/train-*
361 - config_name: 20231101.alt
362 data_files:
363 - split: train
364 path: 20231101.alt/train-*
365 - config_name: 20231101.am
366 data_files:
367 - split: train
368 path: 20231101.am/train-*
369 - config_name: 20231101.ami
370 data_files:
371 - split: train
372 path: 20231101.ami/train-*
373 - config_name: 20231101.an
374 data_files:
375 - split: train
376 path: 20231101.an/train-*
377 - config_name: 20231101.ang
378 data_files:
379 - split: train
380 path: 20231101.ang/train-*
381 - config_name: 20231101.anp
382 data_files:
383 - split: train
384 path: 20231101.anp/train-*
385 - config_name: 20231101.ar
386 data_files:
387 - split: train
388 path: 20231101.ar/train-*
389 - config_name: 20231101.arc
390 data_files:
391 - split: train
392 path: 20231101.arc/train-*
393 - config_name: 20231101.ary
394 data_files:
395 - split: train
396 path: 20231101.ary/train-*
397 - config_name: 20231101.arz
398 data_files:
399 - split: train
400 path: 20231101.arz/train-*
401 - config_name: 20231101.as
402 data_files:
403 - split: train
404 path: 20231101.as/train-*
405 - config_name: 20231101.ast
406 data_files:
407 - split: train
408 path: 20231101.ast/train-*
409 - config_name: 20231101.atj
410 data_files:
411 - split: train
412 path: 20231101.atj/train-*
413 - config_name: 20231101.av
414 data_files:
415 - split: train
416 path: 20231101.av/train-*
417 - config_name: 20231101.avk
418 data_files:
419 - split: train
420 path: 20231101.avk/train-*
421 - config_name: 20231101.awa
422 data_files:
423 - split: train
424 path: 20231101.awa/train-*
425 - config_name: 20231101.ay
426 data_files:
427 - split: train
428 path: 20231101.ay/train-*
429 - config_name: 20231101.az
430 data_files:
431 - split: train
432 path: 20231101.az/train-*
433 - config_name: 20231101.azb
434 data_files:
435 - split: train
436 path: 20231101.azb/train-*
437 - config_name: 20231101.ba
438 data_files:
439 - split: train
440 path: 20231101.ba/train-*
441 - config_name: 20231101.ban
442 data_files:
443 - split: train
444 path: 20231101.ban/train-*
445 - config_name: 20231101.bar
446 data_files:
447 - split: train
448 path: 20231101.bar/train-*
449 - config_name: 20231101.bat-smg
450 data_files:
451 - split: train
452 path: 20231101.bat-smg/train-*
453 - config_name: 20231101.bcl
454 data_files:
455 - split: train
456 path: 20231101.bcl/train-*
457 - config_name: 20231101.be
458 data_files:
459 - split: train
460 path: 20231101.be/train-*
461 - config_name: 20231101.be-x-old
462 data_files:
463 - split: train
464 path: 20231101.be-x-old/train-*
465 - config_name: 20231101.bg
466 data_files:
467 - split: train
468 path: 20231101.bg/train-*
469 - config_name: 20231101.bh
470 data_files:
471 - split: train
472 path: 20231101.bh/train-*
473 - config_name: 20231101.bi
474 data_files:
475 - split: train
476 path: 20231101.bi/train-*
477 - config_name: 20231101.bjn
478 data_files:
479 - split: train
480 path: 20231101.bjn/train-*
481 - config_name: 20231101.blk
482 data_files:
483 - split: train
484 path: 20231101.blk/train-*
485 - config_name: 20231101.bm
486 data_files:
487 - split: train
488 path: 20231101.bm/train-*
489 - config_name: 20231101.bn
490 data_files:
491 - split: train
492 path: 20231101.bn/train-*
493 - config_name: 20231101.bo
494 data_files:
495 - split: train
496 path: 20231101.bo/train-*
497 - config_name: 20231101.bpy
498 data_files:
499 - split: train
500 path: 20231101.bpy/train-*
501 - config_name: 20231101.br
502 data_files:
503 - split: train
504 path: 20231101.br/train-*
505 - config_name: 20231101.bs
506 data_files:
507 - split: train
508 path: 20231101.bs/train-*
509 - config_name: 20231101.bug
510 data_files:
511 - split: train
512 path: 20231101.bug/train-*
513 - config_name: 20231101.bxr
514 data_files:
515 - split: train
516 path: 20231101.bxr/train-*
517 - config_name: 20231101.ca
518 data_files:
519 - split: train
520 path: 20231101.ca/train-*
521 - config_name: 20231101.cbk-zam
522 data_files:
523 - split: train
524 path: 20231101.cbk-zam/train-*
525 - config_name: 20231101.cdo
526 data_files:
527 - split: train
528 path: 20231101.cdo/train-*
529 - config_name: 20231101.ce
530 data_files:
531 - split: train
532 path: 20231101.ce/train-*
533 - config_name: 20231101.ceb
534 data_files:
535 - split: train
536 path: 20231101.ceb/train-*
537 - config_name: 20231101.ch
538 data_files:
539 - split: train
540 path: 20231101.ch/train-*
541 - config_name: 20231101.chr
542 data_files:
543 - split: train
544 path: 20231101.chr/train-*
545 - config_name: 20231101.chy
546 data_files:
547 - split: train
548 path: 20231101.chy/train-*
549 - config_name: 20231101.ckb
550 data_files:
551 - split: train
552 path: 20231101.ckb/train-*
553 - config_name: 20231101.co
554 data_files:
555 - split: train
556 path: 20231101.co/train-*
557 - config_name: 20231101.cr
558 data_files:
559 - split: train
560 path: 20231101.cr/train-*
561 - config_name: 20231101.crh
562 data_files:
563 - split: train
564 path: 20231101.crh/train-*
565 - config_name: 20231101.cs
566 data_files:
567 - split: train
568 path: 20231101.cs/train-*
569 - config_name: 20231101.csb
570 data_files:
571 - split: train
572 path: 20231101.csb/train-*
573 - config_name: 20231101.cu
574 data_files:
575 - split: train
576 path: 20231101.cu/train-*
577 - config_name: 20231101.cv
578 data_files:
579 - split: train
580 path: 20231101.cv/train-*
581 - config_name: 20231101.cy
582 data_files:
583 - split: train
584 path: 20231101.cy/train-*
585 - config_name: 20231101.da
586 data_files:
587 - split: train
588 path: 20231101.da/train-*
589 - config_name: 20231101.dag
590 data_files:
591 - split: train
592 path: 20231101.dag/train-*
593 - config_name: 20231101.de
594 data_files:
595 - split: train
596 path: 20231101.de/train-*
597 - config_name: 20231101.din
598 data_files:
599 - split: train
600 path: 20231101.din/train-*
601 - config_name: 20231101.diq
602 data_files:
603 - split: train
604 path: 20231101.diq/train-*
605 - config_name: 20231101.dsb
606 data_files:
607 - split: train
608 path: 20231101.dsb/train-*
609 - config_name: 20231101.dty
610 data_files:
611 - split: train
612 path: 20231101.dty/train-*
613 - config_name: 20231101.dv
614 data_files:
615 - split: train
616 path: 20231101.dv/train-*
617 - config_name: 20231101.dz
618 data_files:
619 - split: train
620 path: 20231101.dz/train-*
621 - config_name: 20231101.ee
622 data_files:
623 - split: train
624 path: 20231101.ee/train-*
625 - config_name: 20231101.el
626 data_files:
627 - split: train
628 path: 20231101.el/train-*
629 - config_name: 20231101.eml
630 data_files:
631 - split: train
632 path: 20231101.eml/train-*
633 - config_name: 20231101.en
634 data_files:
635 - split: train
636 path: 20231101.en/train-*
637 - config_name: 20231101.eo
638 data_files:
639 - split: train
640 path: 20231101.eo/train-*
641 - config_name: 20231101.es
642 data_files:
643 - split: train
644 path: 20231101.es/train-*
645 - config_name: 20231101.et
646 data_files:
647 - split: train
648 path: 20231101.et/train-*
649 - config_name: 20231101.eu
650 data_files:
651 - split: train
652 path: 20231101.eu/train-*
653 - config_name: 20231101.ext
654 data_files:
655 - split: train
656 path: 20231101.ext/train-*
657 - config_name: 20231101.fa
658 data_files:
659 - split: train
660 path: 20231101.fa/train-*
661 - config_name: 20231101.fat
662 data_files:
663 - split: train
664 path: 20231101.fat/train-*
665 - config_name: 20231101.ff
666 data_files:
667 - split: train
668 path: 20231101.ff/train-*
669 - config_name: 20231101.fi
670 data_files:
671 - split: train
672 path: 20231101.fi/train-*
673 - config_name: 20231101.fiu-vro
674 data_files:
675 - split: train
676 path: 20231101.fiu-vro/train-*
677 - config_name: 20231101.fj
678 data_files:
679 - split: train
680 path: 20231101.fj/train-*
681 - config_name: 20231101.fo
682 data_files:
683 - split: train
684 path: 20231101.fo/train-*
685 - config_name: 20231101.fon
686 data_files:
687 - split: train
688 path: 20231101.fon/train-*
689 - config_name: 20231101.fr
690 data_files:
691 - split: train
692 path: 20231101.fr/train-*
693 - config_name: 20231101.frp
694 data_files:
695 - split: train
696 path: 20231101.frp/train-*
697 - config_name: 20231101.frr
698 data_files:
699 - split: train
700 path: 20231101.frr/train-*
701 - config_name: 20231101.fur
702 data_files:
703 - split: train
704 path: 20231101.fur/train-*
705 - config_name: 20231101.fy
706 data_files:
707 - split: train
708 path: 20231101.fy/train-*
709 - config_name: 20231101.ga
710 data_files:
711 - split: train
712 path: 20231101.ga/train-*
713 - config_name: 20231101.gag
714 data_files:
715 - split: train
716 path: 20231101.gag/train-*
717 - config_name: 20231101.gan
718 data_files:
719 - split: train
720 path: 20231101.gan/train-*
721 - config_name: 20231101.gcr
722 data_files:
723 - split: train
724 path: 20231101.gcr/train-*
725 - config_name: 20231101.gd
726 data_files:
727 - split: train
728 path: 20231101.gd/train-*
729 - config_name: 20231101.gl
730 data_files:
731 - split: train
732 path: 20231101.gl/train-*
733 - config_name: 20231101.glk
734 data_files:
735 - split: train
736 path: 20231101.glk/train-*
737 - config_name: 20231101.gn
738 data_files:
739 - split: train
740 path: 20231101.gn/train-*
741 - config_name: 20231101.gom
742 data_files:
743 - split: train
744 path: 20231101.gom/train-*
745 - config_name: 20231101.gor
746 data_files:
747 - split: train
748 path: 20231101.gor/train-*
749 - config_name: 20231101.got
750 data_files:
751 - split: train
752 path: 20231101.got/train-*
753 - config_name: 20231101.gpe
754 data_files:
755 - split: train
756 path: 20231101.gpe/train-*
757 - config_name: 20231101.gu
758 data_files:
759 - split: train
760 path: 20231101.gu/train-*
761 - config_name: 20231101.guc
762 data_files:
763 - split: train
764 path: 20231101.guc/train-*
765 - config_name: 20231101.gur
766 data_files:
767 - split: train
768 path: 20231101.gur/train-*
769 - config_name: 20231101.guw
770 data_files:
771 - split: train
772 path: 20231101.guw/train-*
773 - config_name: 20231101.gv
774 data_files:
775 - split: train
776 path: 20231101.gv/train-*
777 - config_name: 20231101.ha
778 data_files:
779 - split: train
780 path: 20231101.ha/train-*
781 - config_name: 20231101.hak
782 data_files:
783 - split: train
784 path: 20231101.hak/train-*
785 - config_name: 20231101.haw
786 data_files:
787 - split: train
788 path: 20231101.haw/train-*
789 - config_name: 20231101.he
790 data_files:
791 - split: train
792 path: 20231101.he/train-*
793 - config_name: 20231101.hi
794 data_files:
795 - split: train
796 path: 20231101.hi/train-*
797 - config_name: 20231101.hif
798 data_files:
799 - split: train
800 path: 20231101.hif/train-*
801 - config_name: 20231101.hr
802 data_files:
803 - split: train
804 path: 20231101.hr/train-*
805 - config_name: 20231101.hsb
806 data_files:
807 - split: train
808 path: 20231101.hsb/train-*
809 - config_name: 20231101.ht
810 data_files:
811 - split: train
812 path: 20231101.ht/train-*
813 - config_name: 20231101.hu
814 data_files:
815 - split: train
816 path: 20231101.hu/train-*
817 - config_name: 20231101.hy
818 data_files:
819 - split: train
820 path: 20231101.hy/train-*
821 - config_name: 20231101.hyw
822 data_files:
823 - split: train
824 path: 20231101.hyw/train-*
825 - config_name: 20231101.ia
826 data_files:
827 - split: train
828 path: 20231101.ia/train-*
829 - config_name: 20231101.id
830 data_files:
831 - split: train
832 path: 20231101.id/train-*
833 - config_name: 20231101.ie
834 data_files:
835 - split: train
836 path: 20231101.ie/train-*
837 - config_name: 20231101.ig
838 data_files:
839 - split: train
840 path: 20231101.ig/train-*
841 - config_name: 20231101.ik
842 data_files:
843 - split: train
844 path: 20231101.ik/train-*
845 - config_name: 20231101.ilo
846 data_files:
847 - split: train
848 path: 20231101.ilo/train-*
849 - config_name: 20231101.inh
850 data_files:
851 - split: train
852 path: 20231101.inh/train-*
853 - config_name: 20231101.io
854 data_files:
855 - split: train
856 path: 20231101.io/train-*
857 - config_name: 20231101.is
858 data_files:
859 - split: train
860 path: 20231101.is/train-*
861 - config_name: 20231101.it
862 data_files:
863 - split: train
864 path: 20231101.it/train-*
865 - config_name: 20231101.iu
866 data_files:
867 - split: train
868 path: 20231101.iu/train-*
869 - config_name: 20231101.ja
870 data_files:
871 - split: train
872 path: 20231101.ja/train-*
873 - config_name: 20231101.jam
874 data_files:
875 - split: train
876 path: 20231101.jam/train-*
877 - config_name: 20231101.jbo
878 data_files:
879 - split: train
880 path: 20231101.jbo/train-*
881 - config_name: 20231101.jv
882 data_files:
883 - split: train
884 path: 20231101.jv/train-*
885 - config_name: 20231101.ka
886 data_files:
887 - split: train
888 path: 20231101.ka/train-*
889 - config_name: 20231101.kaa
890 data_files:
891 - split: train
892 path: 20231101.kaa/train-*
893 - config_name: 20231101.kab
894 data_files:
895 - split: train
896 path: 20231101.kab/train-*
897 - config_name: 20231101.kbd
898 data_files:
899 - split: train
900 path: 20231101.kbd/train-*
901 - config_name: 20231101.kbp
902 data_files:
903 - split: train
904 path: 20231101.kbp/train-*
905 - config_name: 20231101.kcg
906 data_files:
907 - split: train
908 path: 20231101.kcg/train-*
909 - config_name: 20231101.kg
910 data_files:
911 - split: train
912 path: 20231101.kg/train-*
913 - config_name: 20231101.ki
914 data_files:
915 - split: train
916 path: 20231101.ki/train-*
917 - config_name: 20231101.kk
918 data_files:
919 - split: train
920 path: 20231101.kk/train-*
921 - config_name: 20231101.kl
922 data_files:
923 - split: train
924 path: 20231101.kl/train-*
925 - config_name: 20231101.km
926 data_files:
927 - split: train
928 path: 20231101.km/train-*
929 - config_name: 20231101.kn
930 data_files:
931 - split: train
932 path: 20231101.kn/train-*
933 - config_name: 20231101.ko
934 data_files:
935 - split: train
936 path: 20231101.ko/train-*
937 - config_name: 20231101.koi
938 data_files:
939 - split: train
940 path: 20231101.koi/train-*
941 - config_name: 20231101.krc
942 data_files:
943 - split: train
944 path: 20231101.krc/train-*
945 - config_name: 20231101.ks
946 data_files:
947 - split: train
948 path: 20231101.ks/train-*
949 - config_name: 20231101.ksh
950 data_files:
951 - split: train
952 path: 20231101.ksh/train-*
953 - config_name: 20231101.ku
954 data_files:
955 - split: train
956 path: 20231101.ku/train-*
957 - config_name: 20231101.kv
958 data_files:
959 - split: train
960 path: 20231101.kv/train-*
961 - config_name: 20231101.kw
962 data_files:
963 - split: train
964 path: 20231101.kw/train-*
965 - config_name: 20231101.ky
966 data_files:
967 - split: train
968 path: 20231101.ky/train-*
969 - config_name: 20231101.la
970 data_files:
971 - split: train
972 path: 20231101.la/train-*
973 - config_name: 20231101.lad
974 data_files:
975 - split: train
976 path: 20231101.lad/train-*
977 - config_name: 20231101.lb
978 data_files:
979 - split: train
980 path: 20231101.lb/train-*
981 - config_name: 20231101.lbe
982 data_files:
983 - split: train
984 path: 20231101.lbe/train-*
985 - config_name: 20231101.lez
986 data_files:
987 - split: train
988 path: 20231101.lez/train-*
989 - config_name: 20231101.lfn
990 data_files:
991 - split: train
992 path: 20231101.lfn/train-*
993 - config_name: 20231101.lg
994 data_files:
995 - split: train
996 path: 20231101.lg/train-*
997 - config_name: 20231101.li
998 data_files:
999 - split: train
1000 path: 20231101.li/train-*
1001 - config_name: 20231101.lij
1002 data_files:
1003 - split: train
1004 path: 20231101.lij/train-*
1005 - config_name: 20231101.lld
1006 data_files:
1007 - split: train
1008 path: 20231101.lld/train-*
1009 - config_name: 20231101.lmo
1010 data_files:
1011 - split: train
1012 path: 20231101.lmo/train-*
1013 - config_name: 20231101.ln
1014 data_files:
1015 - split: train
1016 path: 20231101.ln/train-*
1017 - config_name: 20231101.lo
1018 data_files:
1019 - split: train
1020 path: 20231101.lo/train-*
1021 - config_name: 20231101.lt
1022 data_files:
1023 - split: train
1024 path: 20231101.lt/train-*
1025 - config_name: 20231101.ltg
1026 data_files:
1027 - split: train
1028 path: 20231101.ltg/train-*
1029 - config_name: 20231101.lv
1030 data_files:
1031 - split: train
1032 path: 20231101.lv/train-*
1033 - config_name: 20231101.mad
1034 data_files:
1035 - split: train
1036 path: 20231101.mad/train-*
1037 - config_name: 20231101.mai
1038 data_files:
1039 - split: train
1040 path: 20231101.mai/train-*
1041 - config_name: 20231101.map-bms
1042 data_files:
1043 - split: train
1044 path: 20231101.map-bms/train-*
1045 - config_name: 20231101.mdf
1046 data_files:
1047 - split: train
1048 path: 20231101.mdf/train-*
1049 - config_name: 20231101.mg
1050 data_files:
1051 - split: train
1052 path: 20231101.mg/train-*
1053 - config_name: 20231101.mhr
1054 data_files:
1055 - split: train
1056 path: 20231101.mhr/train-*
1057 - config_name: 20231101.mi
1058 data_files:
1059 - split: train
1060 path: 20231101.mi/train-*
1061 - config_name: 20231101.min
1062 data_files:
1063 - split: train
1064 path: 20231101.min/train-*
1065 - config_name: 20231101.mk
1066 data_files:
1067 - split: train
1068 path: 20231101.mk/train-*
1069 - config_name: 20231101.ml
1070 data_files:
1071 - split: train
1072 path: 20231101.ml/train-*
1073 - config_name: 20231101.mn
1074 data_files:
1075 - split: train
1076 path: 20231101.mn/train-*
1077 - config_name: 20231101.mni
1078 data_files:
1079 - split: train
1080 path: 20231101.mni/train-*
1081 - config_name: 20231101.mnw
1082 data_files:
1083 - split: train
1084 path: 20231101.mnw/train-*
1085 - config_name: 20231101.mr
1086 data_files:
1087 - split: train
1088 path: 20231101.mr/train-*
1089 - config_name: 20231101.mrj
1090 data_files:
1091 - split: train
1092 path: 20231101.mrj/train-*
1093 - config_name: 20231101.ms
1094 data_files:
1095 - split: train
1096 path: 20231101.ms/train-*
1097 - config_name: 20231101.mt
1098 data_files:
1099 - split: train
1100 path: 20231101.mt/train-*
1101 - config_name: 20231101.mwl
1102 data_files:
1103 - split: train
1104 path: 20231101.mwl/train-*
1105 - config_name: 20231101.my
1106 data_files:
1107 - split: train
1108 path: 20231101.my/train-*
1109 - config_name: 20231101.myv
1110 data_files:
1111 - split: train
1112 path: 20231101.myv/train-*
1113 - config_name: 20231101.mzn
1114 data_files:
1115 - split: train
1116 path: 20231101.mzn/train-*
1117 - config_name: 20231101.nah
1118 data_files:
1119 - split: train
1120 path: 20231101.nah/train-*
1121 - config_name: 20231101.nap
1122 data_files:
1123 - split: train
1124 path: 20231101.nap/train-*
1125 - config_name: 20231101.nds
1126 data_files:
1127 - split: train
1128 path: 20231101.nds/train-*
1129 - config_name: 20231101.nds-nl
1130 data_files:
1131 - split: train
1132 path: 20231101.nds-nl/train-*
1133 - config_name: 20231101.ne
1134 data_files:
1135 - split: train
1136 path: 20231101.ne/train-*
1137 - config_name: 20231101.new
1138 data_files:
1139 - split: train
1140 path: 20231101.new/train-*
1141 - config_name: 20231101.nia
1142 data_files:
1143 - split: train
1144 path: 20231101.nia/train-*
1145 - config_name: 20231101.nl
1146 data_files:
1147 - split: train
1148 path: 20231101.nl/train-*
1149 - config_name: 20231101.nn
1150 data_files:
1151 - split: train
1152 path: 20231101.nn/train-*
1153 - config_name: 20231101.no
1154 data_files:
1155 - split: train
1156 path: 20231101.no/train-*
1157 - config_name: 20231101.nov
1158 data_files:
1159 - split: train
1160 path: 20231101.nov/train-*
1161 - config_name: 20231101.nqo
1162 data_files:
1163 - split: train
1164 path: 20231101.nqo/train-*
1165 - config_name: 20231101.nrm
1166 data_files:
1167 - split: train
1168 path: 20231101.nrm/train-*
1169 - config_name: 20231101.nso
1170 data_files:
1171 - split: train
1172 path: 20231101.nso/train-*
1173 - config_name: 20231101.nv
1174 data_files:
1175 - split: train
1176 path: 20231101.nv/train-*
1177 - config_name: 20231101.ny
1178 data_files:
1179 - split: train
1180 path: 20231101.ny/train-*
1181 - config_name: 20231101.oc
1182 data_files:
1183 - split: train
1184 path: 20231101.oc/train-*
1185 - config_name: 20231101.olo
1186 data_files:
1187 - split: train
1188 path: 20231101.olo/train-*
1189 - config_name: 20231101.om
1190 data_files:
1191 - split: train
1192 path: 20231101.om/train-*
1193 - config_name: 20231101.or
1194 data_files:
1195 - split: train
1196 path: 20231101.or/train-*
1197 - config_name: 20231101.os
1198 data_files:
1199 - split: train
1200 path: 20231101.os/train-*
1201 - config_name: 20231101.pa
1202 data_files:
1203 - split: train
1204 path: 20231101.pa/train-*
1205 - config_name: 20231101.pag
1206 data_files:
1207 - split: train
1208 path: 20231101.pag/train-*
1209 - config_name: 20231101.pam
1210 data_files:
1211 - split: train
1212 path: 20231101.pam/train-*
1213 - config_name: 20231101.pap
1214 data_files:
1215 - split: train
1216 path: 20231101.pap/train-*
1217 - config_name: 20231101.pcd
1218 data_files:
1219 - split: train
1220 path: 20231101.pcd/train-*
1221 - config_name: 20231101.pcm
1222 data_files:
1223 - split: train
1224 path: 20231101.pcm/train-*
1225 - config_name: 20231101.pdc
1226 data_files:
1227 - split: train
1228 path: 20231101.pdc/train-*
1229 - config_name: 20231101.pfl
1230 data_files:
1231 - split: train
1232 path: 20231101.pfl/train-*
1233 - config_name: 20231101.pi
1234 data_files:
1235 - split: train
1236 path: 20231101.pi/train-*
1237 - config_name: 20231101.pih
1238 data_files:
1239 - split: train
1240 path: 20231101.pih/train-*
1241 - config_name: 20231101.pl
1242 data_files:
1243 - split: train
1244 path: 20231101.pl/train-*
1245 - config_name: 20231101.pms
1246 data_files:
1247 - split: train
1248 path: 20231101.pms/train-*
1249 - config_name: 20231101.pnb
1250 data_files:
1251 - split: train
1252 path: 20231101.pnb/train-*
1253 - config_name: 20231101.pnt
1254 data_files:
1255 - split: train
1256 path: 20231101.pnt/train-*
1257 - config_name: 20231101.ps
1258 data_files:
1259 - split: train
1260 path: 20231101.ps/train-*
1261 - config_name: 20231101.pt
1262 data_files:
1263 - split: train
1264 path: 20231101.pt/train-*
1265 - config_name: 20231101.pwn
1266 data_files:
1267 - split: train
1268 path: 20231101.pwn/train-*
1269 - config_name: 20231101.qu
1270 data_files:
1271 - split: train
1272 path: 20231101.qu/train-*
1273 - config_name: 20231101.rm
1274 data_files:
1275 - split: train
1276 path: 20231101.rm/train-*
1277 - config_name: 20231101.rmy
1278 data_files:
1279 - split: train
1280 path: 20231101.rmy/train-*
1281 - config_name: 20231101.rn
1282 data_files:
1283 - split: train
1284 path: 20231101.rn/train-*
1285 - config_name: 20231101.ro
1286 data_files:
1287 - split: train
1288 path: 20231101.ro/train-*
1289 - config_name: 20231101.roa-rup
1290 data_files:
1291 - split: train
1292 path: 20231101.roa-rup/train-*
1293 - config_name: 20231101.roa-tara
1294 data_files:
1295 - split: train
1296 path: 20231101.roa-tara/train-*
1297 - config_name: 20231101.ru
1298 data_files:
1299 - split: train
1300 path: 20231101.ru/train-*
1301 - config_name: 20231101.rue
1302 data_files:
1303 - split: train
1304 path: 20231101.rue/train-*
1305 - config_name: 20231101.rw
1306 data_files:
1307 - split: train
1308 path: 20231101.rw/train-*
1309 - config_name: 20231101.sa
1310 data_files:
1311 - split: train
1312 path: 20231101.sa/train-*
1313 - config_name: 20231101.sah
1314 data_files:
1315 - split: train
1316 path: 20231101.sah/train-*
1317 - config_name: 20231101.sat
1318 data_files:
1319 - split: train
1320 path: 20231101.sat/train-*
1321 - config_name: 20231101.sc
1322 data_files:
1323 - split: train
1324 path: 20231101.sc/train-*
1325 - config_name: 20231101.scn
1326 data_files:
1327 - split: train
1328 path: 20231101.scn/train-*
1329 - config_name: 20231101.sco
1330 data_files:
1331 - split: train
1332 path: 20231101.sco/train-*
1333 - config_name: 20231101.sd
1334 data_files:
1335 - split: train
1336 path: 20231101.sd/train-*
1337 - config_name: 20231101.se
1338 data_files:
1339 - split: train
1340 path: 20231101.se/train-*
1341 - config_name: 20231101.sg
1342 data_files:
1343 - split: train
1344 path: 20231101.sg/train-*
1345 - config_name: 20231101.sh
1346 data_files:
1347 - split: train
1348 path: 20231101.sh/train-*
1349 - config_name: 20231101.shi
1350 data_files:
1351 - split: train
1352 path: 20231101.shi/train-*
1353 - config_name: 20231101.shn
1354 data_files:
1355 - split: train
1356 path: 20231101.shn/train-*
1357 - config_name: 20231101.si
1358 data_files:
1359 - split: train
1360 path: 20231101.si/train-*
1361 - config_name: 20231101.simple
1362 data_files:
1363 - split: train
1364 path: 20231101.simple/train-*
1365 - config_name: 20231101.sk
1366 data_files:
1367 - split: train
1368 path: 20231101.sk/train-*
1369 - config_name: 20231101.skr
1370 data_files:
1371 - split: train
1372 path: 20231101.skr/train-*
1373 - config_name: 20231101.sl
1374 data_files:
1375 - split: train
1376 path: 20231101.sl/train-*
1377 - config_name: 20231101.sm
1378 data_files:
1379 - split: train
1380 path: 20231101.sm/train-*
1381 - config_name: 20231101.smn
1382 data_files:
1383 - split: train
1384 path: 20231101.smn/train-*
1385 - config_name: 20231101.sn
1386 data_files:
1387 - split: train
1388 path: 20231101.sn/train-*
1389 - config_name: 20231101.so
1390 data_files:
1391 - split: train
1392 path: 20231101.so/train-*
1393 - config_name: 20231101.sq
1394 data_files:
1395 - split: train
1396 path: 20231101.sq/train-*
1397 - config_name: 20231101.sr
1398 data_files:
1399 - split: train
1400 path: 20231101.sr/train-*
1401 - config_name: 20231101.srn
1402 data_files:
1403 - split: train
1404 path: 20231101.srn/train-*
1405 - config_name: 20231101.ss
1406 data_files:
1407 - split: train
1408 path: 20231101.ss/train-*
1409 - config_name: 20231101.st
1410 data_files:
1411 - split: train
1412 path: 20231101.st/train-*
1413 - config_name: 20231101.stq
1414 data_files:
1415 - split: train
1416 path: 20231101.stq/train-*
1417 - config_name: 20231101.su
1418 data_files:
1419 - split: train
1420 path: 20231101.su/train-*
1421 - config_name: 20231101.sv
1422 data_files:
1423 - split: train
1424 path: 20231101.sv/train-*
1425 - config_name: 20231101.sw
1426 data_files:
1427 - split: train
1428 path: 20231101.sw/train-*
1429 - config_name: 20231101.szl
1430 data_files:
1431 - split: train
1432 path: 20231101.szl/train-*
1433 - config_name: 20231101.szy
1434 data_files:
1435 - split: train
1436 path: 20231101.szy/train-*
1437 - config_name: 20231101.ta
1438 data_files:
1439 - split: train
1440 path: 20231101.ta/train-*
1441 - config_name: 20231101.tay
1442 data_files:
1443 - split: train
1444 path: 20231101.tay/train-*
1445 - config_name: 20231101.tcy
1446 data_files:
1447 - split: train
1448 path: 20231101.tcy/train-*
1449 - config_name: 20231101.te
1450 data_files:
1451 - split: train
1452 path: 20231101.te/train-*
1453 - config_name: 20231101.tet
1454 data_files:
1455 - split: train
1456 path: 20231101.tet/train-*
1457 - config_name: 20231101.tg
1458 data_files:
1459 - split: train
1460 path: 20231101.tg/train-*
1461 - config_name: 20231101.th
1462 data_files:
1463 - split: train
1464 path: 20231101.th/train-*
1465 - config_name: 20231101.ti
1466 data_files:
1467 - split: train
1468 path: 20231101.ti/train-*
1469 - config_name: 20231101.tk
1470 data_files:
1471 - split: train
1472 path: 20231101.tk/train-*
1473 - config_name: 20231101.tl
1474 data_files:
1475 - split: train
1476 path: 20231101.tl/train-*
1477 - config_name: 20231101.tly
1478 data_files:
1479 - split: train
1480 path: 20231101.tly/train-*
1481 - config_name: 20231101.tn
1482 data_files:
1483 - split: train
1484 path: 20231101.tn/train-*
1485 - config_name: 20231101.to
1486 data_files:
1487 - split: train
1488 path: 20231101.to/train-*
1489 - config_name: 20231101.tpi
1490 data_files:
1491 - split: train
1492 path: 20231101.tpi/train-*
1493 - config_name: 20231101.tr
1494 data_files:
1495 - split: train
1496 path: 20231101.tr/train-*
1497 - config_name: 20231101.trv
1498 data_files:
1499 - split: train
1500 path: 20231101.trv/train-*
1501 - config_name: 20231101.ts
1502 data_files:
1503 - split: train
1504 path: 20231101.ts/train-*
1505 - config_name: 20231101.tt
1506 data_files:
1507 - split: train
1508 path: 20231101.tt/train-*
1509 - config_name: 20231101.tum
1510 data_files:
1511 - split: train
1512 path: 20231101.tum/train-*
1513 - config_name: 20231101.tw
1514 data_files:
1515 - split: train
1516 path: 20231101.tw/train-*
1517 - config_name: 20231101.ty
1518 data_files:
1519 - split: train
1520 path: 20231101.ty/train-*
1521 - config_name: 20231101.tyv
1522 data_files:
1523 - split: train
1524 path: 20231101.tyv/train-*
1525 - config_name: 20231101.udm
1526 data_files:
1527 - split: train
1528 path: 20231101.udm/train-*
1529 - config_name: 20231101.ug
1530 data_files:
1531 - split: train
1532 path: 20231101.ug/train-*
1533 - config_name: 20231101.uk
1534 data_files:
1535 - split: train
1536 path: 20231101.uk/train-*
1537 - config_name: 20231101.ur
1538 data_files:
1539 - split: train
1540 path: 20231101.ur/train-*
1541 - config_name: 20231101.uz
1542 data_files:
1543 - split: train
1544 path: 20231101.uz/train-*
1545 - config_name: 20231101.ve
1546 data_files:
1547 - split: train
1548 path: 20231101.ve/train-*
1549 - config_name: 20231101.vec
1550 data_files:
1551 - split: train
1552 path: 20231101.vec/train-*
1553 - config_name: 20231101.vep
1554 data_files:
1555 - split: train
1556 path: 20231101.vep/train-*
1557 - config_name: 20231101.vi
1558 data_files:
1559 - split: train
1560 path: 20231101.vi/train-*
1561 - config_name: 20231101.vls
1562 data_files:
1563 - split: train
1564 path: 20231101.vls/train-*
1565 - config_name: 20231101.vo
1566 data_files:
1567 - split: train
1568 path: 20231101.vo/train-*
1569 - config_name: 20231101.wa
1570 data_files:
1571 - split: train
1572 path: 20231101.wa/train-*
1573 - config_name: 20231101.war
1574 data_files:
1575 - split: train
1576 path: 20231101.war/train-*
1577 - config_name: 20231101.wo
1578 data_files:
1579 - split: train
1580 path: 20231101.wo/train-*
1581 - config_name: 20231101.wuu
1582 data_files:
1583 - split: train
1584 path: 20231101.wuu/train-*
1585 - config_name: 20231101.xal
1586 data_files:
1587 - split: train
1588 path: 20231101.xal/train-*
1589 - config_name: 20231101.xh
1590 data_files:
1591 - split: train
1592 path: 20231101.xh/train-*
1593 - config_name: 20231101.xmf
1594 data_files:
1595 - split: train
1596 path: 20231101.xmf/train-*
1597 - config_name: 20231101.yi
1598 data_files:
1599 - split: train
1600 path: 20231101.yi/train-*
1601 - config_name: 20231101.yo
1602 data_files:
1603 - split: train
1604 path: 20231101.yo/train-*
1605 - config_name: 20231101.za
1606 data_files:
1607 - split: train
1608 path: 20231101.za/train-*
1609 - config_name: 20231101.zea
1610 data_files:
1611 - split: train
1612 path: 20231101.zea/train-*
1613 - config_name: 20231101.zh
1614 data_files:
1615 - split: train
1616 path: 20231101.zh/train-*
1617 - config_name: 20231101.zh-classical
1618 data_files:
1619 - split: train
1620 path: 20231101.zh-classical/train-*
1621 - config_name: 20231101.zh-min-nan
1622 data_files:
1623 - split: train
1624 path: 20231101.zh-min-nan/train-*
1625 - config_name: 20231101.zh-yue
1626 data_files:
1627 - split: train
1628 path: 20231101.zh-yue/train-*
1629 - config_name: 20231101.zu
1630 data_files:
1631 - split: train
1632 path: 20231101.zu/train-*
1633 dataset_info:
1634 - config_name: 20231101.ab
1635 features:
1636 - name: id
1637 dtype: string
1638 - name: url
1639 dtype: string
1640 - name: title
1641 dtype: string
1642 - name: text
1643 dtype: string
1644 splits:
1645 - name: train
1646 num_bytes: 4334455
1647 num_examples: 6152
1648 download_size: 1237796
1649 dataset_size: 4334455
1650 - config_name: 20231101.ace
1651 features:
1652 - name: id
1653 dtype: string
1654 - name: url
1655 dtype: string
1656 - name: title
1657 dtype: string
1658 - name: text
1659 dtype: string
1660 splits:
1661 - name: train
1662 num_bytes: 5065801
1663 num_examples: 13003
1664 download_size: 1574258
1665 dataset_size: 5065801
1666 - config_name: 20231101.ady
1667 features:
1668 - name: id
1669 dtype: string
1670 - name: url
1671 dtype: string
1672 - name: title
1673 dtype: string
1674 - name: text
1675 dtype: string
1676 splits:
1677 - name: train
1678 num_bytes: 765030
1679 num_examples: 706
1680 download_size: 347450
1681 dataset_size: 765030
1682 - config_name: 20231101.af
1683 features:
1684 - name: id
1685 dtype: string
1686 - name: url
1687 dtype: string
1688 - name: title
1689 dtype: string
1690 - name: text
1691 dtype: string
1692 splits:
1693 - name: train
1694 num_bytes: 226672176
1695 num_examples: 112518
1696 download_size: 124485544
1697 dataset_size: 226672176
1698 - config_name: 20231101.als
1699 features:
1700 - name: id
1701 dtype: string
1702 - name: url
1703 dtype: string
1704 - name: title
1705 dtype: string
1706 - name: text
1707 dtype: string
1708 splits:
1709 - name: train
1710 num_bytes: 81450196
1711 num_examples: 30013
1712 download_size: 49452211
1713 dataset_size: 81450196
1714 - config_name: 20231101.alt
1715 features:
1716 - name: id
1717 dtype: string
1718 - name: url
1719 dtype: string
1720 - name: title
1721 dtype: string
1722 - name: text
1723 dtype: string
1724 splits:
1725 - name: train
1726 num_bytes: 6819963
1727 num_examples: 1087
1728 download_size: 2910477
1729 dataset_size: 6819963
1730 - config_name: 20231101.am
1731 features:
1732 - name: id
1733 dtype: string
1734 - name: url
1735 dtype: string
1736 - name: title
1737 dtype: string
1738 - name: text
1739 dtype: string
1740 splits:
1741 - name: train
1742 num_bytes: 24218002
1743 num_examples: 13906
1744 download_size: 10720027
1745 dataset_size: 24218002
1746 - config_name: 20231101.ami
1747 features:
1748 - name: id
1749 dtype: string
1750 - name: url
1751 dtype: string
1752 - name: title
1753 dtype: string
1754 - name: text
1755 dtype: string
1756 splits:
1757 - name: train
1758 num_bytes: 4460174
1759 num_examples: 1628
1760 download_size: 2261859
1761 dataset_size: 4460174
1762 - config_name: 20231101.an
1763 features:
1764 - name: id
1765 dtype: string
1766 - name: url
1767 dtype: string
1768 - name: title
1769 dtype: string
1770 - name: text
1771 dtype: string
1772 splits:
1773 - name: train
1774 num_bytes: 57572050
1775 num_examples: 44249
1776 download_size: 29573020
1777 dataset_size: 57572050
1778 - config_name: 20231101.ang
1779 features:
1780 - name: id
1781 dtype: string
1782 - name: url
1783 dtype: string
1784 - name: title
1785 dtype: string
1786 - name: text
1787 dtype: string
1788 splits:
1789 - name: train
1790 num_bytes: 2913906
1791 num_examples: 4121
1792 download_size: 1789811
1793 dataset_size: 2913906
1794 - config_name: 20231101.anp
1795 features:
1796 - name: id
1797 dtype: string
1798 - name: url
1799 dtype: string
1800 - name: title
1801 dtype: string
1802 - name: text
1803 dtype: string
1804 splits:
1805 - name: train
1806 num_bytes: 9226211
1807 num_examples: 2749
1808 download_size: 3355979
1809 dataset_size: 9226211
1810 - config_name: 20231101.ar
1811 features:
1812 - name: id
1813 dtype: string
1814 - name: url
1815 dtype: string
1816 - name: title
1817 dtype: string
1818 - name: text
1819 dtype: string
1820 splits:
1821 - name: train
1822 num_bytes: 3124486159
1823 num_examples: 1219201
1824 download_size: 1323304271
1825 dataset_size: 3124486159
1826 - config_name: 20231101.arc
1827 features:
1828 - name: id
1829 dtype: string
1830 - name: url
1831 dtype: string
1832 - name: title
1833 dtype: string
1834 - name: text
1835 dtype: string
1836 splits:
1837 - name: train
1838 num_bytes: 849731
1839 num_examples: 1936
1840 download_size: 369584
1841 dataset_size: 849731
1842 - config_name: 20231101.ary
1843 features:
1844 - name: id
1845 dtype: string
1846 - name: url
1847 dtype: string
1848 - name: title
1849 dtype: string
1850 - name: text
1851 dtype: string
1852 splits:
1853 - name: train
1854 num_bytes: 12049878
1855 num_examples: 8087
1856 download_size: 4672257
1857 dataset_size: 12049878
1858 - config_name: 20231101.arz
1859 features:
1860 - name: id
1861 dtype: string
1862 - name: url
1863 dtype: string
1864 - name: title
1865 dtype: string
1866 - name: text
1867 dtype: string
1868 splits:
1869 - name: train
1870 num_bytes: 1402294447
1871 num_examples: 1620194
1872 download_size: 317231585
1873 dataset_size: 1402294447
1874 - config_name: 20231101.as
1875 features:
1876 - name: id
1877 dtype: string
1878 - name: url
1879 dtype: string
1880 - name: title
1881 dtype: string
1882 - name: text
1883 dtype: string
1884 splits:
1885 - name: train
1886 num_bytes: 90312333
1887 num_examples: 12338
1888 download_size: 34581561
1889 dataset_size: 90312333
1890 - config_name: 20231101.ast
1891 features:
1892 - name: id
1893 dtype: string
1894 - name: url
1895 dtype: string
1896 - name: title
1897 dtype: string
1898 - name: text
1899 dtype: string
1900 splits:
1901 - name: train
1902 num_bytes: 470575521
1903 num_examples: 133419
1904 download_size: 271196430
1905 dataset_size: 470575521
1906 - config_name: 20231101.atj
1907 features:
1908 - name: id
1909 dtype: string
1910 - name: url
1911 dtype: string
1912 - name: title
1913 dtype: string
1914 - name: text
1915 dtype: string
1916 splits:
1917 - name: train
1918 num_bytes: 1012467
1919 num_examples: 1971
1920 download_size: 513962
1921 dataset_size: 1012467
1922 - config_name: 20231101.av
1923 features:
1924 - name: id
1925 dtype: string
1926 - name: url
1927 dtype: string
1928 - name: title
1929 dtype: string
1930 - name: text
1931 dtype: string
1932 splits:
1933 - name: train
1934 num_bytes: 6084045
1935 num_examples: 3426
1936 download_size: 2573436
1937 dataset_size: 6084045
1938 - config_name: 20231101.avk
1939 features:
1940 - name: id
1941 dtype: string
1942 - name: url
1943 dtype: string
1944 - name: title
1945 dtype: string
1946 - name: text
1947 dtype: string
1948 splits:
1949 - name: train
1950 num_bytes: 32119428
1951 num_examples: 28353
1952 download_size: 7984474
1953 dataset_size: 32119428
1954 - config_name: 20231101.awa
1955 features:
1956 - name: id
1957 dtype: string
1958 - name: url
1959 dtype: string
1960 - name: title
1961 dtype: string
1962 - name: text
1963 dtype: string
1964 splits:
1965 - name: train
1966 num_bytes: 3703396
1967 num_examples: 3679
1968 download_size: 1269824
1969 dataset_size: 3703396
1970 - config_name: 20231101.ay
1971 features:
1972 - name: id
1973 dtype: string
1974 - name: url
1975 dtype: string
1976 - name: title
1977 dtype: string
1978 - name: text
1979 dtype: string
1980 splits:
1981 - name: train
1982 num_bytes: 4395813
1983 num_examples: 5384
1984 download_size: 1756131
1985 dataset_size: 4395813
1986 - config_name: 20231101.az
1987 features:
1988 - name: id
1989 dtype: string
1990 - name: url
1991 dtype: string
1992 - name: title
1993 dtype: string
1994 - name: text
1995 dtype: string
1996 splits:
1997 - name: train
1998 num_bytes: 433663157
1999 num_examples: 196158
2000 download_size: 230064038
2001 dataset_size: 433663157
2002 - config_name: 20231101.azb
2003 features:
2004 - name: id
2005 dtype: string
2006 - name: url
2007 dtype: string
2008 - name: title
2009 dtype: string
2010 - name: text
2011 dtype: string
2012 splits:
2013 - name: train
2014 num_bytes: 187041147
2015 num_examples: 243376
2016 download_size: 46739926
2017 dataset_size: 187041147
2018 - config_name: 20231101.ba
2019 features:
2020 - name: id
2021 dtype: string
2022 - name: url
2023 dtype: string
2024 - name: title
2025 dtype: string
2026 - name: text
2027 dtype: string
2028 splits:
2029 - name: train
2030 num_bytes: 297738837
2031 num_examples: 63319
2032 download_size: 122595805
2033 dataset_size: 297738837
2034 - config_name: 20231101.ban
2035 features:
2036 - name: id
2037 dtype: string
2038 - name: url
2039 dtype: string
2040 - name: title
2041 dtype: string
2042 - name: text
2043 dtype: string
2044 splits:
2045 - name: train
2046 num_bytes: 18012727
2047 num_examples: 20986
2048 download_size: 6715876
2049 dataset_size: 18012727
2050 - config_name: 20231101.bar
2051 features:
2052 - name: id
2053 dtype: string
2054 - name: url
2055 dtype: string
2056 - name: title
2057 dtype: string
2058 - name: text
2059 dtype: string
2060 splits:
2061 - name: train
2062 num_bytes: 36317102
2063 num_examples: 27096
2064 download_size: 21799389
2065 dataset_size: 36317102
2066 - config_name: 20231101.bat-smg
2067 features:
2068 - name: id
2069 dtype: string
2070 - name: url
2071 dtype: string
2072 - name: title
2073 dtype: string
2074 - name: text
2075 dtype: string
2076 splits:
2077 - name: train
2078 num_bytes: 7212849
2079 num_examples: 17221
2080 download_size: 3348765
2081 dataset_size: 7212849
2082 - config_name: 20231101.bcl
2083 features:
2084 - name: id
2085 dtype: string
2086 - name: url
2087 dtype: string
2088 - name: title
2089 dtype: string
2090 - name: text
2091 dtype: string
2092 splits:
2093 - name: train
2094 num_bytes: 20394331
2095 num_examples: 15743
2096 download_size: 11369234
2097 dataset_size: 20394331
2098 - config_name: 20231101.be
2099 features:
2100 - name: id
2101 dtype: string
2102 - name: url
2103 dtype: string
2104 - name: title
2105 dtype: string
2106 - name: text
2107 dtype: string
2108 splits:
2109 - name: train
2110 num_bytes: 624718980
2111 num_examples: 236165
2112 download_size: 284921288
2113 dataset_size: 624718980
2114 - config_name: 20231101.be-x-old
2115 features:
2116 - name: id
2117 dtype: string
2118 - name: url
2119 dtype: string
2120 - name: title
2121 dtype: string
2122 - name: text
2123 dtype: string
2124 splits:
2125 - name: train
2126 num_bytes: 252510447
2127 num_examples: 84361
2128 download_size: 114318588
2129 dataset_size: 252510447
2130 - config_name: 20231101.bg
2131 features:
2132 - name: id
2133 dtype: string
2134 - name: url
2135 dtype: string
2136 - name: title
2137 dtype: string
2138 - name: text
2139 dtype: string
2140 splits:
2141 - name: train
2142 num_bytes: 1103334425
2143 num_examples: 294275
2144 download_size: 512344058
2145 dataset_size: 1103334425
2146 - config_name: 20231101.bh
2147 features:
2148 - name: id
2149 dtype: string
2150 - name: url
2151 dtype: string
2152 - name: title
2153 dtype: string
2154 - name: text
2155 dtype: string
2156 splits:
2157 - name: train
2158 num_bytes: 16675295
2159 num_examples: 8612
2160 download_size: 5880458
2161 dataset_size: 16675295
2162 - config_name: 20231101.bi
2163 features:
2164 - name: id
2165 dtype: string
2166 - name: url
2167 dtype: string
2168 - name: title
2169 dtype: string
2170 - name: text
2171 dtype: string
2172 splits:
2173 - name: train
2174 num_bytes: 404249
2175 num_examples: 1548
2176 download_size: 203610
2177 dataset_size: 404249
2178 - config_name: 20231101.bjn
2179 features:
2180 - name: id
2181 dtype: string
2182 - name: url
2183 dtype: string
2184 - name: title
2185 dtype: string
2186 - name: text
2187 dtype: string
2188 splits:
2189 - name: train
2190 num_bytes: 6884860
2191 num_examples: 10519
2192 download_size: 3323032
2193 dataset_size: 6884860
2194 - config_name: 20231101.blk
2195 features:
2196 - name: id
2197 dtype: string
2198 - name: url
2199 dtype: string
2200 - name: title
2201 dtype: string
2202 - name: text
2203 dtype: string
2204 splits:
2205 - name: train
2206 num_bytes: 26566991
2207 num_examples: 2946
2208 download_size: 8028430
2209 dataset_size: 26566991
2210 - config_name: 20231101.bm
2211 features:
2212 - name: id
2213 dtype: string
2214 - name: url
2215 dtype: string
2216 - name: title
2217 dtype: string
2218 - name: text
2219 dtype: string
2220 splits:
2221 - name: train
2222 num_bytes: 623659
2223 num_examples: 1258
2224 download_size: 343812
2225 dataset_size: 623659
2226 - config_name: 20231101.bn
2227 features:
2228 - name: id
2229 dtype: string
2230 - name: url
2231 dtype: string
2232 - name: title
2233 dtype: string
2234 - name: text
2235 dtype: string
2236 splits:
2237 - name: train
2238 num_bytes: 962624238
2239 num_examples: 143069
2240 download_size: 343885999
2241 dataset_size: 962624238
2242 - config_name: 20231101.bo
2243 features:
2244 - name: id
2245 dtype: string
2246 - name: url
2247 dtype: string
2248 - name: title
2249 dtype: string
2250 - name: text
2251 dtype: string
2252 splits:
2253 - name: train
2254 num_bytes: 132723880
2255 num_examples: 12881
2256 download_size: 38851784
2257 dataset_size: 132723880
2258 - config_name: 20231101.bpy
2259 features:
2260 - name: id
2261 dtype: string
2262 - name: url
2263 dtype: string
2264 - name: title
2265 dtype: string
2266 - name: text
2267 dtype: string
2268 splits:
2269 - name: train
2270 num_bytes: 42975314
2271 num_examples: 25165
2272 download_size: 6568483
2273 dataset_size: 42975314
2274 - config_name: 20231101.br
2275 features:
2276 - name: id
2277 dtype: string
2278 - name: url
2279 dtype: string
2280 - name: title
2281 dtype: string
2282 - name: text
2283 dtype: string
2284 splits:
2285 - name: train
2286 num_bytes: 85635744
2287 num_examples: 84340
2288 download_size: 49768597
2289 dataset_size: 85635744
2290 - config_name: 20231101.bs
2291 features:
2292 - name: id
2293 dtype: string
2294 - name: url
2295 dtype: string
2296 - name: title
2297 dtype: string
2298 - name: text
2299 dtype: string
2300 splits:
2301 - name: train
2302 num_bytes: 193734399
2303 num_examples: 92596
2304 download_size: 107858627
2305 dataset_size: 193734399
2306 - config_name: 20231101.bug
2307 features:
2308 - name: id
2309 dtype: string
2310 - name: url
2311 dtype: string
2312 - name: title
2313 dtype: string
2314 - name: text
2315 dtype: string
2316 splits:
2317 - name: train
2318 num_bytes: 3434889
2319 num_examples: 15880
2320 download_size: 817034
2321 dataset_size: 3434889
2322 - config_name: 20231101.bxr
2323 features:
2324 - name: id
2325 dtype: string
2326 - name: url
2327 dtype: string
2328 - name: title
2329 dtype: string
2330 - name: text
2331 dtype: string
2332 splits:
2333 - name: train
2334 num_bytes: 6687172
2335 num_examples: 2791
2336 download_size: 3078699
2337 dataset_size: 6687172
2338 - config_name: 20231101.ca
2339 features:
2340 - name: id
2341 dtype: string
2342 - name: url
2343 dtype: string
2344 - name: title
2345 dtype: string
2346 - name: text
2347 dtype: string
2348 splits:
2349 - name: train
2350 num_bytes: 1958810542
2351 num_examples: 737409
2352 download_size: 1116799343
2353 dataset_size: 1958810542
2354 - config_name: 20231101.cbk-zam
2355 features:
2356 - name: id
2357 dtype: string
2358 - name: url
2359 dtype: string
2360 - name: title
2361 dtype: string
2362 - name: text
2363 dtype: string
2364 splits:
2365 - name: train
2366 num_bytes: 2061944
2367 num_examples: 3285
2368 download_size: 825899
2369 dataset_size: 2061944
2370 - config_name: 20231101.cdo
2371 features:
2372 - name: id
2373 dtype: string
2374 - name: url
2375 dtype: string
2376 - name: title
2377 dtype: string
2378 - name: text
2379 dtype: string
2380 splits:
2381 - name: train
2382 num_bytes: 5109207
2383 num_examples: 16449
2384 download_size: 1982914
2385 dataset_size: 5109207
2386 - config_name: 20231101.ce
2387 features:
2388 - name: id
2389 dtype: string
2390 - name: url
2391 dtype: string
2392 - name: title
2393 dtype: string
2394 - name: text
2395 dtype: string
2396 splits:
2397 - name: train
2398 num_bytes: 730387049
2399 num_examples: 601271
2400 download_size: 88393330
2401 dataset_size: 730387049
2402 - config_name: 20231101.ceb
2403 features:
2404 - name: id
2405 dtype: string
2406 - name: url
2407 dtype: string
2408 - name: title
2409 dtype: string
2410 - name: text
2411 dtype: string
2412 splits:
2413 - name: train
2414 num_bytes: 4568256711
2415 num_examples: 6122708
2416 download_size: 828085216
2417 dataset_size: 4568256711
2418 - config_name: 20231101.ch
2419 features:
2420 - name: id
2421 dtype: string
2422 - name: url
2423 dtype: string
2424 - name: title
2425 dtype: string
2426 - name: text
2427 dtype: string
2428 splits:
2429 - name: train
2430 num_bytes: 178002
2431 num_examples: 576
2432 download_size: 89277
2433 dataset_size: 178002
2434 - config_name: 20231101.chr
2435 features:
2436 - name: id
2437 dtype: string
2438 - name: url
2439 dtype: string
2440 - name: title
2441 dtype: string
2442 - name: text
2443 dtype: string
2444 splits:
2445 - name: train
2446 num_bytes: 767618
2447 num_examples: 1113
2448 download_size: 343140
2449 dataset_size: 767618
2450 - config_name: 20231101.chy
2451 features:
2452 - name: id
2453 dtype: string
2454 - name: url
2455 dtype: string
2456 - name: title
2457 dtype: string
2458 - name: text
2459 dtype: string
2460 splits:
2461 - name: train
2462 num_bytes: 148139
2463 num_examples: 802
2464 download_size: 75865
2465 dataset_size: 148139
2466 - config_name: 20231101.ckb
2467 features:
2468 - name: id
2469 dtype: string
2470 - name: url
2471 dtype: string
2472 - name: title
2473 dtype: string
2474 - name: text
2475 dtype: string
2476 splits:
2477 - name: train
2478 num_bytes: 107150420
2479 num_examples: 52024
2480 download_size: 42964544
2481 dataset_size: 107150420
2482 - config_name: 20231101.co
2483 features:
2484 - name: id
2485 dtype: string
2486 - name: url
2487 dtype: string
2488 - name: title
2489 dtype: string
2490 - name: text
2491 dtype: string
2492 splits:
2493 - name: train
2494 num_bytes: 11104243
2495 num_examples: 7799
2496 download_size: 5794731
2497 dataset_size: 11104243
2498 - config_name: 20231101.cr
2499 features:
2500 - name: id
2501 dtype: string
2502 - name: url
2503 dtype: string
2504 - name: title
2505 dtype: string
2506 - name: text
2507 dtype: string
2508 splits:
2509 - name: train
2510 num_bytes: 57257
2511 num_examples: 187
2512 download_size: 36081
2513 dataset_size: 57257
2514 - config_name: 20231101.crh
2515 features:
2516 - name: id
2517 dtype: string
2518 - name: url
2519 dtype: string
2520 - name: title
2521 dtype: string
2522 - name: text
2523 dtype: string
2524 splits:
2525 - name: train
2526 num_bytes: 9689171
2527 num_examples: 27691
2528 download_size: 3654461
2529 dataset_size: 9689171
2530 - config_name: 20231101.cs
2531 features:
2532 - name: id
2533 dtype: string
2534 - name: url
2535 dtype: string
2536 - name: title
2537 dtype: string
2538 - name: text
2539 dtype: string
2540 splits:
2541 - name: train
2542 num_bytes: 1566286962
2543 num_examples: 534044
2544 download_size: 976484249
2545 dataset_size: 1566286962
2546 - config_name: 20231101.csb
2547 features:
2548 - name: id
2549 dtype: string
2550 - name: url
2551 dtype: string
2552 - name: title
2553 dtype: string
2554 - name: text
2555 dtype: string
2556 splits:
2557 - name: train
2558 num_bytes: 3748643
2559 num_examples: 5480
2560 download_size: 2055233
2561 dataset_size: 3748643
2562 - config_name: 20231101.cu
2563 features:
2564 - name: id
2565 dtype: string
2566 - name: url
2567 dtype: string
2568 - name: title
2569 dtype: string
2570 - name: text
2571 dtype: string
2572 splits:
2573 - name: train
2574 num_bytes: 981592
2575 num_examples: 1235
2576 download_size: 398252
2577 dataset_size: 981592
2578 - config_name: 20231101.cv
2579 features:
2580 - name: id
2581 dtype: string
2582 - name: url
2583 dtype: string
2584 - name: title
2585 dtype: string
2586 - name: text
2587 dtype: string
2588 splits:
2589 - name: train
2590 num_bytes: 81873026
2591 num_examples: 51863
2592 download_size: 29640641
2593 dataset_size: 81873026
2594 - config_name: 20231101.cy
2595 features:
2596 - name: id
2597 dtype: string
2598 - name: url
2599 dtype: string
2600 - name: title
2601 dtype: string
2602 - name: text
2603 dtype: string
2604 splits:
2605 - name: train
2606 num_bytes: 305837783
2607 num_examples: 279455
2608 download_size: 112257456
2609 dataset_size: 305837783
2610 - config_name: 20231101.da
2611 features:
2612 - name: id
2613 dtype: string
2614 - name: url
2615 dtype: string
2616 - name: title
2617 dtype: string
2618 - name: text
2619 dtype: string
2620 splits:
2621 - name: train
2622 num_bytes: 547068330
2623 num_examples: 295347
2624 download_size: 327688122
2625 dataset_size: 547068330
2626 - config_name: 20231101.dag
2627 features:
2628 - name: id
2629 dtype: string
2630 - name: url
2631 dtype: string
2632 - name: title
2633 dtype: string
2634 - name: text
2635 dtype: string
2636 splits:
2637 - name: train
2638 num_bytes: 21618973
2639 num_examples: 10071
2640 download_size: 9026986
2641 dataset_size: 21618973
2642 - config_name: 20231101.de
2643 features:
2644 - name: id
2645 dtype: string
2646 - name: url
2647 dtype: string
2648 - name: title
2649 dtype: string
2650 - name: text
2651 dtype: string
2652 splits:
2653 - name: train
2654 num_bytes: 9622925305
2655 num_examples: 2845308
2656 download_size: 5771317942
2657 dataset_size: 9622925305
2658 - config_name: 20231101.din
2659 features:
2660 - name: id
2661 dtype: string
2662 - name: url
2663 dtype: string
2664 - name: title
2665 dtype: string
2666 - name: text
2667 dtype: string
2668 splits:
2669 - name: train
2670 num_bytes: 564398
2671 num_examples: 512
2672 download_size: 340530
2673 dataset_size: 564398
2674 - config_name: 20231101.diq
2675 features:
2676 - name: id
2677 dtype: string
2678 - name: url
2679 dtype: string
2680 - name: title
2681 dtype: string
2682 - name: text
2683 dtype: string
2684 splits:
2685 - name: train
2686 num_bytes: 19671441
2687 num_examples: 41775
2688 download_size: 7616839
2689 dataset_size: 19671441
2690 - config_name: 20231101.dsb
2691 features:
2692 - name: id
2693 dtype: string
2694 - name: url
2695 dtype: string
2696 - name: title
2697 dtype: string
2698 - name: text
2699 dtype: string
2700 splits:
2701 - name: train
2702 num_bytes: 3315228
2703 num_examples: 3379
2704 download_size: 1931937
2705 dataset_size: 3315228
2706 - config_name: 20231101.dty
2707 features:
2708 - name: id
2709 dtype: string
2710 - name: url
2711 dtype: string
2712 - name: title
2713 dtype: string
2714 - name: text
2715 dtype: string
2716 splits:
2717 - name: train
2718 num_bytes: 7030648
2719 num_examples: 3632
2720 download_size: 2521250
2721 dataset_size: 7030648
2722 - config_name: 20231101.dv
2723 features:
2724 - name: id
2725 dtype: string
2726 - name: url
2727 dtype: string
2728 - name: title
2729 dtype: string
2730 - name: text
2731 dtype: string
2732 splits:
2733 - name: train
2734 num_bytes: 13934393
2735 num_examples: 4352
2736 download_size: 5283133
2737 dataset_size: 13934393
2738 - config_name: 20231101.dz
2739 features:
2740 - name: id
2741 dtype: string
2742 - name: url
2743 dtype: string
2744 - name: title
2745 dtype: string
2746 - name: text
2747 dtype: string
2748 splits:
2749 - name: train
2750 num_bytes: 8855969
2751 num_examples: 788
2752 download_size: 2583520
2753 dataset_size: 8855969
2754 - config_name: 20231101.ee
2755 features:
2756 - name: id
2757 dtype: string
2758 - name: url
2759 dtype: string
2760 - name: title
2761 dtype: string
2762 - name: text
2763 dtype: string
2764 splits:
2765 - name: train
2766 num_bytes: 898491
2767 num_examples: 1181
2768 download_size: 492813
2769 dataset_size: 898491
2770 - config_name: 20231101.el
2771 features:
2772 - name: id
2773 dtype: string
2774 - name: url
2775 dtype: string
2776 - name: title
2777 dtype: string
2778 - name: text
2779 dtype: string
2780 splits:
2781 - name: train
2782 num_bytes: 1345589075
2783 num_examples: 226834
2784 download_size: 637372489
2785 dataset_size: 1345589075
2786 - config_name: 20231101.eml
2787 features:
2788 - name: id
2789 dtype: string
2790 - name: url
2791 dtype: string
2792 - name: title
2793 dtype: string
2794 - name: text
2795 dtype: string
2796 splits:
2797 - name: train
2798 num_bytes: 3625415
2799 num_examples: 12961
2800 download_size: 1689575
2801 dataset_size: 3625415
2802 - config_name: 20231101.en
2803 features:
2804 - name: id
2805 dtype: string
2806 - name: url
2807 dtype: string
2808 - name: title
2809 dtype: string
2810 - name: text
2811 dtype: string
2812 splits:
2813 - name: train
2814 num_bytes: 20200062385
2815 num_examples: 6407814
2816 download_size: 11630929031
2817 dataset_size: 20200062385
2818 - config_name: 20231101.eo
2819 features:
2820 - name: id
2821 dtype: string
2822 - name: url
2823 dtype: string
2824 - name: title
2825 dtype: string
2826 - name: text
2827 dtype: string
2828 splits:
2829 - name: train
2830 num_bytes: 523113804
2831 num_examples: 344851
2832 download_size: 297738138
2833 dataset_size: 523113804
2834 - config_name: 20231101.es
2835 features:
2836 - name: id
2837 dtype: string
2838 - name: url
2839 dtype: string
2840 - name: title
2841 dtype: string
2842 - name: text
2843 dtype: string
2844 splits:
2845 - name: train
2846 num_bytes: 6033536133
2847 num_examples: 1841155
2848 download_size: 3493595869
2849 dataset_size: 6033536133
2850 - config_name: 20231101.et
2851 features:
2852 - name: id
2853 dtype: string
2854 - name: url
2855 dtype: string
2856 - name: title
2857 dtype: string
2858 - name: text
2859 dtype: string
2860 splits:
2861 - name: train
2862 num_bytes: 440177170
2863 num_examples: 240397
2864 download_size: 265444734
2865 dataset_size: 440177170
2866 - config_name: 20231101.eu
2867 features:
2868 - name: id
2869 dtype: string
2870 - name: url
2871 dtype: string
2872 - name: title
2873 dtype: string
2874 - name: text
2875 dtype: string
2876 splits:
2877 - name: train
2878 num_bytes: 565567318
2879 num_examples: 416347
2880 download_size: 270355505
2881 dataset_size: 565567318
2882 - config_name: 20231101.ext
2883 features:
2884 - name: id
2885 dtype: string
2886 - name: url
2887 dtype: string
2888 - name: title
2889 dtype: string
2890 - name: text
2891 dtype: string
2892 splits:
2893 - name: train
2894 num_bytes: 4389633
2895 num_examples: 3785
2896 download_size: 2761099
2897 dataset_size: 4389633
2898 - config_name: 20231101.fa
2899 features:
2900 - name: id
2901 dtype: string
2902 - name: url
2903 dtype: string
2904 - name: title
2905 dtype: string
2906 - name: text
2907 dtype: string
2908 splits:
2909 - name: train
2910 num_bytes: 1899154938
2911 num_examples: 979869
2912 download_size: 759368283
2913 dataset_size: 1899154938
2914 - config_name: 20231101.fat
2915 features:
2916 - name: id
2917 dtype: string
2918 - name: url
2919 dtype: string
2920 - name: title
2921 dtype: string
2922 - name: text
2923 dtype: string
2924 splits:
2925 - name: train
2926 num_bytes: 2032812
2927 num_examples: 1122
2928 download_size: 1124684
2929 dataset_size: 2032812
2930 - config_name: 20231101.ff
2931 features:
2932 - name: id
2933 dtype: string
2934 - name: url
2935 dtype: string
2936 - name: title
2937 dtype: string
2938 - name: text
2939 dtype: string
2940 splits:
2941 - name: train
2942 num_bytes: 1867995
2943 num_examples: 2419
2944 download_size: 1087702
2945 dataset_size: 1867995
2946 - config_name: 20231101.fi
2947 features:
2948 - name: id
2949 dtype: string
2950 - name: url
2951 dtype: string
2952 - name: title
2953 dtype: string
2954 - name: text
2955 dtype: string
2956 splits:
2957 - name: train
2958 num_bytes: 1146146663
2959 num_examples: 561598
2960 download_size: 680512230
2961 dataset_size: 1146146663
2962 - config_name: 20231101.fiu-vro
2963 features:
2964 - name: id
2965 dtype: string
2966 - name: url
2967 dtype: string
2968 - name: title
2969 dtype: string
2970 - name: text
2971 dtype: string
2972 splits:
2973 - name: train
2974 num_bytes: 4636361
2975 num_examples: 6590
2976 download_size: 2434159
2977 dataset_size: 4636361
2978 - config_name: 20231101.fj
2979 features:
2980 - name: id
2981 dtype: string
2982 - name: url
2983 dtype: string
2984 - name: title
2985 dtype: string
2986 - name: text
2987 dtype: string
2988 splits:
2989 - name: train
2990 num_bytes: 604791
2991 num_examples: 1294
2992 download_size: 328059
2993 dataset_size: 604791
2994 - config_name: 20231101.fo
2995 features:
2996 - name: id
2997 dtype: string
2998 - name: url
2999 dtype: string
3000 - name: title
3001 dtype: string
3002 - name: text
3003 dtype: string
3004 splits:
3005 - name: train
3006 num_bytes: 15415249
3007 num_examples: 14080
3008 download_size: 8857239
3009 dataset_size: 15415249
3010 - config_name: 20231101.fon
3011 features:
3012 - name: id
3013 dtype: string
3014 - name: url
3015 dtype: string
3016 - name: title
3017 dtype: string
3018 - name: text
3019 dtype: string
3020 splits:
3021 - name: train
3022 num_bytes: 592216
3023 num_examples: 705
3024 download_size: 317444
3025 dataset_size: 592216
3026 - config_name: 20231101.fr
3027 features:
3028 - name: id
3029 dtype: string
3030 - name: url
3031 dtype: string
3032 - name: title
3033 dtype: string
3034 - name: text
3035 dtype: string
3036 splits:
3037 - name: train
3038 num_bytes: 8065794826
3039 num_examples: 2564646
3040 download_size: 4614488286
3041 dataset_size: 8065794826
3042 - config_name: 20231101.frp
3043 features:
3044 - name: id
3045 dtype: string
3046 - name: url
3047 dtype: string
3048 - name: title
3049 dtype: string
3050 - name: text
3051 dtype: string
3052 splits:
3053 - name: train
3054 num_bytes: 3676441
3055 num_examples: 5766
3056 download_size: 1914046
3057 dataset_size: 3676441
3058 - config_name: 20231101.frr
3059 features:
3060 - name: id
3061 dtype: string
3062 - name: url
3063 dtype: string
3064 - name: title
3065 dtype: string
3066 - name: text
3067 dtype: string
3068 splits:
3069 - name: train
3070 num_bytes: 10819914
3071 num_examples: 18666
3072 download_size: 5317694
3073 dataset_size: 10819914
3074 - config_name: 20231101.fur
3075 features:
3076 - name: id
3077 dtype: string
3078 - name: url
3079 dtype: string
3080 - name: title
3081 dtype: string
3082 - name: text
3083 dtype: string
3084 splits:
3085 - name: train
3086 num_bytes: 4090412
3087 num_examples: 4001
3088 download_size: 2421238
3089 dataset_size: 4090412
3090 - config_name: 20231101.fy
3091 features:
3092 - name: id
3093 dtype: string
3094 - name: url
3095 dtype: string
3096 - name: title
3097 dtype: string
3098 - name: text
3099 dtype: string
3100 splits:
3101 - name: train
3102 num_bytes: 134196708
3103 num_examples: 52416
3104 download_size: 76002257
3105 dataset_size: 134196708
3106 - config_name: 20231101.ga
3107 features:
3108 - name: id
3109 dtype: string
3110 - name: url
3111 dtype: string
3112 - name: title
3113 dtype: string
3114 - name: text
3115 dtype: string
3116 splits:
3117 - name: train
3118 num_bytes: 60640820
3119 num_examples: 59156
3120 download_size: 34136733
3121 dataset_size: 60640820
3122 - config_name: 20231101.gag
3123 features:
3124 - name: id
3125 dtype: string
3126 - name: url
3127 dtype: string
3128 - name: title
3129 dtype: string
3130 - name: text
3131 dtype: string
3132 splits:
3133 - name: train
3134 num_bytes: 2428849
3135 num_examples: 2968
3136 download_size: 1331866
3137 dataset_size: 2428849
3138 - config_name: 20231101.gan
3139 features:
3140 - name: id
3141 dtype: string
3142 - name: url
3143 dtype: string
3144 - name: title
3145 dtype: string
3146 - name: text
3147 dtype: string
3148 splits:
3149 - name: train
3150 num_bytes: 2915229
3151 num_examples: 6743
3152 download_size: 1508844
3153 dataset_size: 2915229
3154 - config_name: 20231101.gcr
3155 features:
3156 - name: id
3157 dtype: string
3158 - name: url
3159 dtype: string
3160 - name: title
3161 dtype: string
3162 - name: text
3163 dtype: string
3164 splits:
3165 - name: train
3166 num_bytes: 2338277
3167 num_examples: 2399
3168 download_size: 1345482
3169 dataset_size: 2338277
3170 - config_name: 20231101.gd
3171 features:
3172 - name: id
3173 dtype: string
3174 - name: url
3175 dtype: string
3176 - name: title
3177 dtype: string
3178 - name: text
3179 dtype: string
3180 splits:
3181 - name: train
3182 num_bytes: 14051607
3183 num_examples: 15979
3184 download_size: 7190137
3185 dataset_size: 14051607
3186 - config_name: 20231101.gl
3187 features:
3188 - name: id
3189 dtype: string
3190 - name: url
3191 dtype: string
3192 - name: title
3193 dtype: string
3194 - name: text
3195 dtype: string
3196 splits:
3197 - name: train
3198 num_bytes: 493905881
3199 num_examples: 200092
3200 download_size: 291104907
3201 dataset_size: 493905881
3202 - config_name: 20231101.glk
3203 features:
3204 - name: id
3205 dtype: string
3206 - name: url
3207 dtype: string
3208 - name: title
3209 dtype: string
3210 - name: text
3211 dtype: string
3212 splits:
3213 - name: train
3214 num_bytes: 6086185
3215 num_examples: 7049
3216 download_size: 2382997
3217 dataset_size: 6086185
3218 - config_name: 20231101.gn
3219 features:
3220 - name: id
3221 dtype: string
3222 - name: url
3223 dtype: string
3224 - name: title
3225 dtype: string
3226 - name: text
3227 dtype: string
3228 splits:
3229 - name: train
3230 num_bytes: 6921948
3231 num_examples: 5519
3232 download_size: 3806548
3233 dataset_size: 6921948
3234 - config_name: 20231101.gom
3235 features:
3236 - name: id
3237 dtype: string
3238 - name: url
3239 dtype: string
3240 - name: title
3241 dtype: string
3242 - name: text
3243 dtype: string
3244 splits:
3245 - name: train
3246 num_bytes: 30889533
3247 num_examples: 4259
3248 download_size: 11306217
3249 dataset_size: 30889533
3250 - config_name: 20231101.gor
3251 features:
3252 - name: id
3253 dtype: string
3254 - name: url
3255 dtype: string
3256 - name: title
3257 dtype: string
3258 - name: text
3259 dtype: string
3260 splits:
3261 - name: train
3262 num_bytes: 6369540
3263 num_examples: 15359
3264 download_size: 2101154
3265 dataset_size: 6369540
3266 - config_name: 20231101.got
3267 features:
3268 - name: id
3269 dtype: string
3270 - name: url
3271 dtype: string
3272 - name: title
3273 dtype: string
3274 - name: text
3275 dtype: string
3276 splits:
3277 - name: train
3278 num_bytes: 1533770
3279 num_examples: 1013
3280 download_size: 636307
3281 dataset_size: 1533770
3282 - config_name: 20231101.gpe
3283 features:
3284 - name: id
3285 dtype: string
3286 - name: url
3287 dtype: string
3288 - name: title
3289 dtype: string
3290 - name: text
3291 dtype: string
3292 splits:
3293 - name: train
3294 num_bytes: 2017667
3295 num_examples: 1110
3296 download_size: 1141261
3297 dataset_size: 2017667
3298 - config_name: 20231101.gu
3299 features:
3300 - name: id
3301 dtype: string
3302 - name: url
3303 dtype: string
3304 - name: title
3305 dtype: string
3306 - name: text
3307 dtype: string
3308 splits:
3309 - name: train
3310 num_bytes: 121282557
3311 num_examples: 30445
3312 download_size: 39554078
3313 dataset_size: 121282557
3314 - config_name: 20231101.guc
3315 features:
3316 - name: id
3317 dtype: string
3318 - name: url
3319 dtype: string
3320 - name: title
3321 dtype: string
3322 - name: text
3323 dtype: string
3324 splits:
3325 - name: train
3326 num_bytes: 978923
3327 num_examples: 679
3328 download_size: 578311
3329 dataset_size: 978923
3330 - config_name: 20231101.gur
3331 features:
3332 - name: id
3333 dtype: string
3334 - name: url
3335 dtype: string
3336 - name: title
3337 dtype: string
3338 - name: text
3339 dtype: string
3340 splits:
3341 - name: train
3342 num_bytes: 2325435
3343 num_examples: 1383
3344 download_size: 1068954
3345 dataset_size: 2325435
3346 - config_name: 20231101.guw
3347 features:
3348 - name: id
3349 dtype: string
3350 - name: url
3351 dtype: string
3352 - name: title
3353 dtype: string
3354 - name: text
3355 dtype: string
3356 splits:
3357 - name: train
3358 num_bytes: 1913143
3359 num_examples: 1312
3360 download_size: 1042328
3361 dataset_size: 1913143
3362 - config_name: 20231101.gv
3363 features:
3364 - name: id
3365 dtype: string
3366 - name: url
3367 dtype: string
3368 - name: title
3369 dtype: string
3370 - name: text
3371 dtype: string
3372 splits:
3373 - name: train
3374 num_bytes: 6307253
3375 num_examples: 6206
3376 download_size: 3347095
3377 dataset_size: 6307253
3378 - config_name: 20231101.ha
3379 features:
3380 - name: id
3381 dtype: string
3382 - name: url
3383 dtype: string
3384 - name: title
3385 dtype: string
3386 - name: text
3387 dtype: string
3388 splits:
3389 - name: train
3390 num_bytes: 77906472
3391 num_examples: 36492
3392 download_size: 43131815
3393 dataset_size: 77906472
3394 - config_name: 20231101.hak
3395 features:
3396 - name: id
3397 dtype: string
3398 - name: url
3399 dtype: string
3400 - name: title
3401 dtype: string
3402 - name: text
3403 dtype: string
3404 splits:
3405 - name: train
3406 num_bytes: 4523680
3407 num_examples: 10246
3408 download_size: 1878558
3409 dataset_size: 4523680
3410 - config_name: 20231101.haw
3411 features:
3412 - name: id
3413 dtype: string
3414 - name: url
3415 dtype: string
3416 - name: title
3417 dtype: string
3418 - name: text
3419 dtype: string
3420 splits:
3421 - name: train
3422 num_bytes: 1677790
3423 num_examples: 2612
3424 download_size: 696781
3425 dataset_size: 1677790
3426 - config_name: 20231101.he
3427 features:
3428 - name: id
3429 dtype: string
3430 - name: url
3431 dtype: string
3432 - name: title
3433 dtype: string
3434 - name: text
3435 dtype: string
3436 splits:
3437 - name: train
3438 num_bytes: 1950200381
3439 num_examples: 333874
3440 download_size: 979183998
3441 dataset_size: 1950200381
3442 - config_name: 20231101.hi
3443 features:
3444 - name: id
3445 dtype: string
3446 - name: url
3447 dtype: string
3448 - name: title
3449 dtype: string
3450 - name: text
3451 dtype: string
3452 splits:
3453 - name: train
3454 num_bytes: 672817362
3455 num_examples: 163093
3456 download_size: 237834604
3457 dataset_size: 672817362
3458 - config_name: 20231101.hif
3459 features:
3460 - name: id
3461 dtype: string
3462 - name: url
3463 dtype: string
3464 - name: title
3465 dtype: string
3466 - name: text
3467 dtype: string
3468 splits:
3469 - name: train
3470 num_bytes: 5685329
3471 num_examples: 10986
3472 download_size: 2715682
3473 dataset_size: 5685329
3474 - config_name: 20231101.hr
3475 features:
3476 - name: id
3477 dtype: string
3478 - name: url
3479 dtype: string
3480 - name: title
3481 dtype: string
3482 - name: text
3483 dtype: string
3484 splits:
3485 - name: train
3486 num_bytes: 443636903
3487 num_examples: 202848
3488 download_size: 275245343
3489 dataset_size: 443636903
3490 - config_name: 20231101.hsb
3491 features:
3492 - name: id
3493 dtype: string
3494 - name: url
3495 dtype: string
3496 - name: title
3497 dtype: string
3498 - name: text
3499 dtype: string
3500 splits:
3501 - name: train
3502 num_bytes: 15667118
3503 num_examples: 13957
3504 download_size: 7437491
3505 dataset_size: 15667118
3506 - config_name: 20231101.ht
3507 features:
3508 - name: id
3509 dtype: string
3510 - name: url
3511 dtype: string
3512 - name: title
3513 dtype: string
3514 - name: text
3515 dtype: string
3516 splits:
3517 - name: train
3518 num_bytes: 55088040
3519 num_examples: 70159
3520 download_size: 21993952
3521 dataset_size: 55088040
3522 - config_name: 20231101.hu
3523 features:
3524 - name: id
3525 dtype: string
3526 - name: url
3527 dtype: string
3528 - name: title
3529 dtype: string
3530 - name: text
3531 dtype: string
3532 splits:
3533 - name: train
3534 num_bytes: 1515899113
3535 num_examples: 532427
3536 download_size: 904857314
3537 dataset_size: 1515899113
3538 - config_name: 20231101.hy
3539 features:
3540 - name: id
3541 dtype: string
3542 - name: url
3543 dtype: string
3544 - name: title
3545 dtype: string
3546 - name: text
3547 dtype: string
3548 splits:
3549 - name: train
3550 num_bytes: 1179459973
3551 num_examples: 303036
3552 download_size: 490121120
3553 dataset_size: 1179459973
3554 - config_name: 20231101.hyw
3555 features:
3556 - name: id
3557 dtype: string
3558 - name: url
3559 dtype: string
3560 - name: title
3561 dtype: string
3562 - name: text
3563 dtype: string
3564 splits:
3565 - name: train
3566 num_bytes: 59564550
3567 num_examples: 11725
3568 download_size: 27450541
3569 dataset_size: 59564550
3570 - config_name: 20231101.ia
3571 features:
3572 - name: id
3573 dtype: string
3574 - name: url
3575 dtype: string
3576 - name: title
3577 dtype: string
3578 - name: text
3579 dtype: string
3580 splits:
3581 - name: train
3582 num_bytes: 16409449
3583 num_examples: 28247
3584 download_size: 8237640
3585 dataset_size: 16409449
3586 - config_name: 20231101.id
3587 features:
3588 - name: id
3589 dtype: string
3590 - name: url
3591 dtype: string
3592 - name: title
3593 dtype: string
3594 - name: text
3595 dtype: string
3596 splits:
3597 - name: train
3598 num_bytes: 1125928594
3599 num_examples: 665622
3600 download_size: 583801799
3601 dataset_size: 1125928594
3602 - config_name: 20231101.ie
3603 features:
3604 - name: id
3605 dtype: string
3606 - name: url
3607 dtype: string
3608 - name: title
3609 dtype: string
3610 - name: text
3611 dtype: string
3612 splits:
3613 - name: train
3614 num_bytes: 6737711
3615 num_examples: 11877
3616 download_size: 3019044
3617 dataset_size: 6737711
3618 - config_name: 20231101.ig
3619 features:
3620 - name: id
3621 dtype: string
3622 - name: url
3623 dtype: string
3624 - name: title
3625 dtype: string
3626 - name: text
3627 dtype: string
3628 splits:
3629 - name: train
3630 num_bytes: 66086115
3631 num_examples: 22908
3632 download_size: 34663540
3633 dataset_size: 66086115
3634 - config_name: 20231101.ik
3635 features:
3636 - name: id
3637 dtype: string
3638 - name: url
3639 dtype: string
3640 - name: title
3641 dtype: string
3642 - name: text
3643 dtype: string
3644 splits:
3645 - name: train
3646 num_bytes: 199773
3647 num_examples: 846
3648 download_size: 115758
3649 dataset_size: 199773
3650 - config_name: 20231101.ilo
3651 features:
3652 - name: id
3653 dtype: string
3654 - name: url
3655 dtype: string
3656 - name: title
3657 dtype: string
3658 - name: text
3659 dtype: string
3660 splits:
3661 - name: train
3662 num_bytes: 16854494
3663 num_examples: 15371
3664 download_size: 7352572
3665 dataset_size: 16854494
3666 - config_name: 20231101.inh
3667 features:
3668 - name: id
3669 dtype: string
3670 - name: url
3671 dtype: string
3672 - name: title
3673 dtype: string
3674 - name: text
3675 dtype: string
3676 splits:
3677 - name: train
3678 num_bytes: 2727253
3679 num_examples: 2123
3680 download_size: 1279524
3681 dataset_size: 2727253
3682 - config_name: 20231101.io
3683 features:
3684 - name: id
3685 dtype: string
3686 - name: url
3687 dtype: string
3688 - name: title
3689 dtype: string
3690 - name: text
3691 dtype: string
3692 splits:
3693 - name: train
3694 num_bytes: 38735196
3695 num_examples: 40930
3696 download_size: 17106040
3697 dataset_size: 38735196
3698 - config_name: 20231101.is
3699 features:
3700 - name: id
3701 dtype: string
3702 - name: url
3703 dtype: string
3704 - name: title
3705 dtype: string
3706 - name: text
3707 dtype: string
3708 splits:
3709 - name: train
3710 num_bytes: 87856729
3711 num_examples: 57453
3712 download_size: 52286137
3713 dataset_size: 87856729
3714 - config_name: 20231101.it
3715 features:
3716 - name: id
3717 dtype: string
3718 - name: url
3719 dtype: string
3720 - name: title
3721 dtype: string
3722 - name: text
3723 dtype: string
3724 splits:
3725 - name: train
3726 num_bytes: 4924856310
3727 num_examples: 1833639
3728 download_size: 2931265519
3729 dataset_size: 4924856310
3730 - config_name: 20231101.iu
3731 features:
3732 - name: id
3733 dtype: string
3734 - name: url
3735 dtype: string
3736 - name: title
3737 dtype: string
3738 - name: text
3739 dtype: string
3740 splits:
3741 - name: train
3742 num_bytes: 291185
3743 num_examples: 562
3744 download_size: 136987
3745 dataset_size: 291185
3746 - config_name: 20231101.ja
3747 features:
3748 - name: id
3749 dtype: string
3750 - name: url
3751 dtype: string
3752 - name: title
3753 dtype: string
3754 - name: text
3755 dtype: string
3756 splits:
3757 - name: train
3758 num_bytes: 7039610767
3759 num_examples: 1389467
3760 download_size: 3941998526
3761 dataset_size: 7039610767
3762 - config_name: 20231101.jam
3763 features:
3764 - name: id
3765 dtype: string
3766 - name: url
3767 dtype: string
3768 - name: title
3769 dtype: string
3770 - name: text
3771 dtype: string
3772 splits:
3773 - name: train
3774 num_bytes: 1142348
3775 num_examples: 1780
3776 download_size: 702664
3777 dataset_size: 1142348
3778 - config_name: 20231101.jbo
3779 features:
3780 - name: id
3781 dtype: string
3782 - name: url
3783 dtype: string
3784 - name: title
3785 dtype: string
3786 - name: text
3787 dtype: string
3788 splits:
3789 - name: train
3790 num_bytes: 2523538
3791 num_examples: 1394
3792 download_size: 890356
3793 dataset_size: 2523538
3794 - config_name: 20231101.jv
3795 features:
3796 - name: id
3797 dtype: string
3798 - name: url
3799 dtype: string
3800 - name: title
3801 dtype: string
3802 - name: text
3803 dtype: string
3804 splits:
3805 - name: train
3806 num_bytes: 72786688
3807 num_examples: 73380
3808 download_size: 36852134
3809 dataset_size: 72786688
3810 - config_name: 20231101.ka
3811 features:
3812 - name: id
3813 dtype: string
3814 - name: url
3815 dtype: string
3816 - name: title
3817 dtype: string
3818 - name: text
3819 dtype: string
3820 splits:
3821 - name: train
3822 num_bytes: 699872960
3823 num_examples: 169602
3824 download_size: 239987665
3825 dataset_size: 699872960
3826 - config_name: 20231101.kaa
3827 features:
3828 - name: id
3829 dtype: string
3830 - name: url
3831 dtype: string
3832 - name: title
3833 dtype: string
3834 - name: text
3835 dtype: string
3836 splits:
3837 - name: train
3838 num_bytes: 5139436
3839 num_examples: 4074
3840 download_size: 2913134
3841 dataset_size: 5139436
3842 - config_name: 20231101.kab
3843 features:
3844 - name: id
3845 dtype: string
3846 - name: url
3847 dtype: string
3848 - name: title
3849 dtype: string
3850 - name: text
3851 dtype: string
3852 splits:
3853 - name: train
3854 num_bytes: 4392542
3855 num_examples: 5830
3856 download_size: 2580584
3857 dataset_size: 4392542
3858 - config_name: 20231101.kbd
3859 features:
3860 - name: id
3861 dtype: string
3862 - name: url
3863 dtype: string
3864 - name: title
3865 dtype: string
3866 - name: text
3867 dtype: string
3868 splits:
3869 - name: train
3870 num_bytes: 3014575
3871 num_examples: 1670
3872 download_size: 1304580
3873 dataset_size: 3014575
3874 - config_name: 20231101.kbp
3875 features:
3876 - name: id
3877 dtype: string
3878 - name: url
3879 dtype: string
3880 - name: title
3881 dtype: string
3882 - name: text
3883 dtype: string
3884 splits:
3885 - name: train
3886 num_bytes: 3584563
3887 num_examples: 1931
3888 download_size: 1806400
3889 dataset_size: 3584563
3890 - config_name: 20231101.kcg
3891 features:
3892 - name: id
3893 dtype: string
3894 - name: url
3895 dtype: string
3896 - name: title
3897 dtype: string
3898 - name: text
3899 dtype: string
3900 splits:
3901 - name: train
3902 num_bytes: 914665
3903 num_examples: 1151
3904 download_size: 513904
3905 dataset_size: 914665
3906 - config_name: 20231101.kg
3907 features:
3908 - name: id
3909 dtype: string
3910 - name: url
3911 dtype: string
3912 - name: title
3913 dtype: string
3914 - name: text
3915 dtype: string
3916 splits:
3917 - name: train
3918 num_bytes: 390163
3919 num_examples: 1329
3920 download_size: 209059
3921 dataset_size: 390163
3922 - config_name: 20231101.ki
3923 features:
3924 - name: id
3925 dtype: string
3926 - name: url
3927 dtype: string
3928 - name: title
3929 dtype: string
3930 - name: text
3931 dtype: string
3932 splits:
3933 - name: train
3934 num_bytes: 760980
3935 num_examples: 1668
3936 download_size: 427003
3937 dataset_size: 760980
3938 - config_name: 20231101.kk
3939 features:
3940 - name: id
3941 dtype: string
3942 - name: url
3943 dtype: string
3944 - name: title
3945 dtype: string
3946 - name: text
3947 dtype: string
3948 splits:
3949 - name: train
3950 num_bytes: 497917145
3951 num_examples: 238615
3952 download_size: 180750520
3953 dataset_size: 497917145
3954 - config_name: 20231101.kl
3955 features:
3956 - name: id
3957 dtype: string
3958 - name: url
3959 dtype: string
3960 - name: title
3961 dtype: string
3962 - name: text
3963 dtype: string
3964 splits:
3965 - name: train
3966 num_bytes: 313658
3967 num_examples: 301
3968 download_size: 193719
3969 dataset_size: 313658
3970 - config_name: 20231101.km
3971 features:
3972 - name: id
3973 dtype: string
3974 - name: url
3975 dtype: string
3976 - name: title
3977 dtype: string
3978 - name: text
3979 dtype: string
3980 splits:
3981 - name: train
3982 num_bytes: 103252582
3983 num_examples: 11994
3984 download_size: 35567417
3985 dataset_size: 103252582
3986 - config_name: 20231101.kn
3987 features:
3988 - name: id
3989 dtype: string
3990 - name: url
3991 dtype: string
3992 - name: title
3993 dtype: string
3994 - name: text
3995 dtype: string
3996 splits:
3997 - name: train
3998 num_bytes: 402848197
3999 num_examples: 31437
4000 download_size: 147156434
4001 dataset_size: 402848197
4002 - config_name: 20231101.ko
4003 features:
4004 - name: id
4005 dtype: string
4006 - name: url
4007 dtype: string
4008 - name: title
4009 dtype: string
4010 - name: text
4011 dtype: string
4012 splits:
4013 - name: train
4014 num_bytes: 1412099944
4015 num_examples: 647897
4016 download_size: 782677061
4017 dataset_size: 1412099944
4018 - config_name: 20231101.koi
4019 features:
4020 - name: id
4021 dtype: string
4022 - name: url
4023 dtype: string
4024 - name: title
4025 dtype: string
4026 - name: text
4027 dtype: string
4028 splits:
4029 - name: train
4030 num_bytes: 5103799
4031 num_examples: 3504
4032 download_size: 1888392
4033 dataset_size: 5103799
4034 - config_name: 20231101.krc
4035 features:
4036 - name: id
4037 dtype: string
4038 - name: url
4039 dtype: string
4040 - name: title
4041 dtype: string
4042 - name: text
4043 dtype: string
4044 splits:
4045 - name: train
4046 num_bytes: 4589808
4047 num_examples: 2100
4048 download_size: 2022144
4049 dataset_size: 4589808
4050 - config_name: 20231101.ks
4051 features:
4052 - name: id
4053 dtype: string
4054 - name: url
4055 dtype: string
4056 - name: title
4057 dtype: string
4058 - name: text
4059 dtype: string
4060 splits:
4061 - name: train
4062 num_bytes: 2868186
4063 num_examples: 4307
4064 download_size: 1094458
4065 dataset_size: 2868186
4066 - config_name: 20231101.ksh
4067 features:
4068 - name: id
4069 dtype: string
4070 - name: url
4071 dtype: string
4072 - name: title
4073 dtype: string
4074 - name: text
4075 dtype: string
4076 splits:
4077 - name: train
4078 num_bytes: 3117003
4079 num_examples: 2945
4080 download_size: 2009928
4081 dataset_size: 3117003
4082 - config_name: 20231101.ku
4083 features:
4084 - name: id
4085 dtype: string
4086 - name: url
4087 dtype: string
4088 - name: title
4089 dtype: string
4090 - name: text
4091 dtype: string
4092 splits:
4093 - name: train
4094 num_bytes: 44523131
4095 num_examples: 63076
4096 download_size: 22938233
4097 dataset_size: 44523131
4098 - config_name: 20231101.kv
4099 features:
4100 - name: id
4101 dtype: string
4102 - name: url
4103 dtype: string
4104 - name: title
4105 dtype: string
4106 - name: text
4107 dtype: string
4108 splits:
4109 - name: train
4110 num_bytes: 9245577
4111 num_examples: 5595
4112 download_size: 3690978
4113 dataset_size: 9245577
4114 - config_name: 20231101.kw
4115 features:
4116 - name: id
4117 dtype: string
4118 - name: url
4119 dtype: string
4120 - name: title
4121 dtype: string
4122 - name: text
4123 dtype: string
4124 splits:
4125 - name: train
4126 num_bytes: 4687165
4127 num_examples: 6995
4128 download_size: 2711398
4129 dataset_size: 4687165
4130 - config_name: 20231101.ky
4131 features:
4132 - name: id
4133 dtype: string
4134 - name: url
4135 dtype: string
4136 - name: title
4137 dtype: string
4138 - name: text
4139 dtype: string
4140 splits:
4141 - name: train
4142 num_bytes: 166911089
4143 num_examples: 79438
4144 download_size: 63947035
4145 dataset_size: 166911089
4146 - config_name: 20231101.la
4147 features:
4148 - name: id
4149 dtype: string
4150 - name: url
4151 dtype: string
4152 - name: title
4153 dtype: string
4154 - name: text
4155 dtype: string
4156 splits:
4157 - name: train
4158 num_bytes: 141080163
4159 num_examples: 138263
4160 download_size: 76588430
4161 dataset_size: 141080163
4162 - config_name: 20231101.lad
4163 features:
4164 - name: id
4165 dtype: string
4166 - name: url
4167 dtype: string
4168 - name: title
4169 dtype: string
4170 - name: text
4171 dtype: string
4172 splits:
4173 - name: train
4174 num_bytes: 4901343
4175 num_examples: 3663
4176 download_size: 2754531
4177 dataset_size: 4901343
4178 - config_name: 20231101.lb
4179 features:
4180 - name: id
4181 dtype: string
4182 - name: url
4183 dtype: string
4184 - name: title
4185 dtype: string
4186 - name: text
4187 dtype: string
4188 splits:
4189 - name: train
4190 num_bytes: 88826996
4191 num_examples: 62414
4192 download_size: 50515020
4193 dataset_size: 88826996
4194 - config_name: 20231101.lbe
4195 features:
4196 - name: id
4197 dtype: string
4198 - name: url
4199 dtype: string
4200 - name: title
4201 dtype: string
4202 - name: text
4203 dtype: string
4204 splits:
4205 - name: train
4206 num_bytes: 745140
4207 num_examples: 1279
4208 download_size: 304394
4209 dataset_size: 745140
4210 - config_name: 20231101.lez
4211 features:
4212 - name: id
4213 dtype: string
4214 - name: url
4215 dtype: string
4216 - name: title
4217 dtype: string
4218 - name: text
4219 dtype: string
4220 splits:
4221 - name: train
4222 num_bytes: 9794637
4223 num_examples: 4264
4224 download_size: 3864848
4225 dataset_size: 9794637
4226 - config_name: 20231101.lfn
4227 features:
4228 - name: id
4229 dtype: string
4230 - name: url
4231 dtype: string
4232 - name: title
4233 dtype: string
4234 - name: text
4235 dtype: string
4236 splits:
4237 - name: train
4238 num_bytes: 8870685
4239 num_examples: 4832
4240 download_size: 5207546
4241 dataset_size: 8870685
4242 - config_name: 20231101.lg
4243 features:
4244 - name: id
4245 dtype: string
4246 - name: url
4247 dtype: string
4248 - name: title
4249 dtype: string
4250 - name: text
4251 dtype: string
4252 splits:
4253 - name: train
4254 num_bytes: 6891539
4255 num_examples: 4048
4256 download_size: 3708097
4257 dataset_size: 6891539
4258 - config_name: 20231101.li
4259 features:
4260 - name: id
4261 dtype: string
4262 - name: url
4263 dtype: string
4264 - name: title
4265 dtype: string
4266 - name: text
4267 dtype: string
4268 splits:
4269 - name: train
4270 num_bytes: 29633678
4271 num_examples: 14849
4272 download_size: 17727918
4273 dataset_size: 29633678
4274 - config_name: 20231101.lij
4275 features:
4276 - name: id
4277 dtype: string
4278 - name: url
4279 dtype: string
4280 - name: title
4281 dtype: string
4282 - name: text
4283 dtype: string
4284 splits:
4285 - name: train
4286 num_bytes: 11448686
4287 num_examples: 11203
4288 download_size: 6255409
4289 dataset_size: 11448686
4290 - config_name: 20231101.lld
4291 features:
4292 - name: id
4293 dtype: string
4294 - name: url
4295 dtype: string
4296 - name: title
4297 dtype: string
4298 - name: text
4299 dtype: string
4300 splits:
4301 - name: train
4302 num_bytes: 50163974
4303 num_examples: 180677
4304 download_size: 13866243
4305 dataset_size: 50163974
4306 - config_name: 20231101.lmo
4307 features:
4308 - name: id
4309 dtype: string
4310 - name: url
4311 dtype: string
4312 - name: title
4313 dtype: string
4314 - name: text
4315 dtype: string
4316 splits:
4317 - name: train
4318 num_bytes: 43496783
4319 num_examples: 73510
4320 download_size: 19142356
4321 dataset_size: 43496783
4322 - config_name: 20231101.ln
4323 features:
4324 - name: id
4325 dtype: string
4326 - name: url
4327 dtype: string
4328 - name: title
4329 dtype: string
4330 - name: text
4331 dtype: string
4332 splits:
4333 - name: train
4334 num_bytes: 2035050
4335 num_examples: 3534
4336 download_size: 1122138
4337 dataset_size: 2035050
4338 - config_name: 20231101.lo
4339 features:
4340 - name: id
4341 dtype: string
4342 - name: url
4343 dtype: string
4344 - name: title
4345 dtype: string
4346 - name: text
4347 dtype: string
4348 splits:
4349 - name: train
4350 num_bytes: 15283258
4351 num_examples: 5014
4352 download_size: 5646554
4353 dataset_size: 15283258
4354 - config_name: 20231101.lt
4355 features:
4356 - name: id
4357 dtype: string
4358 - name: url
4359 dtype: string
4360 - name: title
4361 dtype: string
4362 - name: text
4363 dtype: string
4364 splits:
4365 - name: train
4366 num_bytes: 336559824
4367 num_examples: 211292
4368 download_size: 194873569
4369 dataset_size: 336559824
4370 - config_name: 20231101.ltg
4371 features:
4372 - name: id
4373 dtype: string
4374 - name: url
4375 dtype: string
4376 - name: title
4377 dtype: string
4378 - name: text
4379 dtype: string
4380 splits:
4381 - name: train
4382 num_bytes: 915364
4383 num_examples: 1070
4384 download_size: 530299
4385 dataset_size: 915364
4386 - config_name: 20231101.lv
4387 features:
4388 - name: id
4389 dtype: string
4390 - name: url
4391 dtype: string
4392 - name: title
4393 dtype: string
4394 - name: text
4395 dtype: string
4396 splits:
4397 - name: train
4398 num_bytes: 227272112
4399 num_examples: 123413
4400 download_size: 129739227
4401 dataset_size: 227272112
4402 - config_name: 20231101.mad
4403 features:
4404 - name: id
4405 dtype: string
4406 - name: url
4407 dtype: string
4408 - name: title
4409 dtype: string
4410 - name: text
4411 dtype: string
4412 splits:
4413 - name: train
4414 num_bytes: 1596836
4415 num_examples: 1192
4416 download_size: 908630
4417 dataset_size: 1596836
4418 - config_name: 20231101.mai
4419 features:
4420 - name: id
4421 dtype: string
4422 - name: url
4423 dtype: string
4424 - name: title
4425 dtype: string
4426 - name: text
4427 dtype: string
4428 splits:
4429 - name: train
4430 num_bytes: 21562856
4431 num_examples: 14714
4432 download_size: 6180231
4433 dataset_size: 21562856
4434 - config_name: 20231101.map-bms
4435 features:
4436 - name: id
4437 dtype: string
4438 - name: url
4439 dtype: string
4440 - name: title
4441 dtype: string
4442 - name: text
4443 dtype: string
4444 splits:
4445 - name: train
4446 num_bytes: 5341068
4447 num_examples: 13580
4448 download_size: 2377123
4449 dataset_size: 5341068
4450 - config_name: 20231101.mdf
4451 features:
4452 - name: id
4453 dtype: string
4454 - name: url
4455 dtype: string
4456 - name: title
4457 dtype: string
4458 - name: text
4459 dtype: string
4460 splits:
4461 - name: train
4462 num_bytes: 4694770
4463 num_examples: 4257
4464 download_size: 1725294
4465 dataset_size: 4694770
4466 - config_name: 20231101.mg
4467 features:
4468 - name: id
4469 dtype: string
4470 - name: url
4471 dtype: string
4472 - name: title
4473 dtype: string
4474 - name: text
4475 dtype: string
4476 splits:
4477 - name: train
4478 num_bytes: 73767229
4479 num_examples: 96316
4480 download_size: 22117304
4481 dataset_size: 73767229
4482 - config_name: 20231101.mhr
4483 features:
4484 - name: id
4485 dtype: string
4486 - name: url
4487 dtype: string
4488 - name: title
4489 dtype: string
4490 - name: text
4491 dtype: string
4492 splits:
4493 - name: train
4494 num_bytes: 19249450
4495 num_examples: 11347
4496 download_size: 6902162
4497 dataset_size: 19249450
4498 - config_name: 20231101.mi
4499 features:
4500 - name: id
4501 dtype: string
4502 - name: url
4503 dtype: string
4504 - name: title
4505 dtype: string
4506 - name: text
4507 dtype: string
4508 splits:
4509 - name: train
4510 num_bytes: 4169094
4511 num_examples: 7919
4512 download_size: 1044444
4513 dataset_size: 4169094
4514 - config_name: 20231101.min
4515 features:
4516 - name: id
4517 dtype: string
4518 - name: url
4519 dtype: string
4520 - name: title
4521 dtype: string
4522 - name: text
4523 dtype: string
4524 splits:
4525 - name: train
4526 num_bytes: 118995918
4527 num_examples: 227143
4528 download_size: 25691303
4529 dataset_size: 118995918
4530 - config_name: 20231101.mk
4531 features:
4532 - name: id
4533 dtype: string
4534 - name: url
4535 dtype: string
4536 - name: title
4537 dtype: string
4538 - name: text
4539 dtype: string
4540 splits:
4541 - name: train
4542 num_bytes: 651422351
4543 num_examples: 139559
4544 download_size: 271265486
4545 dataset_size: 651422351
4546 - config_name: 20231101.ml
4547 features:
4548 - name: id
4549 dtype: string
4550 - name: url
4551 dtype: string
4552 - name: title
4553 dtype: string
4554 - name: text
4555 dtype: string
4556 splits:
4557 - name: train
4558 num_bytes: 494135127
4559 num_examples: 85791
4560 download_size: 183071274
4561 dataset_size: 494135127
4562 - config_name: 20231101.mn
4563 features:
4564 - name: id
4565 dtype: string
4566 - name: url
4567 dtype: string
4568 - name: title
4569 dtype: string
4570 - name: text
4571 dtype: string
4572 splits:
4573 - name: train
4574 num_bytes: 91943210
4575 num_examples: 24048
4576 download_size: 41521786
4577 dataset_size: 91943210
4578 - config_name: 20231101.mni
4579 features:
4580 - name: id
4581 dtype: string
4582 - name: url
4583 dtype: string
4584 - name: title
4585 dtype: string
4586 - name: text
4587 dtype: string
4588 splits:
4589 - name: train
4590 num_bytes: 9820483
4591 num_examples: 10894
4592 download_size: 2208525
4593 dataset_size: 9820483
4594 - config_name: 20231101.mnw
4595 features:
4596 - name: id
4597 dtype: string
4598 - name: url
4599 dtype: string
4600 - name: title
4601 dtype: string
4602 - name: text
4603 dtype: string
4604 splits:
4605 - name: train
4606 num_bytes: 47237206
4607 num_examples: 3295
4608 download_size: 13765461
4609 dataset_size: 47237206
4610 - config_name: 20231101.mr
4611 features:
4612 - name: id
4613 dtype: string
4614 - name: url
4615 dtype: string
4616 - name: title
4617 dtype: string
4618 - name: text
4619 dtype: string
4620 splits:
4621 - name: train
4622 num_bytes: 261879018
4623 num_examples: 94133
4624 download_size: 81991233
4625 dataset_size: 261879018
4626 - config_name: 20231101.mrj
4627 features:
4628 - name: id
4629 dtype: string
4630 - name: url
4631 dtype: string
4632 - name: title
4633 dtype: string
4634 - name: text
4635 dtype: string
4636 splits:
4637 - name: train
4638 num_bytes: 8732281
4639 num_examples: 10542
4640 download_size: 3283618
4641 dataset_size: 8732281
4642 - config_name: 20231101.ms
4643 features:
4644 - name: id
4645 dtype: string
4646 - name: url
4647 dtype: string
4648 - name: title
4649 dtype: string
4650 - name: text
4651 dtype: string
4652 splits:
4653 - name: train
4654 num_bytes: 423352360
4655 num_examples: 368628
4656 download_size: 210149264
4657 dataset_size: 423352360
4658 - config_name: 20231101.mt
4659 features:
4660 - name: id
4661 dtype: string
4662 - name: url
4663 dtype: string
4664 - name: title
4665 dtype: string
4666 - name: text
4667 dtype: string
4668 splits:
4669 - name: train
4670 num_bytes: 32009639
4671 num_examples: 5743
4672 download_size: 18686521
4673 dataset_size: 32009639
4674 - config_name: 20231101.mwl
4675 features:
4676 - name: id
4677 dtype: string
4678 - name: url
4679 dtype: string
4680 - name: title
4681 dtype: string
4682 - name: text
4683 dtype: string
4684 splits:
4685 - name: train
4686 num_bytes: 19353725
4687 num_examples: 4500
4688 download_size: 11521563
4689 dataset_size: 19353725
4690 - config_name: 20231101.my
4691 features:
4692 - name: id
4693 dtype: string
4694 - name: url
4695 dtype: string
4696 - name: title
4697 dtype: string
4698 - name: text
4699 dtype: string
4700 splits:
4701 - name: train
4702 num_bytes: 314417700
4703 num_examples: 109310
4704 download_size: 85497205
4705 dataset_size: 314417700
4706 - config_name: 20231101.myv
4707 features:
4708 - name: id
4709 dtype: string
4710 - name: url
4711 dtype: string
4712 - name: title
4713 dtype: string
4714 - name: text
4715 dtype: string
4716 splits:
4717 - name: train
4718 num_bytes: 11145865
4719 num_examples: 7958
4720 download_size: 4600620
4721 dataset_size: 11145865
4722 - config_name: 20231101.mzn
4723 features:
4724 - name: id
4725 dtype: string
4726 - name: url
4727 dtype: string
4728 - name: title
4729 dtype: string
4730 - name: text
4731 dtype: string
4732 splits:
4733 - name: train
4734 num_bytes: 16335757
4735 num_examples: 18717
4736 download_size: 5419390
4737 dataset_size: 16335757
4738 - config_name: 20231101.nah
4739 features:
4740 - name: id
4741 dtype: string
4742 - name: url
4743 dtype: string
4744 - name: title
4745 dtype: string
4746 - name: text
4747 dtype: string
4748 splits:
4749 - name: train
4750 num_bytes: 2503320
4751 num_examples: 6218
4752 download_size: 1191779
4753 dataset_size: 2503320
4754 - config_name: 20231101.nap
4755 features:
4756 - name: id
4757 dtype: string
4758 - name: url
4759 dtype: string
4760 - name: title
4761 dtype: string
4762 - name: text
4763 dtype: string
4764 splits:
4765 - name: train
4766 num_bytes: 6395706
4767 num_examples: 14884
4768 download_size: 3188122
4769 dataset_size: 6395706
4770 - config_name: 20231101.nds
4771 features:
4772 - name: id
4773 dtype: string
4774 - name: url
4775 dtype: string
4776 - name: title
4777 dtype: string
4778 - name: text
4779 dtype: string
4780 splits:
4781 - name: train
4782 num_bytes: 92990126
4783 num_examples: 84285
4784 download_size: 48106879
4785 dataset_size: 92990126
4786 - config_name: 20231101.nds-nl
4787 features:
4788 - name: id
4789 dtype: string
4790 - name: url
4791 dtype: string
4792 - name: title
4793 dtype: string
4794 - name: text
4795 dtype: string
4796 splits:
4797 - name: train
4798 num_bytes: 13582403
4799 num_examples: 7847
4800 download_size: 8354427
4801 dataset_size: 13582403
4802 - config_name: 20231101.ne
4803 features:
4804 - name: id
4805 dtype: string
4806 - name: url
4807 dtype: string
4808 - name: title
4809 dtype: string
4810 - name: text
4811 dtype: string
4812 splits:
4813 - name: train
4814 num_bytes: 109032486
4815 num_examples: 32885
4816 download_size: 37548833
4817 dataset_size: 109032486
4818 - config_name: 20231101.new
4819 features:
4820 - name: id
4821 dtype: string
4822 - name: url
4823 dtype: string
4824 - name: title
4825 dtype: string
4826 - name: text
4827 dtype: string
4828 splits:
4829 - name: train
4830 num_bytes: 159095610
4831 num_examples: 73003
4832 download_size: 20517810
4833 dataset_size: 159095610
4834 - config_name: 20231101.nia
4835 features:
4836 - name: id
4837 dtype: string
4838 - name: url
4839 dtype: string
4840 - name: title
4841 dtype: string
4842 - name: text
4843 dtype: string
4844 splits:
4845 - name: train
4846 num_bytes: 2117902
4847 num_examples: 1714
4848 download_size: 1086670
4849 dataset_size: 2117902
4850 - config_name: 20231101.nl
4851 features:
4852 - name: id
4853 dtype: string
4854 - name: url
4855 dtype: string
4856 - name: title
4857 dtype: string
4858 - name: text
4859 dtype: string
4860 splits:
4861 - name: train
4862 num_bytes: 2646316266
4863 num_examples: 2135977
4864 download_size: 1436843432
4865 dataset_size: 2646316266
4866 - config_name: 20231101.nn
4867 features:
4868 - name: id
4869 dtype: string
4870 - name: url
4871 dtype: string
4872 - name: title
4873 dtype: string
4874 - name: text
4875 dtype: string
4876 splits:
4877 - name: train
4878 num_bytes: 237467406
4879 num_examples: 167653
4880 download_size: 134751873
4881 dataset_size: 237467406
4882 - config_name: 20231101.no
4883 features:
4884 - name: id
4885 dtype: string
4886 - name: url
4887 dtype: string
4888 - name: title
4889 dtype: string
4890 - name: text
4891 dtype: string
4892 splits:
4893 - name: train
4894 num_bytes: 1033188011
4895 num_examples: 617937
4896 download_size: 590970350
4897 dataset_size: 1033188011
4898 - config_name: 20231101.nov
4899 features:
4900 - name: id
4901 dtype: string
4902 - name: url
4903 dtype: string
4904 - name: title
4905 dtype: string
4906 - name: text
4907 dtype: string
4908 splits:
4909 - name: train
4910 num_bytes: 965640
4911 num_examples: 1693
4912 download_size: 493500
4913 dataset_size: 965640
4914 - config_name: 20231101.nqo
4915 features:
4916 - name: id
4917 dtype: string
4918 - name: url
4919 dtype: string
4920 - name: title
4921 dtype: string
4922 - name: text
4923 dtype: string
4924 splits:
4925 - name: train
4926 num_bytes: 8261058
4927 num_examples: 1580
4928 download_size: 3508645
4929 dataset_size: 8261058
4930 - config_name: 20231101.nrm
4931 features:
4932 - name: id
4933 dtype: string
4934 - name: url
4935 dtype: string
4936 - name: title
4937 dtype: string
4938 - name: text
4939 dtype: string
4940 splits:
4941 - name: train
4942 num_bytes: 3216817
4943 num_examples: 4902
4944 download_size: 1507257
4945 dataset_size: 3216817
4946 - config_name: 20231101.nso
4947 features:
4948 - name: id
4949 dtype: string
4950 - name: url
4951 dtype: string
4952 - name: title
4953 dtype: string
4954 - name: text
4955 dtype: string
4956 splits:
4957 - name: train
4958 num_bytes: 2796467
4959 num_examples: 8650
4960 download_size: 936349
4961 dataset_size: 2796467
4962 - config_name: 20231101.nv
4963 features:
4964 - name: id
4965 dtype: string
4966 - name: url
4967 dtype: string
4968 - name: title
4969 dtype: string
4970 - name: text
4971 dtype: string
4972 splits:
4973 - name: train
4974 num_bytes: 16993060
4975 num_examples: 22460
4976 download_size: 3304031
4977 dataset_size: 16993060
4978 - config_name: 20231101.ny
4979 features:
4980 - name: id
4981 dtype: string
4982 - name: url
4983 dtype: string
4984 - name: title
4985 dtype: string
4986 - name: text
4987 dtype: string
4988 splits:
4989 - name: train
4990 num_bytes: 1691825
4991 num_examples: 1129
4992 download_size: 938621
4993 dataset_size: 1691825
4994 - config_name: 20231101.oc
4995 features:
4996 - name: id
4997 dtype: string
4998 - name: url
4999 dtype: string
5000 - name: title
5001 dtype: string
5002 - name: text
5003 dtype: string
5004 splits:
5005 - name: train
5006 num_bytes: 120092607
5007 num_examples: 89101
5008 download_size: 64043588
5009 dataset_size: 120092607
5010 - config_name: 20231101.olo
5011 features:
5012 - name: id
5013 dtype: string
5014 - name: url
5015 dtype: string
5016 - name: title
5017 dtype: string
5018 - name: text
5019 dtype: string
5020 splits:
5021 - name: train
5022 num_bytes: 3173332
5023 num_examples: 4640
5024 download_size: 1724315
5025 dataset_size: 3173332
5026 - config_name: 20231101.om
5027 features:
5028 - name: id
5029 dtype: string
5030 - name: url
5031 dtype: string
5032 - name: title
5033 dtype: string
5034 - name: text
5035 dtype: string
5036 splits:
5037 - name: train
5038 num_bytes: 3604768
5039 num_examples: 1970
5040 download_size: 1982849
5041 dataset_size: 3604768
5042 - config_name: 20231101.or
5043 features:
5044 - name: id
5045 dtype: string
5046 - name: url
5047 dtype: string
5048 - name: title
5049 dtype: string
5050 - name: text
5051 dtype: string
5052 splits:
5053 - name: train
5054 num_bytes: 75078226
5055 num_examples: 17375
5056 download_size: 26706212
5057 dataset_size: 75078226
5058 - config_name: 20231101.os
5059 features:
5060 - name: id
5061 dtype: string
5062 - name: url
5063 dtype: string
5064 - name: title
5065 dtype: string
5066 - name: text
5067 dtype: string
5068 splits:
5069 - name: train
5070 num_bytes: 13182881
5071 num_examples: 17663
5072 download_size: 5572799
5073 dataset_size: 13182881
5074 - config_name: 20231101.pa
5075 features:
5076 - name: id
5077 dtype: string
5078 - name: url
5079 dtype: string
5080 - name: title
5081 dtype: string
5082 - name: text
5083 dtype: string
5084 splits:
5085 - name: train
5086 num_bytes: 212972877
5087 num_examples: 51423
5088 download_size: 81452929
5089 dataset_size: 212972877
5090 - config_name: 20231101.pag
5091 features:
5092 - name: id
5093 dtype: string
5094 - name: url
5095 dtype: string
5096 - name: title
5097 dtype: string
5098 - name: text
5099 dtype: string
5100 splits:
5101 - name: train
5102 num_bytes: 1391816
5103 num_examples: 2665
5104 download_size: 455808
5105 dataset_size: 1391816
5106 - config_name: 20231101.pam
5107 features:
5108 - name: id
5109 dtype: string
5110 - name: url
5111 dtype: string
5112 - name: title
5113 dtype: string
5114 - name: text
5115 dtype: string
5116 splits:
5117 - name: train
5118 num_bytes: 8294902
5119 num_examples: 9006
5120 download_size: 4277038
5121 dataset_size: 8294902
5122 - config_name: 20231101.pap
5123 features:
5124 - name: id
5125 dtype: string
5126 - name: url
5127 dtype: string
5128 - name: title
5129 dtype: string
5130 - name: text
5131 dtype: string
5132 splits:
5133 - name: train
5134 num_bytes: 4251480
5135 num_examples: 3520
5136 download_size: 2435005
5137 dataset_size: 4251480
5138 - config_name: 20231101.pcd
5139 features:
5140 - name: id
5141 dtype: string
5142 - name: url
5143 dtype: string
5144 - name: title
5145 dtype: string
5146 - name: text
5147 dtype: string
5148 splits:
5149 - name: train
5150 num_bytes: 5704321
5151 num_examples: 5717
5152 download_size: 3145572
5153 dataset_size: 5704321
5154 - config_name: 20231101.pcm
5155 features:
5156 - name: id
5157 dtype: string
5158 - name: url
5159 dtype: string
5160 - name: title
5161 dtype: string
5162 - name: text
5163 dtype: string
5164 splits:
5165 - name: train
5166 num_bytes: 1886987
5167 num_examples: 1238
5168 download_size: 1160762
5169 dataset_size: 1886987
5170 - config_name: 20231101.pdc
5171 features:
5172 - name: id
5173 dtype: string
5174 - name: url
5175 dtype: string
5176 - name: title
5177 dtype: string
5178 - name: text
5179 dtype: string
5180 splits:
5181 - name: train
5182 num_bytes: 1225978
5183 num_examples: 2176
5184 download_size: 698254
5185 dataset_size: 1225978
5186 - config_name: 20231101.pfl
5187 features:
5188 - name: id
5189 dtype: string
5190 - name: url
5191 dtype: string
5192 - name: title
5193 dtype: string
5194 - name: text
5195 dtype: string
5196 splits:
5197 - name: train
5198 num_bytes: 3694464
5199 num_examples: 2762
5200 download_size: 1971214
5201 dataset_size: 3694464
5202 - config_name: 20231101.pi
5203 features:
5204 - name: id
5205 dtype: string
5206 - name: url
5207 dtype: string
5208 - name: title
5209 dtype: string
5210 - name: text
5211 dtype: string
5212 splits:
5213 - name: train
5214 num_bytes: 1144100
5215 num_examples: 3057
5216 download_size: 200764
5217 dataset_size: 1144100
5218 - config_name: 20231101.pih
5219 features:
5220 - name: id
5221 dtype: string
5222 - name: url
5223 dtype: string
5224 - name: title
5225 dtype: string
5226 - name: text
5227 dtype: string
5228 splits:
5229 - name: train
5230 num_bytes: 278139
5231 num_examples: 934
5232 download_size: 177092
5233 dataset_size: 278139
5234 - config_name: 20231101.pl
5235 features:
5236 - name: id
5237 dtype: string
5238 - name: url
5239 dtype: string
5240 - name: title
5241 dtype: string
5242 - name: text
5243 dtype: string
5244 splits:
5245 - name: train
5246 num_bytes: 2950148809
5247 num_examples: 1587721
5248 download_size: 1765059986
5249 dataset_size: 2950148809
5250 - config_name: 20231101.pms
5251 features:
5252 - name: id
5253 dtype: string
5254 - name: url
5255 dtype: string
5256 - name: title
5257 dtype: string
5258 - name: text
5259 dtype: string
5260 splits:
5261 - name: train
5262 num_bytes: 34340217
5263 num_examples: 67980
5264 download_size: 12008880
5265 dataset_size: 34340217
5266 - config_name: 20231101.pnb
5267 features:
5268 - name: id
5269 dtype: string
5270 - name: url
5271 dtype: string
5272 - name: title
5273 dtype: string
5274 - name: text
5275 dtype: string
5276 splits:
5277 - name: train
5278 num_bytes: 304117649
5279 num_examples: 72307
5280 download_size: 133266242
5281 dataset_size: 304117649
5282 - config_name: 20231101.pnt
5283 features:
5284 - name: id
5285 dtype: string
5286 - name: url
5287 dtype: string
5288 - name: title
5289 dtype: string
5290 - name: text
5291 dtype: string
5292 splits:
5293 - name: train
5294 num_bytes: 630636
5295 num_examples: 533
5296 download_size: 275639
5297 dataset_size: 630636
5298 - config_name: 20231101.ps
5299 features:
5300 - name: id
5301 dtype: string
5302 - name: url
5303 dtype: string
5304 - name: title
5305 dtype: string
5306 - name: text
5307 dtype: string
5308 splits:
5309 - name: train
5310 num_bytes: 114259737
5311 num_examples: 20529
5312 download_size: 53312545
5313 dataset_size: 114259737
5314 - config_name: 20231101.pt
5315 features:
5316 - name: id
5317 dtype: string
5318 - name: url
5319 dtype: string
5320 - name: title
5321 dtype: string
5322 - name: text
5323 dtype: string
5324 splits:
5325 - name: train
5326 num_bytes: 2758783436
5327 num_examples: 1112246
5328 download_size: 1579641059
5329 dataset_size: 2758783436
5330 - config_name: 20231101.pwn
5331 features:
5332 - name: id
5333 dtype: string
5334 - name: url
5335 dtype: string
5336 - name: title
5337 dtype: string
5338 - name: text
5339 dtype: string
5340 splits:
5341 - name: train
5342 num_bytes: 811954
5343 num_examples: 408
5344 download_size: 444109
5345 dataset_size: 811954
5346 - config_name: 20231101.qu
5347 features:
5348 - name: id
5349 dtype: string
5350 - name: url
5351 dtype: string
5352 - name: title
5353 dtype: string
5354 - name: text
5355 dtype: string
5356 splits:
5357 - name: train
5358 num_bytes: 16828457
5359 num_examples: 24196
5360 download_size: 7688106
5361 dataset_size: 16828457
5362 - config_name: 20231101.rm
5363 features:
5364 - name: id
5365 dtype: string
5366 - name: url
5367 dtype: string
5368 - name: title
5369 dtype: string
5370 - name: text
5371 dtype: string
5372 splits:
5373 - name: train
5374 num_bytes: 18053014
5375 num_examples: 3822
5376 download_size: 10483970
5377 dataset_size: 18053014
5378 - config_name: 20231101.rmy
5379 features:
5380 - name: id
5381 dtype: string
5382 - name: url
5383 dtype: string
5384 - name: title
5385 dtype: string
5386 - name: text
5387 dtype: string
5388 splits:
5389 - name: train
5390 num_bytes: 611778
5391 num_examples: 1279
5392 download_size: 356457
5393 dataset_size: 611778
5394 - config_name: 20231101.rn
5395 features:
5396 - name: id
5397 dtype: string
5398 - name: url
5399 dtype: string
5400 - name: title
5401 dtype: string
5402 - name: text
5403 dtype: string
5404 splits:
5405 - name: train
5406 num_bytes: 530318
5407 num_examples: 819
5408 download_size: 301252
5409 dataset_size: 530318
5410 - config_name: 20231101.ro
5411 features:
5412 - name: id
5413 dtype: string
5414 - name: url
5415 dtype: string
5416 - name: title
5417 dtype: string
5418 - name: text
5419 dtype: string
5420 splits:
5421 - name: train
5422 num_bytes: 847410736
5423 num_examples: 442389
5424 download_size: 466937380
5425 dataset_size: 847410736
5426 - config_name: 20231101.roa-rup
5427 features:
5428 - name: id
5429 dtype: string
5430 - name: url
5431 dtype: string
5432 - name: title
5433 dtype: string
5434 - name: text
5435 dtype: string
5436 splits:
5437 - name: train
5438 num_bytes: 1687829
5439 num_examples: 1432
5440 download_size: 951677
5441 dataset_size: 1687829
5442 - config_name: 20231101.roa-tara
5443 features:
5444 - name: id
5445 dtype: string
5446 - name: url
5447 dtype: string
5448 - name: title
5449 dtype: string
5450 - name: text
5451 dtype: string
5452 splits:
5453 - name: train
5454 num_bytes: 7470331
5455 num_examples: 9367
5456 download_size: 4003095
5457 dataset_size: 7470331
5458 - config_name: 20231101.ru
5459 features:
5460 - name: id
5461 dtype: string
5462 - name: url
5463 dtype: string
5464 - name: title
5465 dtype: string
5466 - name: text
5467 dtype: string
5468 splits:
5469 - name: train
5470 num_bytes: 10277958919
5471 num_examples: 1945063
5472 download_size: 4876849588
5473 dataset_size: 10277958919
5474 - config_name: 20231101.rue
5475 features:
5476 - name: id
5477 dtype: string
5478 - name: url
5479 dtype: string
5480 - name: title
5481 dtype: string
5482 - name: text
5483 dtype: string
5484 splits:
5485 - name: train
5486 num_bytes: 13128572
5487 num_examples: 8759
5488 download_size: 6346106
5489 dataset_size: 13128572
5490 - config_name: 20231101.rw
5491 features:
5492 - name: id
5493 dtype: string
5494 - name: url
5495 dtype: string
5496 - name: title
5497 dtype: string
5498 - name: text
5499 dtype: string
5500 splits:
5501 - name: train
5502 num_bytes: 11898854
5503 num_examples: 8063
5504 download_size: 6623388
5505 dataset_size: 11898854
5506 - config_name: 20231101.sa
5507 features:
5508 - name: id
5509 dtype: string
5510 - name: url
5511 dtype: string
5512 - name: title
5513 dtype: string
5514 - name: text
5515 dtype: string
5516 splits:
5517 - name: train
5518 num_bytes: 69854997
5519 num_examples: 12156
5520 download_size: 23850161
5521 dataset_size: 69854997
5522 - config_name: 20231101.sah
5523 features:
5524 - name: id
5525 dtype: string
5526 - name: url
5527 dtype: string
5528 - name: title
5529 dtype: string
5530 - name: text
5531 dtype: string
5532 splits:
5533 - name: train
5534 num_bytes: 48562374
5535 num_examples: 17098
5536 download_size: 21675888
5537 dataset_size: 48562374
5538 - config_name: 20231101.sat
5539 features:
5540 - name: id
5541 dtype: string
5542 - name: url
5543 dtype: string
5544 - name: title
5545 dtype: string
5546 - name: text
5547 dtype: string
5548 splits:
5549 - name: train
5550 num_bytes: 45247783
5551 num_examples: 9767
5552 download_size: 15428584
5553 dataset_size: 45247783
5554 - config_name: 20231101.sc
5555 features:
5556 - name: id
5557 dtype: string
5558 - name: url
5559 dtype: string
5560 - name: title
5561 dtype: string
5562 - name: text
5563 dtype: string
5564 splits:
5565 - name: train
5566 num_bytes: 12776438
5567 num_examples: 7586
5568 download_size: 7711996
5569 dataset_size: 12776438
5570 - config_name: 20231101.scn
5571 features:
5572 - name: id
5573 dtype: string
5574 - name: url
5575 dtype: string
5576 - name: title
5577 dtype: string
5578 - name: text
5579 dtype: string
5580 splits:
5581 - name: train
5582 num_bytes: 17685098
5583 num_examples: 26530
5584 download_size: 10223816
5585 dataset_size: 17685098
5586 - config_name: 20231101.sco
5587 features:
5588 - name: id
5589 dtype: string
5590 - name: url
5591 dtype: string
5592 - name: title
5593 dtype: string
5594 - name: text
5595 dtype: string
5596 splits:
5597 - name: train
5598 num_bytes: 42808738
5599 num_examples: 35276
5600 download_size: 24287944
5601 dataset_size: 42808738
5602 - config_name: 20231101.sd
5603 features:
5604 - name: id
5605 dtype: string
5606 - name: url
5607 dtype: string
5608 - name: title
5609 dtype: string
5610 - name: text
5611 dtype: string
5612 splits:
5613 - name: train
5614 num_bytes: 37021659
5615 num_examples: 16928
5616 download_size: 17591997
5617 dataset_size: 37021659
5618 - config_name: 20231101.se
5619 features:
5620 - name: id
5621 dtype: string
5622 - name: url
5623 dtype: string
5624 - name: title
5625 dtype: string
5626 - name: text
5627 dtype: string
5628 splits:
5629 - name: train
5630 num_bytes: 3600527
5631 num_examples: 8043
5632 download_size: 1816006
5633 dataset_size: 3600527
5634 - config_name: 20231101.sg
5635 features:
5636 - name: id
5637 dtype: string
5638 - name: url
5639 dtype: string
5640 - name: title
5641 dtype: string
5642 - name: text
5643 dtype: string
5644 splits:
5645 - name: train
5646 num_bytes: 140127
5647 num_examples: 564
5648 download_size: 72486
5649 dataset_size: 140127
5650 - config_name: 20231101.sh
5651 features:
5652 - name: id
5653 dtype: string
5654 - name: url
5655 dtype: string
5656 - name: title
5657 dtype: string
5658 - name: text
5659 dtype: string
5660 splits:
5661 - name: train
5662 num_bytes: 569225870
5663 num_examples: 458392
5664 download_size: 266379293
5665 dataset_size: 569225870
5666 - config_name: 20231101.shi
5667 features:
5668 - name: id
5669 dtype: string
5670 - name: url
5671 dtype: string
5672 - name: title
5673 dtype: string
5674 - name: text
5675 dtype: string
5676 splits:
5677 - name: train
5678 num_bytes: 2369002
5679 num_examples: 1779
5680 download_size: 1359828
5681 dataset_size: 2369002
5682 - config_name: 20231101.shn
5683 features:
5684 - name: id
5685 dtype: string
5686 - name: url
5687 dtype: string
5688 - name: title
5689 dtype: string
5690 - name: text
5691 dtype: string
5692 splits:
5693 - name: train
5694 num_bytes: 33553593
5695 num_examples: 13945
5696 download_size: 8163231
5697 dataset_size: 33553593
5698 - config_name: 20231101.si
5699 features:
5700 - name: id
5701 dtype: string
5702 - name: url
5703 dtype: string
5704 - name: title
5705 dtype: string
5706 - name: text
5707 dtype: string
5708 splits:
5709 - name: train
5710 num_bytes: 138806443
5711 num_examples: 23065
5712 download_size: 54229127
5713 dataset_size: 138806443
5714 - config_name: 20231101.simple
5715 features:
5716 - name: id
5717 dtype: string
5718 - name: url
5719 dtype: string
5720 - name: title
5721 dtype: string
5722 - name: text
5723 dtype: string
5724 splits:
5725 - name: train
5726 num_bytes: 291254232
5727 num_examples: 241787
5728 download_size: 156885218
5729 dataset_size: 291254232
5730 - config_name: 20231101.sk
5731 features:
5732 - name: id
5733 dtype: string
5734 - name: url
5735 dtype: string
5736 - name: title
5737 dtype: string
5738 - name: text
5739 dtype: string
5740 splits:
5741 - name: train
5742 num_bytes: 416804817
5743 num_examples: 242235
5744 download_size: 239513292
5745 dataset_size: 416804817
5746 - config_name: 20231101.skr
5747 features:
5748 - name: id
5749 dtype: string
5750 - name: url
5751 dtype: string
5752 - name: title
5753 dtype: string
5754 - name: text
5755 dtype: string
5756 splits:
5757 - name: train
5758 num_bytes: 22705446
5759 num_examples: 5819
5760 download_size: 9978607
5761 dataset_size: 22705446
5762 - config_name: 20231101.sl
5763 features:
5764 - name: id
5765 dtype: string
5766 - name: url
5767 dtype: string
5768 - name: title
5769 dtype: string
5770 - name: text
5771 dtype: string
5772 splits:
5773 - name: train
5774 num_bytes: 454829910
5775 num_examples: 183006
5776 download_size: 267485569
5777 dataset_size: 454829910
5778 - config_name: 20231101.sm
5779 features:
5780 - name: id
5781 dtype: string
5782 - name: url
5783 dtype: string
5784 - name: title
5785 dtype: string
5786 - name: text
5787 dtype: string
5788 splits:
5789 - name: train
5790 num_bytes: 902927
5791 num_examples: 1151
5792 download_size: 492349
5793 dataset_size: 902927
5794 - config_name: 20231101.smn
5795 features:
5796 - name: id
5797 dtype: string
5798 - name: url
5799 dtype: string
5800 - name: title
5801 dtype: string
5802 - name: text
5803 dtype: string
5804 splits:
5805 - name: train
5806 num_bytes: 5764244
5807 num_examples: 5383
5808 download_size: 2813872
5809 dataset_size: 5764244
5810 - config_name: 20231101.sn
5811 features:
5812 - name: id
5813 dtype: string
5814 - name: url
5815 dtype: string
5816 - name: title
5817 dtype: string
5818 - name: text
5819 dtype: string
5820 splits:
5821 - name: train
5822 num_bytes: 9790528
5823 num_examples: 11621
5824 download_size: 4979456
5825 dataset_size: 9790528
5826 - config_name: 20231101.so
5827 features:
5828 - name: id
5829 dtype: string
5830 - name: url
5831 dtype: string
5832 - name: title
5833 dtype: string
5834 - name: text
5835 dtype: string
5836 splits:
5837 - name: train
5838 num_bytes: 13663784
5839 num_examples: 9021
5840 download_size: 7940363
5841 dataset_size: 13663784
5842 - config_name: 20231101.sq
5843 features:
5844 - name: id
5845 dtype: string
5846 - name: url
5847 dtype: string
5848 - name: title
5849 dtype: string
5850 - name: text
5851 dtype: string
5852 splits:
5853 - name: train
5854 num_bytes: 208779652
5855 num_examples: 104854
5856 download_size: 116945494
5857 dataset_size: 208779652
5858 - config_name: 20231101.sr
5859 features:
5860 - name: id
5861 dtype: string
5862 - name: url
5863 dtype: string
5864 - name: title
5865 dtype: string
5866 - name: text
5867 dtype: string
5868 splits:
5869 - name: train
5870 num_bytes: 1721596392
5871 num_examples: 676605
5872 download_size: 697391786
5873 dataset_size: 1721596392
5874 - config_name: 20231101.srn
5875 features:
5876 - name: id
5877 dtype: string
5878 - name: url
5879 dtype: string
5880 - name: title
5881 dtype: string
5882 - name: text
5883 dtype: string
5884 splits:
5885 - name: train
5886 num_bytes: 649317
5887 num_examples: 1219
5888 download_size: 215103
5889 dataset_size: 649317
5890 - config_name: 20231101.ss
5891 features:
5892 - name: id
5893 dtype: string
5894 - name: url
5895 dtype: string
5896 - name: title
5897 dtype: string
5898 - name: text
5899 dtype: string
5900 splits:
5901 - name: train
5902 num_bytes: 1076102
5903 num_examples: 945
5904 download_size: 600997
5905 dataset_size: 1076102
5906 - config_name: 20231101.st
5907 features:
5908 - name: id
5909 dtype: string
5910 - name: url
5911 dtype: string
5912 - name: title
5913 dtype: string
5914 - name: text
5915 dtype: string
5916 splits:
5917 - name: train
5918 num_bytes: 968161
5919 num_examples: 1099
5920 download_size: 530165
5921 dataset_size: 968161
5922 - config_name: 20231101.stq
5923 features:
5924 - name: id
5925 dtype: string
5926 - name: url
5927 dtype: string
5928 - name: title
5929 dtype: string
5930 - name: text
5931 dtype: string
5932 splits:
5933 - name: train
5934 num_bytes: 4942784
5935 num_examples: 4134
5936 download_size: 2884429
5937 dataset_size: 4942784
5938 - config_name: 20231101.su
5939 features:
5940 - name: id
5941 dtype: string
5942 - name: url
5943 dtype: string
5944 - name: title
5945 dtype: string
5946 - name: text
5947 dtype: string
5948 splits:
5949 - name: train
5950 num_bytes: 48066965
5951 num_examples: 61555
5952 download_size: 19806020
5953 dataset_size: 48066965
5954 - config_name: 20231101.sv
5955 features:
5956 - name: id
5957 dtype: string
5958 - name: url
5959 dtype: string
5960 - name: title
5961 dtype: string
5962 - name: text
5963 dtype: string
5964 splits:
5965 - name: train
5966 num_bytes: 2153690744
5967 num_examples: 2574513
5968 download_size: 974261228
5969 dataset_size: 2153690744
5970 - config_name: 20231101.sw
5971 features:
5972 - name: id
5973 dtype: string
5974 - name: url
5975 dtype: string
5976 - name: title
5977 dtype: string
5978 - name: text
5979 dtype: string
5980 splits:
5981 - name: train
5982 num_bytes: 73119299
5983 num_examples: 78587
5984 download_size: 35936177
5985 dataset_size: 73119299
5986 - config_name: 20231101.szl
5987 features:
5988 - name: id
5989 dtype: string
5990 - name: url
5991 dtype: string
5992 - name: title
5993 dtype: string
5994 - name: text
5995 dtype: string
5996 splits:
5997 - name: train
5998 num_bytes: 21439309
5999 num_examples: 57035
6000 download_size: 7347967
6001 dataset_size: 21439309
6002 - config_name: 20231101.szy
6003 features:
6004 - name: id
6005 dtype: string
6006 - name: url
6007 dtype: string
6008 - name: title
6009 dtype: string
6010 - name: text
6011 dtype: string
6012 splits:
6013 - name: train
6014 num_bytes: 11355780
6015 num_examples: 4885
6016 download_size: 6192815
6017 dataset_size: 11355780
6018 - config_name: 20231101.ta
6019 features:
6020 - name: id
6021 dtype: string
6022 - name: url
6023 dtype: string
6024 - name: title
6025 dtype: string
6026 - name: text
6027 dtype: string
6028 splits:
6029 - name: train
6030 num_bytes: 810734099
6031 num_examples: 160651
6032 download_size: 265652020
6033 dataset_size: 810734099
6034 - config_name: 20231101.tay
6035 features:
6036 - name: id
6037 dtype: string
6038 - name: url
6039 dtype: string
6040 - name: title
6041 dtype: string
6042 - name: text
6043 dtype: string
6044 splits:
6045 - name: train
6046 num_bytes: 2974229
6047 num_examples: 2747
6048 download_size: 1232811
6049 dataset_size: 2974229
6050 - config_name: 20231101.tcy
6051 features:
6052 - name: id
6053 dtype: string
6054 - name: url
6055 dtype: string
6056 - name: title
6057 dtype: string
6058 - name: text
6059 dtype: string
6060 splits:
6061 - name: train
6062 num_bytes: 12166612
6063 num_examples: 2202
6064 download_size: 4611006
6065 dataset_size: 12166612
6066 - config_name: 20231101.te
6067 features:
6068 - name: id
6069 dtype: string
6070 - name: url
6071 dtype: string
6072 - name: title
6073 dtype: string
6074 - name: text
6075 dtype: string
6076 splits:
6077 - name: train
6078 num_bytes: 730376585
6079 num_examples: 87854
6080 download_size: 215097076
6081 dataset_size: 730376585
6082 - config_name: 20231101.tet
6083 features:
6084 - name: id
6085 dtype: string
6086 - name: url
6087 dtype: string
6088 - name: title
6089 dtype: string
6090 - name: text
6091 dtype: string
6092 splits:
6093 - name: train
6094 num_bytes: 1466200
6095 num_examples: 1468
6096 download_size: 744390
6097 dataset_size: 1466200
6098 - config_name: 20231101.tg
6099 features:
6100 - name: id
6101 dtype: string
6102 - name: url
6103 dtype: string
6104 - name: title
6105 dtype: string
6106 - name: text
6107 dtype: string
6108 splits:
6109 - name: train
6110 num_bytes: 148256281
6111 num_examples: 110962
6112 download_size: 49825647
6113 dataset_size: 148256281
6114 - config_name: 20231101.th
6115 features:
6116 - name: id
6117 dtype: string
6118 - name: url
6119 dtype: string
6120 - name: title
6121 dtype: string
6122 - name: text
6123 dtype: string
6124 splits:
6125 - name: train
6126 num_bytes: 1014547923
6127 num_examples: 159719
6128 download_size: 371916105
6129 dataset_size: 1014547923
6130 - config_name: 20231101.ti
6131 features:
6132 - name: id
6133 dtype: string
6134 - name: url
6135 dtype: string
6136 - name: title
6137 dtype: string
6138 - name: text
6139 dtype: string
6140 splits:
6141 - name: train
6142 num_bytes: 729995
6143 num_examples: 435
6144 download_size: 363723
6145 dataset_size: 729995
6146 - config_name: 20231101.tk
6147 features:
6148 - name: id
6149 dtype: string
6150 - name: url
6151 dtype: string
6152 - name: title
6153 dtype: string
6154 - name: text
6155 dtype: string
6156 splits:
6157 - name: train
6158 num_bytes: 13326412
6159 num_examples: 7918
6160 download_size: 7383654
6161 dataset_size: 13326412
6162 - config_name: 20231101.tl
6163 features:
6164 - name: id
6165 dtype: string
6166 - name: url
6167 dtype: string
6168 - name: title
6169 dtype: string
6170 - name: text
6171 dtype: string
6172 splits:
6173 - name: train
6174 num_bytes: 85794472
6175 num_examples: 45341
6176 download_size: 45797527
6177 dataset_size: 85794472
6178 - config_name: 20231101.tly
6179 features:
6180 - name: id
6181 dtype: string
6182 - name: url
6183 dtype: string
6184 - name: title
6185 dtype: string
6186 - name: text
6187 dtype: string
6188 splits:
6189 - name: train
6190 num_bytes: 2590482
6191 num_examples: 8086
6192 download_size: 1070456
6193 dataset_size: 2590482
6194 - config_name: 20231101.tn
6195 features:
6196 - name: id
6197 dtype: string
6198 - name: url
6199 dtype: string
6200 - name: title
6201 dtype: string
6202 - name: text
6203 dtype: string
6204 splits:
6205 - name: train
6206 num_bytes: 4380768
6207 num_examples: 1585
6208 download_size: 1708110
6209 dataset_size: 4380768
6210 - config_name: 20231101.to
6211 features:
6212 - name: id
6213 dtype: string
6214 - name: url
6215 dtype: string
6216 - name: title
6217 dtype: string
6218 - name: text
6219 dtype: string
6220 splits:
6221 - name: train
6222 num_bytes: 1090611
6223 num_examples: 1887
6224 download_size: 518244
6225 dataset_size: 1090611
6226 - config_name: 20231101.tpi
6227 features:
6228 - name: id
6229 dtype: string
6230 - name: url
6231 dtype: string
6232 - name: title
6233 dtype: string
6234 - name: text
6235 dtype: string
6236 splits:
6237 - name: train
6238 num_bytes: 460420
6239 num_examples: 1399
6240 download_size: 241908
6241 dataset_size: 460420
6242 - config_name: 20231101.tr
6243 features:
6244 - name: id
6245 dtype: string
6246 - name: url
6247 dtype: string
6248 - name: title
6249 dtype: string
6250 - name: text
6251 dtype: string
6252 splits:
6253 - name: train
6254 num_bytes: 997254242
6255 num_examples: 534988
6256 download_size: 552923659
6257 dataset_size: 997254242
6258 - config_name: 20231101.trv
6259 features:
6260 - name: id
6261 dtype: string
6262 - name: url
6263 dtype: string
6264 - name: title
6265 dtype: string
6266 - name: text
6267 dtype: string
6268 splits:
6269 - name: train
6270 num_bytes: 4971204
6271 num_examples: 1880
6272 download_size: 2706664
6273 dataset_size: 4971204
6274 - config_name: 20231101.ts
6275 features:
6276 - name: id
6277 dtype: string
6278 - name: url
6279 dtype: string
6280 - name: title
6281 dtype: string
6282 - name: text
6283 dtype: string
6284 splits:
6285 - name: train
6286 num_bytes: 847032
6287 num_examples: 785
6288 download_size: 455648
6289 dataset_size: 847032
6290 - config_name: 20231101.tt
6291 features:
6292 - name: id
6293 dtype: string
6294 - name: url
6295 dtype: string
6296 - name: title
6297 dtype: string
6298 - name: text
6299 dtype: string
6300 splits:
6301 - name: train
6302 num_bytes: 681325421
6303 num_examples: 501116
6304 download_size: 129141056
6305 dataset_size: 681325421
6306 - config_name: 20231101.tum
6307 features:
6308 - name: id
6309 dtype: string
6310 - name: url
6311 dtype: string
6312 - name: title
6313 dtype: string
6314 - name: text
6315 dtype: string
6316 splits:
6317 - name: train
6318 num_bytes: 13429984
6319 num_examples: 18708
6320 download_size: 5459856
6321 dataset_size: 13429984
6322 - config_name: 20231101.tw
6323 features:
6324 - name: id
6325 dtype: string
6326 - name: url
6327 dtype: string
6328 - name: title
6329 dtype: string
6330 - name: text
6331 dtype: string
6332 splits:
6333 - name: train
6334 num_bytes: 7982767
6335 num_examples: 3978
6336 download_size: 4118530
6337 dataset_size: 7982767
6338 - config_name: 20231101.ty
6339 features:
6340 - name: id
6341 dtype: string
6342 - name: url
6343 dtype: string
6344 - name: title
6345 dtype: string
6346 - name: text
6347 dtype: string
6348 splits:
6349 - name: train
6350 num_bytes: 338743
6351 num_examples: 1355
6352 download_size: 150963
6353 dataset_size: 338743
6354 - config_name: 20231101.tyv
6355 features:
6356 - name: id
6357 dtype: string
6358 - name: url
6359 dtype: string
6360 - name: title
6361 dtype: string
6362 - name: text
6363 dtype: string
6364 splits:
6365 - name: train
6366 num_bytes: 14324694
6367 num_examples: 3491
6368 download_size: 6528290
6369 dataset_size: 14324694
6370 - config_name: 20231101.udm
6371 features:
6372 - name: id
6373 dtype: string
6374 - name: url
6375 dtype: string
6376 - name: title
6377 dtype: string
6378 - name: text
6379 dtype: string
6380 splits:
6381 - name: train
6382 num_bytes: 7036113
6383 num_examples: 5677
6384 download_size: 2982821
6385 dataset_size: 7036113
6386 - config_name: 20231101.ug
6387 features:
6388 - name: id
6389 dtype: string
6390 - name: url
6391 dtype: string
6392 - name: title
6393 dtype: string
6394 - name: text
6395 dtype: string
6396 splits:
6397 - name: train
6398 num_bytes: 42254159
6399 num_examples: 8634
6400 download_size: 17741860
6401 dataset_size: 42254159
6402 - config_name: 20231101.uk
6403 features:
6404 - name: id
6405 dtype: string
6406 - name: url
6407 dtype: string
6408 - name: title
6409 dtype: string
6410 - name: text
6411 dtype: string
6412 splits:
6413 - name: train
6414 num_bytes: 4969483901
6415 num_examples: 1294720
6416 download_size: 2276769383
6417 dataset_size: 4969483901
6418 - config_name: 20231101.ur
6419 features:
6420 - name: id
6421 dtype: string
6422 - name: url
6423 dtype: string
6424 - name: title
6425 dtype: string
6426 - name: text
6427 dtype: string
6428 splits:
6429 - name: train
6430 num_bytes: 410511855
6431 num_examples: 200154
6432 download_size: 167627869
6433 dataset_size: 410511855
6434 - config_name: 20231101.uz
6435 features:
6436 - name: id
6437 dtype: string
6438 - name: url
6439 dtype: string
6440 - name: title
6441 dtype: string
6442 - name: text
6443 dtype: string
6444 splits:
6445 - name: train
6446 num_bytes: 397176774
6447 num_examples: 246729
6448 download_size: 210262652
6449 dataset_size: 397176774
6450 - config_name: 20231101.ve
6451 features:
6452 - name: id
6453 dtype: string
6454 - name: url
6455 dtype: string
6456 - name: title
6457 dtype: string
6458 - name: text
6459 dtype: string
6460 splits:
6461 - name: train
6462 num_bytes: 359542
6463 num_examples: 840
6464 download_size: 163318
6465 dataset_size: 359542
6466 - config_name: 20231101.vec
6467 features:
6468 - name: id
6469 dtype: string
6470 - name: url
6471 dtype: string
6472 - name: title
6473 dtype: string
6474 - name: text
6475 dtype: string
6476 splits:
6477 - name: train
6478 num_bytes: 37917528
6479 num_examples: 69268
6480 download_size: 16179506
6481 dataset_size: 37917528
6482 - config_name: 20231101.vep
6483 features:
6484 - name: id
6485 dtype: string
6486 - name: url
6487 dtype: string
6488 - name: title
6489 dtype: string
6490 - name: text
6491 dtype: string
6492 splits:
6493 - name: train
6494 num_bytes: 11643856
6495 num_examples: 6960
6496 download_size: 6423002
6497 dataset_size: 11643856
6498 - config_name: 20231101.vi
6499 features:
6500 - name: id
6501 dtype: string
6502 - name: url
6503 dtype: string
6504 - name: title
6505 dtype: string
6506 - name: text
6507 dtype: string
6508 splits:
6509 - name: train
6510 num_bytes: 1617830227
6511 num_examples: 1288680
6512 download_size: 729557588
6513 dataset_size: 1617830227
6514 - config_name: 20231101.vls
6515 features:
6516 - name: id
6517 dtype: string
6518 - name: url
6519 dtype: string
6520 - name: title
6521 dtype: string
6522 - name: text
6523 dtype: string
6524 splits:
6525 - name: train
6526 num_bytes: 11336278
6527 num_examples: 7872
6528 download_size: 6985406
6529 dataset_size: 11336278
6530 - config_name: 20231101.vo
6531 features:
6532 - name: id
6533 dtype: string
6534 - name: url
6535 dtype: string
6536 - name: title
6537 dtype: string
6538 - name: text
6539 dtype: string
6540 splits:
6541 - name: train
6542 num_bytes: 19521708
6543 num_examples: 35193
6544 download_size: 6582571
6545 dataset_size: 19521708
6546 - config_name: 20231101.wa
6547 features:
6548 - name: id
6549 dtype: string
6550 - name: url
6551 dtype: string
6552 - name: title
6553 dtype: string
6554 - name: text
6555 dtype: string
6556 splits:
6557 - name: train
6558 num_bytes: 12268826
6559 num_examples: 12038
6560 download_size: 7327616
6561 dataset_size: 12268826
6562 - config_name: 20231101.war
6563 features:
6564 - name: id
6565 dtype: string
6566 - name: url
6567 dtype: string
6568 - name: title
6569 dtype: string
6570 - name: text
6571 dtype: string
6572 splits:
6573 - name: train
6574 num_bytes: 467647882
6575 num_examples: 1266394
6576 download_size: 104588442
6577 dataset_size: 467647882
6578 - config_name: 20231101.wo
6579 features:
6580 - name: id
6581 dtype: string
6582 - name: url
6583 dtype: string
6584 - name: title
6585 dtype: string
6586 - name: text
6587 dtype: string
6588 splits:
6589 - name: train
6590 num_bytes: 3525303
6591 num_examples: 1746
6592 download_size: 2094574
6593 dataset_size: 3525303
6594 - config_name: 20231101.wuu
6595 features:
6596 - name: id
6597 dtype: string
6598 - name: url
6599 dtype: string
6600 - name: title
6601 dtype: string
6602 - name: text
6603 dtype: string
6604 splits:
6605 - name: train
6606 num_bytes: 25029545
6607 num_examples: 43010
6608 download_size: 15985963
6609 dataset_size: 25029545
6610 - config_name: 20231101.xal
6611 features:
6612 - name: id
6613 dtype: string
6614 - name: url
6615 dtype: string
6616 - name: title
6617 dtype: string
6618 - name: text
6619 dtype: string
6620 splits:
6621 - name: train
6622 num_bytes: 1391731
6623 num_examples: 2295
6624 download_size: 507198
6625 dataset_size: 1391731
6626 - config_name: 20231101.xh
6627 features:
6628 - name: id
6629 dtype: string
6630 - name: url
6631 dtype: string
6632 - name: title
6633 dtype: string
6634 - name: text
6635 dtype: string
6636 splits:
6637 - name: train
6638 num_bytes: 3665998
6639 num_examples: 1883
6640 download_size: 2505472
6641 dataset_size: 3665998
6642 - config_name: 20231101.xmf
6643 features:
6644 - name: id
6645 dtype: string
6646 - name: url
6647 dtype: string
6648 - name: title
6649 dtype: string
6650 - name: text
6651 dtype: string
6652 splits:
6653 - name: train
6654 num_bytes: 37712629
6655 num_examples: 18099
6656 download_size: 12948576
6657 dataset_size: 37712629
6658 - config_name: 20231101.yi
6659 features:
6660 - name: id
6661 dtype: string
6662 - name: url
6663 dtype: string
6664 - name: title
6665 dtype: string
6666 - name: text
6667 dtype: string
6668 splits:
6669 - name: train
6670 num_bytes: 36038273
6671 num_examples: 15179
6672 download_size: 16218296
6673 dataset_size: 36038273
6674 - config_name: 20231101.yo
6675 features:
6676 - name: id
6677 dtype: string
6678 - name: url
6679 dtype: string
6680 - name: title
6681 dtype: string
6682 - name: text
6683 dtype: string
6684 splits:
6685 - name: train
6686 num_bytes: 19081408
6687 num_examples: 33819
6688 download_size: 8861465
6689 dataset_size: 19081408
6690 - config_name: 20231101.za
6691 features:
6692 - name: id
6693 dtype: string
6694 - name: url
6695 dtype: string
6696 - name: title
6697 dtype: string
6698 - name: text
6699 dtype: string
6700 splits:
6701 - name: train
6702 num_bytes: 1365300
6703 num_examples: 2993
6704 download_size: 666521
6705 dataset_size: 1365300
6706 - config_name: 20231101.zea
6707 features:
6708 - name: id
6709 dtype: string
6710 - name: url
6711 dtype: string
6712 - name: title
6713 dtype: string
6714 - name: text
6715 dtype: string
6716 splits:
6717 - name: train
6718 num_bytes: 5224563
6719 num_examples: 6082
6720 download_size: 2620396
6721 dataset_size: 5224563
6722 - config_name: 20231101.zh
6723 features:
6724 - name: id
6725 dtype: string
6726 - name: url
6727 dtype: string
6728 - name: title
6729 dtype: string
6730 - name: text
6731 dtype: string
6732 splits:
6733 - name: train
6734 num_bytes: 2790577882
6735 num_examples: 1384748
6736 download_size: 1721150260
6737 dataset_size: 2790577882
6738 - config_name: 20231101.zh-classical
6739 features:
6740 - name: id
6741 dtype: string
6742 - name: url
6743 dtype: string
6744 - name: title
6745 dtype: string
6746 - name: text
6747 dtype: string
6748 splits:
6749 - name: train
6750 num_bytes: 14869227
6751 num_examples: 12708
6752 download_size: 10098073
6753 dataset_size: 14869227
6754 - config_name: 20231101.zh-min-nan
6755 features:
6756 - name: id
6757 dtype: string
6758 - name: url
6759 dtype: string
6760 - name: title
6761 dtype: string
6762 - name: text
6763 dtype: string
6764 splits:
6765 - name: train
6766 num_bytes: 153672031
6767 num_examples: 432798
6768 download_size: 37122048
6769 dataset_size: 153672031
6770 - config_name: 20231101.zh-yue
6771 features:
6772 - name: id
6773 dtype: string
6774 - name: url
6775 dtype: string
6776 - name: title
6777 dtype: string
6778 - name: text
6779 dtype: string
6780 splits:
6781 - name: train
6782 num_bytes: 109936351
6783 num_examples: 134140
6784 download_size: 64950815
6785 dataset_size: 109936351
6786 - config_name: 20231101.zu
6787 features:
6788 - name: id
6789 dtype: string
6790 - name: url
6791 dtype: string
6792 - name: title
6793 dtype: string
6794 - name: text
6795 dtype: string
6796 splits:
6797 - name: train
6798 num_bytes: 7088246
6799 num_examples: 11561
6800 download_size: 3792429
6801 dataset_size: 7088246
6802 language_bcp47:
6803 - be-tarask
6804 - en-simple
6805 ---
6806
6807 # Dataset Card for Wikimedia Wikipedia
6808
6809 ## Table of Contents
6810 - [Table of Contents](#table-of-contents)
6811 - [Dataset Description](#dataset-description)
6812 - [Dataset Summary](#dataset-summary)
6813 - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
6814 - [Languages](#languages)
6815 - [Dataset Structure](#dataset-structure)
6816 - [Data Instances](#data-instances)
6817 - [Data Fields](#data-fields)
6818 - [Data Splits](#data-splits)
6819 - [Dataset Creation](#dataset-creation)
6820 - [Curation Rationale](#curation-rationale)
6821 - [Source Data](#source-data)
6822 - [Annotations](#annotations)
6823 - [Personal and Sensitive Information](#personal-and-sensitive-information)
6824 - [Considerations for Using the Data](#considerations-for-using-the-data)
6825 - [Social Impact of Dataset](#social-impact-of-dataset)
6826 - [Discussion of Biases](#discussion-of-biases)
6827 - [Other Known Limitations](#other-known-limitations)
6828 - [Additional Information](#additional-information)
6829 - [Dataset Curators](#dataset-curators)
6830 - [Licensing Information](#licensing-information)
6831 - [Citation Information](#citation-information)
6832 - [Contributions](#contributions)
6833
6834 ## Dataset Description
6835
6836 - **Homepage:** [https://dumps.wikimedia.org](https://dumps.wikimedia.org)
6837 - **Repository:**
6838 - **Paper:**
6839 - **Point of Contact:**
6840
6841 ### Dataset Summary
6842
6843 Wikipedia dataset containing cleaned articles in all languages.
6844
6845 The dataset is built from the Wikipedia dumps (https://dumps.wikimedia.org/)
6846 with one subset per language, each containing a single train split.
6847
6848 Each example contains the content of one full Wikipedia article, cleaned to strip
6849 wiki markup and unwanted sections (references, etc.).
6850
6851
6852 All language subsets have already been processed for a recent dump, and you can load them per date and language this way:
6853 ```python
6854 from datasets import load_dataset
6855
6856 ds = load_dataset("wikimedia/wikipedia", "20231101.en")
6857 ```
6858
6859 #### Data Visualization
6860 Click the [Nomic Atlas](https://atlas.nomic.ai/map/475c26d7-b142-4795-9887-02b6eeb18dc0/0d312be6-a3bb-4586-b6b7-53dcd0cbefa5) map below to visualize the 6.4 million samples in the `20231101.en` subset.
6861
6862 <a href="https://atlas.nomic.ai/map/475c26d7-b142-4795-9887-02b6eeb18dc0/0d312be6-a3bb-4586-b6b7-53dcd0cbefa5">
6863 <img src="https://cdn-uploads.huggingface.co/production/uploads/6480c476cacb1c4a0696eeb8/sZNN6Vubc0Oue83vKaJUu.webp" alt="Nomic-Atlas Wikipedia Map" width="25%"/>
6864 </a>
6865
6866 ### Supported Tasks and Leaderboards
6867
6868 The dataset is generally used for Language Modeling.
6869
6870 ### Languages
6871
6872 You can find the list of languages here: https://meta.wikimedia.org/wiki/List_of_Wikipedias
6873
6874 ## Dataset Structure
6875
6876 ### Data Instances
6877
6878 An example looks as follows:
6879 ```
6880 {'id': '1',
6881 'url': 'https://simple.wikipedia.org/wiki/April',
6882 'title': 'April',
6883 'text': 'April is the fourth month...'
6884 }
6885 ```
6886
6887 ### Data Fields
6888
6889 The data fields are the same among all configurations:
6890 - `id` (`str`): ID of the article.
6891 - `url` (`str`): URL of the article.
6892 - `title` (`str`): Title of the article.
6893 - `text` (`str`): Text content of the article.
6894
6895 ### Data Splits
6896
6897 All configurations contain a single `train` split.
6898
6899 ## Dataset Creation
6900
6901 ### Curation Rationale
6902
6903 [More Information Needed]
6904
6905 ### Source Data
6906
6907 #### Initial Data Collection and Normalization
6908
6909 The dataset is built from the Wikipedia dumps: https://dumps.wikimedia.org
6910
6911 You can find the full list of languages and dates here: https://dumps.wikimedia.org/backup-index.html
6912
6913 The articles have been parsed using the [`mwparserfromhell`](https://mwparserfromhell.readthedocs.io) tool.
6914
6915 When uploading the data files for the 20231101 dump, we noticed that the Wikimedia Dumps website does not contain a dump for this date
6916 for the "bbc", "dga", or "zgh" Wikipedias. We have reported the issue to the Wikimedia Phabricator: https://phabricator.wikimedia.org/T351761
6917
6918 #### Who are the source language producers?
6919
6920 [More Information Needed]
6921
6922 ### Annotations
6923
6924 #### Annotation process
6925
6926 [More Information Needed]
6927
6928 #### Who are the annotators?
6929
6930 [More Information Needed]
6931
6932 ### Personal and Sensitive Information
6933
6934 [More Information Needed]
6935
6936 ## Considerations for Using the Data
6937
6938 ### Social Impact of Dataset
6939
6940 [More Information Needed]
6941
6942 ### Discussion of Biases
6943
6944 [More Information Needed]
6945
6946 ### Other Known Limitations
6947
6948 [More Information Needed]
6949
6950 ## Additional Information
6951
6952 ### Dataset Curators
6953
6954 [More Information Needed]
6955
6956 ### Licensing Information
6957
6958 Copyright licensing information: https://dumps.wikimedia.org/legal.html
6959
6960 All original textual content is licensed under the [GNU Free Documentation License](https://www.gnu.org/licenses/fdl-1.3.html) (GFDL)
6961 and the [Creative Commons Attribution-Share-Alike 3.0 License](https://creativecommons.org/licenses/by-sa/3.0/).
6962 Some text may be available only under the Creative Commons license; see the Wikimedia [Terms of Use](https://foundation.wikimedia.org/wiki/Policy:Terms_of_Use) for details.
6963 Text written by some authors may be released under additional licenses or into the public domain.
6964
6965 ### Citation Information
6966
6967 ```
6968 @ONLINE{wikidump,
6969 author = "Wikimedia Foundation",
6970 title = "Wikimedia Downloads",
6971 url = "https://dumps.wikimedia.org"
6972 }
6973 ```