{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "250001": {
      "content": "<mask>",
      "lstrip": true,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": "<mask>",
  "model_max_length": 8192,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "sp_model_kwargs": {},
  "tokenizer_class": "XLMRobertaTokenizer",
  "unk_token": "<unk>"
}