{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "101": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "102": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "103": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "model_max_length": 8192,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}