{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "104": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "30526": {
      "content": "<mask>",
      "lstrip": true,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "do_lower_case": true,
  "eos_token": "</s>",
  "mask_token": "<mask>",
  "max_length": 250,
  "model_max_length": 512,
  "pad_to_multiple_of": null,
  "pad_token": "<pad>",
  "pad_token_type_id": 0,
  "padding_side": "right",
  "sep_token": "</s>",
  "stride": 0,
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "MPNetTokenizer",
  "truncation_side": "right",
  "truncation_strategy": "longest_first",
  "unk_token": "[UNK]"
}