{
  "backend": "tokenizers",
  "clean_up_tokenization_spaces": false,
  "do_lower_case": false,
  "eos_token": "<|endoftext|>",
  "extra_special_tokens": [
    "<|endoftext|>",
    "[MASK]",
    "[gMASK]",
    "[sMASK]",
    "<sop>",
    "<eop>",
    "<|system|>",
    "<|user|>",
    "<|assistant|>",
    "<|observation|>",
    "<|begin_of_image|>",
    "<|end_of_image|>",
    "<|begin_of_video|>",
    "<|end_of_video|>",
    "<|begin_of_audio|>",
    "<|end_of_audio|>",
    "<|begin_of_transcription|>",
    "<|end_of_transcription|>"
  ],
  "is_local": true,
  "model_max_length": 202752,
  "model_specific_special_tokens": {},
  "pad_token": "<|endoftext|>",
  "padding_side": "left",
  "remove_space": false,
  "tokenizer_class": "TokenizersBackend"
}