tokenizer_config.json
| 1 | { |
| 2 | "add_bos_token": true, |
| 3 | "add_eos_token": false, |
| 4 | "add_prefix_space": null, |
| 5 | "added_tokens_decoder": { |
| 6 | "0": { |
| 7 | "content": "<unk>", |
| 8 | "lstrip": false, |
| 9 | "normalized": false, |
| 10 | "rstrip": false, |
| 11 | "single_word": false, |
| 12 | "special": true |
| 13 | }, |
| 14 | "1": { |
| 15 | "content": "<s>", |
| 16 | "lstrip": false, |
| 17 | "normalized": false, |
| 18 | "rstrip": false, |
| 19 | "single_word": false, |
| 20 | "special": true |
| 21 | }, |
| 22 | "2": { |
| 23 | "content": "</s>", |
| 24 | "lstrip": false, |
| 25 | "normalized": false, |
| 26 | "rstrip": false, |
| 27 | "single_word": false, |
| 28 | "special": true |
| 29 | } |
| 30 | }, |
| 31 | "bos_token": "<s>", |
| 32 | "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\n' + content.strip() + '\n<</SYS>>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", |
| 33 | "clean_up_tokenization_spaces": false, |
| 34 | "eos_token": "</s>", |
| 35 | "extra_special_tokens": {}, |
| 36 | "legacy": true, |
| 37 | "model_max_length": 2048, |
| 38 | "pad_token": "<unk>", |
| 39 | "tokenizer_class": "LlamaTokenizer", |
| 40 | "unk_token": "<unk>", |
| 41 | "use_default_system_prompt": true |
| 42 | } |
| 43 | |