tokenizer_config.json
16.3 KB · 305 lines · json Raw
1 {
2 "add_prefix_space": false,
3 "added_tokens_decoder": {
4 "248044": {
5 "content": "<|endoftext|>",
6 "lstrip": false,
7 "normalized": false,
8 "rstrip": false,
9 "single_word": false,
10 "special": true
11 },
12 "248045": {
13 "content": "<|im_start|>",
14 "lstrip": false,
15 "normalized": false,
16 "rstrip": false,
17 "single_word": false,
18 "special": true
19 },
20 "248046": {
21 "content": "<|im_end|>",
22 "lstrip": false,
23 "normalized": false,
24 "rstrip": false,
25 "single_word": false,
26 "special": true
27 },
28 "248047": {
29 "content": "<|object_ref_start|>",
30 "lstrip": false,
31 "normalized": false,
32 "rstrip": false,
33 "single_word": false,
34 "special": true
35 },
36 "248048": {
37 "content": "<|object_ref_end|>",
38 "lstrip": false,
39 "normalized": false,
40 "rstrip": false,
41 "single_word": false,
42 "special": true
43 },
44 "248049": {
45 "content": "<|box_start|>",
46 "lstrip": false,
47 "normalized": false,
48 "rstrip": false,
49 "single_word": false,
50 "special": true
51 },
52 "248050": {
53 "content": "<|box_end|>",
54 "lstrip": false,
55 "normalized": false,
56 "rstrip": false,
57 "single_word": false,
58 "special": true
59 },
60 "248051": {
61 "content": "<|quad_start|>",
62 "lstrip": false,
63 "normalized": false,
64 "rstrip": false,
65 "single_word": false,
66 "special": true
67 },
68 "248052": {
69 "content": "<|quad_end|>",
70 "lstrip": false,
71 "normalized": false,
72 "rstrip": false,
73 "single_word": false,
74 "special": true
75 },
76 "248053": {
77 "content": "<|vision_start|>",
78 "lstrip": false,
79 "normalized": false,
80 "rstrip": false,
81 "single_word": false,
82 "special": true
83 },
84 "248054": {
85 "content": "<|vision_end|>",
86 "lstrip": false,
87 "normalized": false,
88 "rstrip": false,
89 "single_word": false,
90 "special": true
91 },
92 "248055": {
93 "content": "<|vision_pad|>",
94 "lstrip": false,
95 "normalized": false,
96 "rstrip": false,
97 "single_word": false,
98 "special": true
99 },
100 "248056": {
101 "content": "<|image_pad|>",
102 "lstrip": false,
103 "normalized": false,
104 "rstrip": false,
105 "single_word": false,
106 "special": true
107 },
108 "248057": {
109 "content": "<|video_pad|>",
110 "lstrip": false,
111 "normalized": false,
112 "rstrip": false,
113 "single_word": false,
114 "special": true
115 },
116 "248058": {
117 "content": "<tool_call>",
118 "lstrip": false,
119 "normalized": false,
120 "rstrip": false,
121 "single_word": false,
122 "special": false
123 },
124 "248059": {
125 "content": "</tool_call>",
126 "lstrip": false,
127 "normalized": false,
128 "rstrip": false,
129 "single_word": false,
130 "special": false
131 },
132 "248060": {
133 "content": "<|fim_prefix|>",
134 "lstrip": false,
135 "normalized": false,
136 "rstrip": false,
137 "single_word": false,
138 "special": false
139 },
140 "248061": {
141 "content": "<|fim_middle|>",
142 "lstrip": false,
143 "normalized": false,
144 "rstrip": false,
145 "single_word": false,
146 "special": false
147 },
148 "248062": {
149 "content": "<|fim_suffix|>",
150 "lstrip": false,
151 "normalized": false,
152 "rstrip": false,
153 "single_word": false,
154 "special": false
155 },
156 "248063": {
157 "content": "<|fim_pad|>",
158 "lstrip": false,
159 "normalized": false,
160 "rstrip": false,
161 "single_word": false,
162 "special": false
163 },
164 "248064": {
165 "content": "<|repo_name|>",
166 "lstrip": false,
167 "normalized": false,
168 "rstrip": false,
169 "single_word": false,
170 "special": false
171 },
172 "248065": {
173 "content": "<|file_sep|>",
174 "lstrip": false,
175 "normalized": false,
176 "rstrip": false,
177 "single_word": false,
178 "special": false
179 },
180 "248066": {
181 "content": "<tool_response>",
182 "lstrip": false,
183 "normalized": false,
184 "rstrip": false,
185 "single_word": false,
186 "special": false
187 },
188 "248067": {
189 "content": "</tool_response>",
190 "lstrip": false,
191 "normalized": false,
192 "rstrip": false,
193 "single_word": false,
194 "special": false
195 },
196 "248068": {
197 "content": "<think>",
198 "lstrip": false,
199 "normalized": false,
200 "rstrip": false,
201 "single_word": false,
202 "special": false
203 },
204 "248069": {
205 "content": "</think>",
206 "lstrip": false,
207 "normalized": false,
208 "rstrip": false,
209 "single_word": false,
210 "special": false
211 },
212 "248070": {
213 "content": "<|audio_start|>",
214 "lstrip": false,
215 "normalized": false,
216 "rstrip": false,
217 "single_word": false,
218 "special": true
219 },
220 "248071": {
221 "content": "<|audio_end|>",
222 "lstrip": false,
223 "normalized": false,
224 "rstrip": false,
225 "single_word": false,
226 "special": true
227 },
228 "248072": {
229 "content": "<tts_pad>",
230 "lstrip": false,
231 "normalized": false,
232 "rstrip": false,
233 "single_word": false,
234 "special": true
235 },
236 "248073": {
237 "content": "<tts_text_bos>",
238 "lstrip": false,
239 "normalized": false,
240 "rstrip": false,
241 "single_word": false,
242 "special": true
243 },
244 "248074": {
245 "content": "<tts_text_eod>",
246 "lstrip": false,
247 "normalized": false,
248 "rstrip": false,
249 "single_word": false,
250 "special": true
251 },
252 "248075": {
253 "content": "<tts_text_bos_single>",
254 "lstrip": false,
255 "normalized": false,
256 "rstrip": false,
257 "single_word": false,
258 "special": true
259 },
260 "248076": {
261 "content": "<|audio_pad|>",
262 "lstrip": false,
263 "normalized": false,
264 "rstrip": false,
265 "single_word": false,
266 "special": true
267 }
268 },
269 "additional_special_tokens": [
270 "<|im_start|>",
271 "<|im_end|>",
272 "<|object_ref_start|>",
273 "<|object_ref_end|>",
274 "<|box_start|>",
275 "<|box_end|>",
276 "<|quad_start|>",
277 "<|quad_end|>",
278 "<|vision_start|>",
279 "<|vision_end|>",
280 "<|vision_pad|>",
281 "<|image_pad|>",
282 "<|video_pad|>"
283 ],
284 "bos_token": null,
285 "chat_template": "{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- macro render_content(content, do_vision_count, is_system_content=false) %}\n {%- if content is string %}\n {{- content }}\n {%- elif content is iterable and content is not mapping %}\n {%- for item in content %}\n {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}\n {%- if is_system_content %}\n {{- raise_exception('System message cannot contain images.') }}\n {%- endif %}\n {%- if do_vision_count %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- endif %}\n {%- if add_vision_id %}\n {{- 'Picture ' ~ image_count.value ~ ': ' }}\n {%- endif %}\n {{- '<|vision_start|><|image_pad|><|vision_end|>' }}\n {%- elif 'video' in item or item.type == 'video' %}\n {%- if is_system_content %}\n {{- raise_exception('System message cannot contain videos.') }}\n {%- endif %}\n {%- if do_vision_count %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- endif %}\n {%- if add_vision_id %}\n {{- 'Video ' ~ video_count.value ~ ': ' }}\n {%- endif %}\n {{- '<|vision_start|><|video_pad|><|vision_end|>' }}\n {%- elif 'text' in item %}\n {{- item.text }}\n {%- else %}\n {{- raise_exception('Unexpected item type in content.') }}\n {%- endif %}\n {%- endfor %}\n {%- elif content is none or content is undefined %}\n {{- '' }}\n {%- else %}\n {{- raise_exception('Unexpected content type.') }}\n {%- endif %}\n{%- endmacro %}\n{%- if not messages %}\n {{- raise_exception('No messages provided.') }}\n{%- endif %}\n{%- if tools and tools is iterable and tools is not mapping %}\n {{- '<|im_start|>system\\n' }}\n {{- \"# Tools\\n\\nYou have access to the following functions:\\n\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\" }}\n {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n<tool_call>\\n<function=example_function_name>\\n<parameter=example_parameter_1>\\nvalue_1\\n</parameter>\\n<parameter=example_parameter_2>\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n</parameter>\\n</function>\\n</tool_call>\\n\\n<IMPORTANT>\\nReminder:\\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n</IMPORTANT>' }}\n {%- if messages[0].role == 'system' %}\n {%- set content = render_content(messages[0].content, false, true)|trim %}\n {%- if content %}\n {{- '\\n\\n' + content }}\n {%- endif %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {%- set content = render_content(messages[0].content, false, true)|trim %}\n {{- '<|im_start|>system\\n' + content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" %}\n {%- set content = render_content(message.content, false)|trim %}\n {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if ns.multi_step_tool %}\n {{- raise_exception('No user query found in messages.') }}\n{%- endif %}\n{%- for message in messages %}\n {%- set content = render_content(message.content, true)|trim %}\n {%- if message.role == \"system\" %}\n {%- if not loop.first %}\n {{- raise_exception('System message must be at the beginning.') }}\n {%- endif %}\n {%- elif message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- set reasoning_content = reasoning_content|trim %}\n {%- if loop.index0 > ns.last_query_index %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content + '\\n</think>\\n\\n' + content }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {%- if loop.first %}\n {%- if content|trim %}\n {{- '\\n\\n<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n {%- else %}\n {{- '<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n {%- endif %}\n {%- else %}\n {{- '\\n<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n {%- endif %}\n {%- if tool_call.arguments is defined %}\n {%- for args_name, args_value in tool_call.arguments|items %}\n {{- '<parameter=' + args_name + '>\\n' }}\n {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n {{- args_value }}\n {{- '\\n</parameter>\\n' }}\n {%- endfor %}\n {%- endif %}\n {{- '</function>\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.previtem and loop.previtem.role != \"tool\" %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if not loop.last and loop.nextitem.role != \"tool\" %}\n {{- '<|im_end|>\\n' }}\n {%- elif loop.last %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Unexpected message role.') }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is true %}\n {{- '<think>\\n' }}\n {%- else %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
286 "clean_up_tokenization_spaces": false,
287 "eos_token": "<|im_end|>",
288 "errors": "replace",
289 "model_max_length": 262144,
290 "pad_token": "<|endoftext|>",
291 "split_special_tokens": false,
292 "tokenizer_class": "Qwen2Tokenizer",
293 "unk_token": null,
294 "add_bos_token": false,
295 "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
296 "extra_special_tokens": {
297 "audio_bos_token": "<|audio_start|>",
298 "audio_eos_token": "<|audio_end|>",
299 "audio_token": "<|audio_pad|>",
300 "image_token": "<|image_pad|>",
301 "video_token": "<|video_pad|>",
302 "vision_bos_token": "<|vision_start|>",
303 "vision_eos_token": "<|vision_end|>"
304 }
305 }