{
  "architectures": [
    "Qwen3_5MoeForConditionalGeneration"
  ],
  "image_token_id": 248056,
  "model_type": "qwen3_5_moe",
  "text_config": {
    "attention_bias": false,
    "attention_dropout": 0.0,
    "attn_output_gate": true,
    "bos_token_id": 248044,
    "dtype": "bfloat16",
    "eos_token_id": 248044,
    "full_attention_interval": 4,
    "head_dim": 256,
    "hidden_act": "silu",
    "hidden_size": 2048,
    "initializer_range": 0.02,
    "layer_types": [
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention"
    ],
    "linear_conv_kernel_dim": 4,
    "linear_key_head_dim": 128,
    "linear_num_key_heads": 16,
    "linear_num_value_heads": 32,
    "linear_value_head_dim": 128,
    "mamba_ssm_dtype": "float32",
    "max_position_embeddings": 262144,
    "model_type": "qwen3_5_moe_text",
    "moe_intermediate_size": 512,
    "mtp_num_hidden_layers": 1,
    "mtp_use_dedicated_embeddings": false,
    "num_attention_heads": 16,
    "num_experts": 256,
    "num_experts_per_tok": 8,
    "num_hidden_layers": 40,
    "num_key_value_heads": 2,
    "output_router_logits": false,
    "pad_token_id": null,
    "partial_rotary_factor": 0.25,
    "rms_norm_eps": 1e-06,
    "rope_parameters": {
      "mrope_interleaved": true,
      "mrope_section": [
        11,
        11,
        10
      ],
      "partial_rotary_factor": 0.25,
      "rope_theta": 10000000,
      "rope_type": "default"
    },
    "router_aux_loss_coef": 0.001,
    "shared_expert_intermediate_size": 512,
    "tie_word_embeddings": false,
    "use_cache": true,
    "vocab_size": 248320
  },
  "tie_word_embeddings": false,
  "transformers_version": "4.57.1",
  "video_token_id": 248057,
  "vision_config": {
    "deepstack_visual_indexes": [],
    "depth": 27,
    "hidden_act": "gelu_pytorch_tanh",
    "hidden_size": 1152,
    "in_channels": 3,
    "initializer_range": 0.02,
    "intermediate_size": 4304,
    "model_type": "qwen3_5_moe",
    "num_heads": 16,
    "num_position_embeddings": 2304,
    "out_hidden_size": 2048,
    "patch_size": 16,
    "spatial_merge_size": 2,
    "temporal_patch_size": 2
  },
  "vision_end_token_id": 248054,
  "vision_start_token_id": 248053,
  "quantization_config": {
    "activation_scheme": "dynamic",
    "fmt": "e4m3",
    "quant_method": "fp8",
    "modules_to_not_convert": [
      "model.visual.blocks.0.attn.proj",
      "model.visual.blocks.0.attn.qkv",
      "model.visual.blocks.0.mlp.linear_fc1",
      "model.visual.blocks.0.mlp.linear_fc2",
      "visual.blocks.0.attn.proj",
      "visual.blocks.0.attn.qkv_proj",
      "visual.blocks.0.mlp.linear_fc1",
      "visual.blocks.0.mlp.linear_fc2",
      "model.visual.blocks.1.attn.proj",
      "model.visual.blocks.1.attn.qkv",
      "model.visual.blocks.1.mlp.linear_fc1",
      "model.visual.blocks.1.mlp.linear_fc2",
      "visual.blocks.1.attn.proj",
      "visual.blocks.1.attn.qkv_proj",
      "visual.blocks.1.mlp.linear_fc1",
      "visual.blocks.1.mlp.linear_fc2",
      "model.visual.blocks.2.attn.proj",
      "model.visual.blocks.2.attn.qkv",
      "model.visual.blocks.2.mlp.linear_fc1",
      "model.visual.blocks.2.mlp.linear_fc2",
      "visual.blocks.2.attn.proj",
      "visual.blocks.2.attn.qkv_proj",
      "visual.blocks.2.mlp.linear_fc1",
      "visual.blocks.2.mlp.linear_fc2",
      "model.visual.blocks.3.attn.proj",
      "model.visual.blocks.3.attn.qkv",
      "model.visual.blocks.3.mlp.linear_fc1",
      "model.visual.blocks.3.mlp.linear_fc2",
      "visual.blocks.3.attn.proj",
      "visual.blocks.3.attn.qkv_proj",
      "visual.blocks.3.mlp.linear_fc1",
      "visual.blocks.3.mlp.linear_fc2",
      "model.visual.blocks.4.attn.proj",
      "model.visual.blocks.4.attn.qkv",
      "model.visual.blocks.4.mlp.linear_fc1",
      "model.visual.blocks.4.mlp.linear_fc2",
      "visual.blocks.4.attn.proj",
      "visual.blocks.4.attn.qkv_proj",
      "visual.blocks.4.mlp.linear_fc1",
      "visual.blocks.4.mlp.linear_fc2",
      "model.visual.blocks.5.attn.proj",
      "model.visual.blocks.5.attn.qkv",
      "model.visual.blocks.5.mlp.linear_fc1",
      "model.visual.blocks.5.mlp.linear_fc2",
      "visual.blocks.5.attn.proj",
      "visual.blocks.5.attn.qkv_proj",
      "visual.blocks.5.mlp.linear_fc1",
      "visual.blocks.5.mlp.linear_fc2",
      "model.visual.blocks.6.attn.proj",
      "model.visual.blocks.6.attn.qkv",
      "model.visual.blocks.6.mlp.linear_fc1",
      "model.visual.blocks.6.mlp.linear_fc2",
      "visual.blocks.6.attn.proj",
      "visual.blocks.6.attn.qkv_proj",
      "visual.blocks.6.mlp.linear_fc1",
      "visual.blocks.6.mlp.linear_fc2",
      "model.visual.blocks.7.attn.proj",
      "model.visual.blocks.7.attn.qkv",
      "model.visual.blocks.7.mlp.linear_fc1",
      "model.visual.blocks.7.mlp.linear_fc2",
      "visual.blocks.7.attn.proj",
      "visual.blocks.7.attn.qkv_proj",
      "visual.blocks.7.mlp.linear_fc1",
      "visual.blocks.7.mlp.linear_fc2",
      "model.visual.blocks.8.attn.proj",
      "model.visual.blocks.8.attn.qkv",
      "model.visual.blocks.8.mlp.linear_fc1",
      "model.visual.blocks.8.mlp.linear_fc2",
      "visual.blocks.8.attn.proj",
      "visual.blocks.8.attn.qkv_proj",
      "visual.blocks.8.mlp.linear_fc1",
      "visual.blocks.8.mlp.linear_fc2",
      "model.visual.blocks.9.attn.proj",
      "model.visual.blocks.9.attn.qkv",
      "model.visual.blocks.9.mlp.linear_fc1",
      "model.visual.blocks.9.mlp.linear_fc2",
      "visual.blocks.9.attn.proj",
      "visual.blocks.9.attn.qkv_proj",
      "visual.blocks.9.mlp.linear_fc1",
      "visual.blocks.9.mlp.linear_fc2",
      "model.visual.blocks.10.attn.proj",
      "model.visual.blocks.10.attn.qkv",
      "model.visual.blocks.10.mlp.linear_fc1",
      "model.visual.blocks.10.mlp.linear_fc2",
      "visual.blocks.10.attn.proj",
      "visual.blocks.10.attn.qkv_proj",
      "visual.blocks.10.mlp.linear_fc1",
      "visual.blocks.10.mlp.linear_fc2",
      "model.visual.blocks.11.attn.proj",
      "model.visual.blocks.11.attn.qkv",
      "model.visual.blocks.11.mlp.linear_fc1",
      "model.visual.blocks.11.mlp.linear_fc2",
      "visual.blocks.11.attn.proj",
      "visual.blocks.11.attn.qkv_proj",
      "visual.blocks.11.mlp.linear_fc1",
      "visual.blocks.11.mlp.linear_fc2",
      "model.visual.blocks.12.attn.proj",
      "model.visual.blocks.12.attn.qkv",
      "model.visual.blocks.12.mlp.linear_fc1",
      "model.visual.blocks.12.mlp.linear_fc2",
      "visual.blocks.12.attn.proj",
      "visual.blocks.12.attn.qkv_proj",
      "visual.blocks.12.mlp.linear_fc1",
      "visual.blocks.12.mlp.linear_fc2",
      "model.visual.blocks.13.attn.proj",
      "model.visual.blocks.13.attn.qkv",
      "model.visual.blocks.13.mlp.linear_fc1",
      "model.visual.blocks.13.mlp.linear_fc2",
      "visual.blocks.13.attn.proj",
      "visual.blocks.13.attn.qkv_proj",
      "visual.blocks.13.mlp.linear_fc1",
      "visual.blocks.13.mlp.linear_fc2",
      "model.visual.blocks.14.attn.proj",
      "model.visual.blocks.14.attn.qkv",
      "model.visual.blocks.14.mlp.linear_fc1",
      "model.visual.blocks.14.mlp.linear_fc2",
      "visual.blocks.14.attn.proj",
      "visual.blocks.14.attn.qkv_proj",
      "visual.blocks.14.mlp.linear_fc1",
      "visual.blocks.14.mlp.linear_fc2",
      "model.visual.blocks.15.attn.proj",
      "model.visual.blocks.15.attn.qkv",
      "model.visual.blocks.15.mlp.linear_fc1",
      "model.visual.blocks.15.mlp.linear_fc2",
      "visual.blocks.15.attn.proj",
      "visual.blocks.15.attn.qkv_proj",
      "visual.blocks.15.mlp.linear_fc1",
      "visual.blocks.15.mlp.linear_fc2",
      "model.visual.blocks.16.attn.proj",
      "model.visual.blocks.16.attn.qkv",
      "model.visual.blocks.16.mlp.linear_fc1",
      "model.visual.blocks.16.mlp.linear_fc2",
      "visual.blocks.16.attn.proj",
      "visual.blocks.16.attn.qkv_proj",
      "visual.blocks.16.mlp.linear_fc1",
      "visual.blocks.16.mlp.linear_fc2",
      "model.visual.blocks.17.attn.proj",
      "model.visual.blocks.17.attn.qkv",
      "model.visual.blocks.17.mlp.linear_fc1",
      "model.visual.blocks.17.mlp.linear_fc2",
      "visual.blocks.17.attn.proj",
      "visual.blocks.17.attn.qkv_proj",
      "visual.blocks.17.mlp.linear_fc1",
      "visual.blocks.17.mlp.linear_fc2",
      "model.visual.blocks.18.attn.proj",
      "model.visual.blocks.18.attn.qkv",
      "model.visual.blocks.18.mlp.linear_fc1",
      "model.visual.blocks.18.mlp.linear_fc2",
      "visual.blocks.18.attn.proj",
      "visual.blocks.18.attn.qkv_proj",
      "visual.blocks.18.mlp.linear_fc1",
      "visual.blocks.18.mlp.linear_fc2",
      "model.visual.blocks.19.attn.proj",
      "model.visual.blocks.19.attn.qkv",
      "model.visual.blocks.19.mlp.linear_fc1",
      "model.visual.blocks.19.mlp.linear_fc2",
      "visual.blocks.19.attn.proj",
      "visual.blocks.19.attn.qkv_proj",
      "visual.blocks.19.mlp.linear_fc1",
      "visual.blocks.19.mlp.linear_fc2",
      "model.visual.blocks.20.attn.proj",
      "model.visual.blocks.20.attn.qkv",
      "model.visual.blocks.20.mlp.linear_fc1",
      "model.visual.blocks.20.mlp.linear_fc2",
      "visual.blocks.20.attn.proj",
      "visual.blocks.20.attn.qkv_proj",
      "visual.blocks.20.mlp.linear_fc1",
      "visual.blocks.20.mlp.linear_fc2",
      "model.visual.blocks.21.attn.proj",
      "model.visual.blocks.21.attn.qkv",
      "model.visual.blocks.21.mlp.linear_fc1",
      "model.visual.blocks.21.mlp.linear_fc2",
      "visual.blocks.21.attn.proj",
      "visual.blocks.21.attn.qkv_proj",
      "visual.blocks.21.mlp.linear_fc1",
      "visual.blocks.21.mlp.linear_fc2",
      "model.visual.blocks.22.attn.proj",
      "model.visual.blocks.22.attn.qkv",
      "model.visual.blocks.22.mlp.linear_fc1",
      "model.visual.blocks.22.mlp.linear_fc2",
      "visual.blocks.22.attn.proj",
      "visual.blocks.22.attn.qkv_proj",
      "visual.blocks.22.mlp.linear_fc1",
      "visual.blocks.22.mlp.linear_fc2",
      "model.visual.blocks.23.attn.proj",
      "model.visual.blocks.23.attn.qkv",
      "model.visual.blocks.23.mlp.linear_fc1",
      "model.visual.blocks.23.mlp.linear_fc2",
      "visual.blocks.23.attn.proj",
      "visual.blocks.23.attn.qkv_proj",
      "visual.blocks.23.mlp.linear_fc1",
      "visual.blocks.23.mlp.linear_fc2",
      "model.visual.blocks.24.attn.proj",
      "model.visual.blocks.24.attn.qkv",
      "model.visual.blocks.24.mlp.linear_fc1",
      "model.visual.blocks.24.mlp.linear_fc2",
      "visual.blocks.24.attn.proj",
      "visual.blocks.24.attn.qkv_proj",
      "visual.blocks.24.mlp.linear_fc1",
      "visual.blocks.24.mlp.linear_fc2",
      "model.visual.blocks.25.attn.proj",
      "model.visual.blocks.25.attn.qkv",
      "model.visual.blocks.25.mlp.linear_fc1",
      "model.visual.blocks.25.mlp.linear_fc2",
      "visual.blocks.25.attn.proj",
      "visual.blocks.25.attn.qkv_proj",
      "visual.blocks.25.mlp.linear_fc1",
      "visual.blocks.25.mlp.linear_fc2",
      "model.visual.blocks.26.attn.proj",
      "model.visual.blocks.26.attn.qkv",
      "model.visual.blocks.26.mlp.linear_fc1",
      "model.visual.blocks.26.mlp.linear_fc2",
      "visual.blocks.26.attn.proj",
      "visual.blocks.26.attn.qkv_proj",
      "visual.blocks.26.mlp.linear_fc1",
      "visual.blocks.26.mlp.linear_fc2",
      "model.visual.deepstack_merger_list.0.linear_fc1",
      "model.visual.deepstack_merger_list.0.linear_fc2",
      "model.visual.deepstack_merger_list.0.norm",
      "visual.deepstack_merger_list.0.linear_fc1",
      "visual.deepstack_merger_list.0.linear_fc2",
      "visual.deepstack_merger_list.0.norm",
      "model.visual.deepstack_merger_list.1.linear_fc1",
      "model.visual.deepstack_merger_list.1.linear_fc2",
      "model.visual.deepstack_merger_list.1.norm",
      "visual.deepstack_merger_list.1.linear_fc1",
      "visual.deepstack_merger_list.1.linear_fc2",
      "visual.deepstack_merger_list.1.norm",
      "model.visual.deepstack_merger_list.2.linear_fc1",
      "model.visual.deepstack_merger_list.2.linear_fc2",
      "model.visual.deepstack_merger_list.2.norm",
      "visual.deepstack_merger_list.2.linear_fc1",
      "visual.deepstack_merger_list.2.linear_fc2",
      "visual.deepstack_merger_list.2.norm",
      "model.visual.merger.linear_fc1",
      "model.visual.merger.linear_fc2",
      "model.visual.merger.norm",
      "model.visual.patch_embed.proj",
      "model.visual.pos_embed",
      "visual.merger.linear_fc1",
      "visual.merger.linear_fc2",
      "visual.merger.norm",
      "visual.patch_embed.proj",
      "visual.pos_embed",
      "visual",
      "model.visual",
      "lm_head",
      "model.embed_tokens",
      "model.language_model.layers.0.input_layernorm",
      "model.language_model.layers.0.mlp.shared_expert_gate",
      "model.language_model.layers.0.post_attention_layernorm",
      "model.language_model.layers.0.mlp.gate",
      "model.language_model.layers.0.linear_attn.A_log",
      "model.language_model.layers.0.linear_attn.conv1d",
      "model.language_model.layers.0.linear_attn.dt_bias",
      "model.language_model.layers.0.linear_attn.in_proj_ba",
      "model.language_model.layers.0.linear_attn.in_proj_b",
      "model.language_model.layers.0.linear_attn.in_proj_a",
      "model.language_model.layers.0.linear_attn.norm",
      "model.language_model.layers.1.input_layernorm",
      "model.language_model.layers.1.mlp.shared_expert_gate",
      "model.language_model.layers.1.post_attention_layernorm",
      "model.language_model.layers.1.mlp.gate",
      "model.language_model.layers.1.linear_attn.A_log",
      "model.language_model.layers.1.linear_attn.conv1d",
      "model.language_model.layers.1.linear_attn.dt_bias",
      "model.language_model.layers.1.linear_attn.in_proj_ba",
      "model.language_model.layers.1.linear_attn.in_proj_b",
      "model.language_model.layers.1.linear_attn.in_proj_a",
      "model.language_model.layers.1.linear_attn.norm",
      "model.language_model.layers.2.input_layernorm",
      "model.language_model.layers.2.mlp.shared_expert_gate",
      "model.language_model.layers.2.post_attention_layernorm",
      "model.language_model.layers.2.mlp.gate",
      "model.language_model.layers.2.linear_attn.A_log",
      "model.language_model.layers.2.linear_attn.conv1d",
      "model.language_model.layers.2.linear_attn.dt_bias",
      "model.language_model.layers.2.linear_attn.in_proj_ba",
      "model.language_model.layers.2.linear_attn.in_proj_b",
      "model.language_model.layers.2.linear_attn.in_proj_a",
      "model.language_model.layers.2.linear_attn.norm",
      "model.language_model.layers.3.input_layernorm",
      "model.language_model.layers.3.mlp.shared_expert_gate",
      "model.language_model.layers.3.post_attention_layernorm",
      "model.language_model.layers.3.mlp.gate",
      "model.language_model.layers.3.self_attn.k_norm",
      "model.language_model.layers.3.self_attn.q_norm",
      "model.language_model.layers.4.input_layernorm",
      "model.language_model.layers.4.mlp.shared_expert_gate",
      "model.language_model.layers.4.post_attention_layernorm",
      "model.language_model.layers.4.mlp.gate",
      "model.language_model.layers.4.linear_attn.A_log",
      "model.language_model.layers.4.linear_attn.conv1d",
      "model.language_model.layers.4.linear_attn.dt_bias",
      "model.language_model.layers.4.linear_attn.in_proj_ba",
      "model.language_model.layers.4.linear_attn.in_proj_b",
      "model.language_model.layers.4.linear_attn.in_proj_a",
      "model.language_model.layers.4.linear_attn.norm",
      "model.language_model.layers.5.input_layernorm",
      "model.language_model.layers.5.mlp.shared_expert_gate",
      "model.language_model.layers.5.post_attention_layernorm",
      "model.language_model.layers.5.mlp.gate",
      "model.language_model.layers.5.linear_attn.A_log",
      "model.language_model.layers.5.linear_attn.conv1d",
      "model.language_model.layers.5.linear_attn.dt_bias",
      "model.language_model.layers.5.linear_attn.in_proj_ba",
      "model.language_model.layers.5.linear_attn.in_proj_b",
      "model.language_model.layers.5.linear_attn.in_proj_a",
      "model.language_model.layers.5.linear_attn.norm",
      "model.language_model.layers.6.input_layernorm",
      "model.language_model.layers.6.mlp.shared_expert_gate",
      "model.language_model.layers.6.post_attention_layernorm",
      "model.language_model.layers.6.mlp.gate",
      "model.language_model.layers.6.linear_attn.A_log",
      "model.language_model.layers.6.linear_attn.conv1d",
      "model.language_model.layers.6.linear_attn.dt_bias",
      "model.language_model.layers.6.linear_attn.in_proj_ba",
      "model.language_model.layers.6.linear_attn.in_proj_b",
      "model.language_model.layers.6.linear_attn.in_proj_a",
      "model.language_model.layers.6.linear_attn.norm",
      "model.language_model.layers.7.input_layernorm",
      "model.language_model.layers.7.mlp.shared_expert_gate",
      "model.language_model.layers.7.post_attention_layernorm",
      "model.language_model.layers.7.mlp.gate",
      "model.language_model.layers.7.self_attn.k_norm",
      "model.language_model.layers.7.self_attn.q_norm",
      "model.language_model.layers.8.input_layernorm",
      "model.language_model.layers.8.mlp.shared_expert_gate",
      "model.language_model.layers.8.post_attention_layernorm",
      "model.language_model.layers.8.mlp.gate",
      "model.language_model.layers.8.linear_attn.A_log",
      "model.language_model.layers.8.linear_attn.conv1d",
      "model.language_model.layers.8.linear_attn.dt_bias",
      "model.language_model.layers.8.linear_attn.in_proj_ba",
      "model.language_model.layers.8.linear_attn.in_proj_b",
      "model.language_model.layers.8.linear_attn.in_proj_a",
      "model.language_model.layers.8.linear_attn.norm",
      "model.language_model.layers.9.input_layernorm",
      "model.language_model.layers.9.mlp.shared_expert_gate",
      "model.language_model.layers.9.post_attention_layernorm",
      "model.language_model.layers.9.mlp.gate",
      "model.language_model.layers.9.linear_attn.A_log",
      "model.language_model.layers.9.linear_attn.conv1d",
      "model.language_model.layers.9.linear_attn.dt_bias",
      "model.language_model.layers.9.linear_attn.in_proj_ba",
      "model.language_model.layers.9.linear_attn.in_proj_b",
      "model.language_model.layers.9.linear_attn.in_proj_a",
      "model.language_model.layers.9.linear_attn.norm",
      "model.language_model.layers.10.input_layernorm",
      "model.language_model.layers.10.mlp.shared_expert_gate",
      "model.language_model.layers.10.post_attention_layernorm",
      "model.language_model.layers.10.mlp.gate",
      "model.language_model.layers.10.linear_attn.A_log",
      "model.language_model.layers.10.linear_attn.conv1d",
      "model.language_model.layers.10.linear_attn.dt_bias",
      "model.language_model.layers.10.linear_attn.in_proj_ba",
      "model.language_model.layers.10.linear_attn.in_proj_b",
      "model.language_model.layers.10.linear_attn.in_proj_a",
      "model.language_model.layers.10.linear_attn.norm",
      "model.language_model.layers.11.input_layernorm",
      "model.language_model.layers.11.mlp.shared_expert_gate",
      "model.language_model.layers.11.post_attention_layernorm",
      "model.language_model.layers.11.mlp.gate",
      "model.language_model.layers.11.self_attn.k_norm",
      "model.language_model.layers.11.self_attn.q_norm",
      "model.language_model.layers.12.input_layernorm",
      "model.language_model.layers.12.mlp.shared_expert_gate",
      "model.language_model.layers.12.post_attention_layernorm",
      "model.language_model.layers.12.mlp.gate",
      "model.language_model.layers.12.linear_attn.A_log",
      "model.language_model.layers.12.linear_attn.conv1d",
      "model.language_model.layers.12.linear_attn.dt_bias",
      "model.language_model.layers.12.linear_attn.in_proj_ba",
      "model.language_model.layers.12.linear_attn.in_proj_b",
      "model.language_model.layers.12.linear_attn.in_proj_a",
      "model.language_model.layers.12.linear_attn.norm",
      "model.language_model.layers.13.input_layernorm",
      "model.language_model.layers.13.mlp.shared_expert_gate",
      "model.language_model.layers.13.post_attention_layernorm",
      "model.language_model.layers.13.mlp.gate",
      "model.language_model.layers.13.linear_attn.A_log",
      "model.language_model.layers.13.linear_attn.conv1d",
      "model.language_model.layers.13.linear_attn.dt_bias",
      "model.language_model.layers.13.linear_attn.in_proj_ba",
      "model.language_model.layers.13.linear_attn.in_proj_b",
      "model.language_model.layers.13.linear_attn.in_proj_a",
      "model.language_model.layers.13.linear_attn.norm",
      "model.language_model.layers.14.input_layernorm",
      "model.language_model.layers.14.mlp.shared_expert_gate",
      "model.language_model.layers.14.post_attention_layernorm",
      "model.language_model.layers.14.mlp.gate",
      "model.language_model.layers.14.linear_attn.A_log",
      "model.language_model.layers.14.linear_attn.conv1d",
      "model.language_model.layers.14.linear_attn.dt_bias",
      "model.language_model.layers.14.linear_attn.in_proj_ba",
      "model.language_model.layers.14.linear_attn.in_proj_b",
      "model.language_model.layers.14.linear_attn.in_proj_a",
      "model.language_model.layers.14.linear_attn.norm",
      "model.language_model.layers.15.input_layernorm",
      "model.language_model.layers.15.mlp.shared_expert_gate",
      "model.language_model.layers.15.post_attention_layernorm",
      "model.language_model.layers.15.mlp.gate",
      "model.language_model.layers.15.self_attn.k_norm",
      "model.language_model.layers.15.self_attn.q_norm",
      "model.language_model.layers.16.input_layernorm",
      "model.language_model.layers.16.mlp.shared_expert_gate",
      "model.language_model.layers.16.post_attention_layernorm",
      "model.language_model.layers.16.mlp.gate",
      "model.language_model.layers.16.linear_attn.A_log",
      "model.language_model.layers.16.linear_attn.conv1d",
      "model.language_model.layers.16.linear_attn.dt_bias",
      "model.language_model.layers.16.linear_attn.in_proj_ba",
      "model.language_model.layers.16.linear_attn.in_proj_b",
      "model.language_model.layers.16.linear_attn.in_proj_a",
      "model.language_model.layers.16.linear_attn.norm",
      "model.language_model.layers.17.input_layernorm",
      "model.language_model.layers.17.mlp.shared_expert_gate",
      "model.language_model.layers.17.post_attention_layernorm",
      "model.language_model.layers.17.mlp.gate",
      "model.language_model.layers.17.linear_attn.A_log",
      "model.language_model.layers.17.linear_attn.conv1d",
      "model.language_model.layers.17.linear_attn.dt_bias",
      "model.language_model.layers.17.linear_attn.in_proj_ba",
      "model.language_model.layers.17.linear_attn.in_proj_b",
      "model.language_model.layers.17.linear_attn.in_proj_a",
      "model.language_model.layers.17.linear_attn.norm",
      "model.language_model.layers.18.input_layernorm",
      "model.language_model.layers.18.mlp.shared_expert_gate",
      "model.language_model.layers.18.post_attention_layernorm",
      "model.language_model.layers.18.mlp.gate",
      "model.language_model.layers.18.linear_attn.A_log",
      "model.language_model.layers.18.linear_attn.conv1d",
      "model.language_model.layers.18.linear_attn.dt_bias",
      "model.language_model.layers.18.linear_attn.in_proj_ba",
      "model.language_model.layers.18.linear_attn.in_proj_b",
      "model.language_model.layers.18.linear_attn.in_proj_a",
      "model.language_model.layers.18.linear_attn.norm",
      "model.language_model.layers.19.input_layernorm",
      "model.language_model.layers.19.mlp.shared_expert_gate",
      "model.language_model.layers.19.post_attention_layernorm",
      "model.language_model.layers.19.mlp.gate",
      "model.language_model.layers.19.self_attn.k_norm",
      "model.language_model.layers.19.self_attn.q_norm",
      "model.language_model.layers.20.input_layernorm",
      "model.language_model.layers.20.mlp.shared_expert_gate",
      "model.language_model.layers.20.post_attention_layernorm",
      "model.language_model.layers.20.mlp.gate",
      "model.language_model.layers.20.linear_attn.A_log",
      "model.language_model.layers.20.linear_attn.conv1d",
      "model.language_model.layers.20.linear_attn.dt_bias",
      "model.language_model.layers.20.linear_attn.in_proj_ba",
      "model.language_model.layers.20.linear_attn.in_proj_b",
      "model.language_model.layers.20.linear_attn.in_proj_a",
      "model.language_model.layers.20.linear_attn.norm",
      "model.language_model.layers.21.input_layernorm",
      "model.language_model.layers.21.mlp.shared_expert_gate",
      "model.language_model.layers.21.post_attention_layernorm",
      "model.language_model.layers.21.mlp.gate",
      "model.language_model.layers.21.linear_attn.A_log",
      "model.language_model.layers.21.linear_attn.conv1d",
      "model.language_model.layers.21.linear_attn.dt_bias",
      "model.language_model.layers.21.linear_attn.in_proj_ba",
      "model.language_model.layers.21.linear_attn.in_proj_b",
      "model.language_model.layers.21.linear_attn.in_proj_a",
      "model.language_model.layers.21.linear_attn.norm",
      "model.language_model.layers.22.input_layernorm",
      "model.language_model.layers.22.mlp.shared_expert_gate",
      "model.language_model.layers.22.post_attention_layernorm",
      "model.language_model.layers.22.mlp.gate",
      "model.language_model.layers.22.linear_attn.A_log",
      "model.language_model.layers.22.linear_attn.conv1d",
      "model.language_model.layers.22.linear_attn.dt_bias",
      "model.language_model.layers.22.linear_attn.in_proj_ba",
      "model.language_model.layers.22.linear_attn.in_proj_b",
      "model.language_model.layers.22.linear_attn.in_proj_a",
      "model.language_model.layers.22.linear_attn.norm",
      "model.language_model.layers.23.input_layernorm",
      "model.language_model.layers.23.mlp.shared_expert_gate",
      "model.language_model.layers.23.post_attention_layernorm",
      "model.language_model.layers.23.mlp.gate",
      "model.language_model.layers.23.self_attn.k_norm",
      "model.language_model.layers.23.self_attn.q_norm",
      "model.language_model.layers.24.input_layernorm",
      "model.language_model.layers.24.mlp.shared_expert_gate",
      "model.language_model.layers.24.post_attention_layernorm",
      "model.language_model.layers.24.mlp.gate",
      "model.language_model.layers.24.linear_attn.A_log",
      "model.language_model.layers.24.linear_attn.conv1d",
      "model.language_model.layers.24.linear_attn.dt_bias",
      "model.language_model.layers.24.linear_attn.in_proj_ba",
      "model.language_model.layers.24.linear_attn.in_proj_b",
      "model.language_model.layers.24.linear_attn.in_proj_a",
      "model.language_model.layers.24.linear_attn.norm",
      "model.language_model.layers.25.input_layernorm",
      "model.language_model.layers.25.mlp.shared_expert_gate",
      "model.language_model.layers.25.post_attention_layernorm",
      "model.language_model.layers.25.mlp.gate",
      "model.language_model.layers.25.linear_attn.A_log",
      "model.language_model.layers.25.linear_attn.conv1d",
      "model.language_model.layers.25.linear_attn.dt_bias",
      "model.language_model.layers.25.linear_attn.in_proj_ba",
      "model.language_model.layers.25.linear_attn.in_proj_b",
      "model.language_model.layers.25.linear_attn.in_proj_a",
      "model.language_model.layers.25.linear_attn.norm",
      "model.language_model.layers.26.input_layernorm",
      "model.language_model.layers.26.mlp.shared_expert_gate",
      "model.language_model.layers.26.post_attention_layernorm",
      "model.language_model.layers.26.mlp.gate",
      "model.language_model.layers.26.linear_attn.A_log",
      "model.language_model.layers.26.linear_attn.conv1d",
      "model.language_model.layers.26.linear_attn.dt_bias",
      "model.language_model.layers.26.linear_attn.in_proj_ba",
      "model.language_model.layers.26.linear_attn.in_proj_b",
      "model.language_model.layers.26.linear_attn.in_proj_a",
      "model.language_model.layers.26.linear_attn.norm",
      "model.language_model.layers.27.input_layernorm",
      "model.language_model.layers.27.mlp.shared_expert_gate",
      "model.language_model.layers.27.post_attention_layernorm",
      "model.language_model.layers.27.mlp.gate",
      "model.language_model.layers.27.self_attn.k_norm",
      "model.language_model.layers.27.self_attn.q_norm",
      "model.language_model.layers.28.input_layernorm",
      "model.language_model.layers.28.mlp.shared_expert_gate",
      "model.language_model.layers.28.post_attention_layernorm",
      "model.language_model.layers.28.mlp.gate",
      "model.language_model.layers.28.linear_attn.A_log",
      "model.language_model.layers.28.linear_attn.conv1d",
      "model.language_model.layers.28.linear_attn.dt_bias",
      "model.language_model.layers.28.linear_attn.in_proj_ba",
      "model.language_model.layers.28.linear_attn.in_proj_b",
      "model.language_model.layers.28.linear_attn.in_proj_a",
      "model.language_model.layers.28.linear_attn.norm",
      "model.language_model.layers.29.input_layernorm",
      "model.language_model.layers.29.mlp.shared_expert_gate",
      "model.language_model.layers.29.post_attention_layernorm",
      "model.language_model.layers.29.mlp.gate",
      "model.language_model.layers.29.linear_attn.A_log",
      "model.language_model.layers.29.linear_attn.conv1d",
      "model.language_model.layers.29.linear_attn.dt_bias",
      "model.language_model.layers.29.linear_attn.in_proj_ba",
      "model.language_model.layers.29.linear_attn.in_proj_b",
      "model.language_model.layers.29.linear_attn.in_proj_a",
      "model.language_model.layers.29.linear_attn.norm",
      "model.language_model.layers.30.input_layernorm",
      "model.language_model.layers.30.mlp.shared_expert_gate",
      "model.language_model.layers.30.post_attention_layernorm",
      "model.language_model.layers.30.mlp.gate",
      "model.language_model.layers.30.linear_attn.A_log",
      "model.language_model.layers.30.linear_attn.conv1d",
      "model.language_model.layers.30.linear_attn.dt_bias",
      "model.language_model.layers.30.linear_attn.in_proj_ba",
      "model.language_model.layers.30.linear_attn.in_proj_b",
      "model.language_model.layers.30.linear_attn.in_proj_a",
      "model.language_model.layers.30.linear_attn.norm",
      "model.language_model.layers.31.input_layernorm",
      "model.language_model.layers.31.mlp.shared_expert_gate",
      "model.language_model.layers.31.post_attention_layernorm",
      "model.language_model.layers.31.mlp.gate",
      "model.language_model.layers.31.self_attn.k_norm",
      "model.language_model.layers.31.self_attn.q_norm",
      "model.language_model.layers.32.input_layernorm",
      "model.language_model.layers.32.mlp.shared_expert_gate",
      "model.language_model.layers.32.post_attention_layernorm",
      "model.language_model.layers.32.mlp.gate",
      "model.language_model.layers.32.linear_attn.A_log",
      "model.language_model.layers.32.linear_attn.conv1d",
      "model.language_model.layers.32.linear_attn.dt_bias",
      "model.language_model.layers.32.linear_attn.in_proj_ba",
      "model.language_model.layers.32.linear_attn.in_proj_b",
      "model.language_model.layers.32.linear_attn.in_proj_a",
      "model.language_model.layers.32.linear_attn.norm",
      "model.language_model.layers.33.input_layernorm",
      "model.language_model.layers.33.mlp.shared_expert_gate",
      "model.language_model.layers.33.post_attention_layernorm",
      "model.language_model.layers.33.mlp.gate",
      "model.language_model.layers.33.linear_attn.A_log",
      "model.language_model.layers.33.linear_attn.conv1d",
      "model.language_model.layers.33.linear_attn.dt_bias",
      "model.language_model.layers.33.linear_attn.in_proj_ba",
      "model.language_model.layers.33.linear_attn.in_proj_b",
      "model.language_model.layers.33.linear_attn.in_proj_a",
      "model.language_model.layers.33.linear_attn.norm",
      "model.language_model.layers.34.input_layernorm",
      "model.language_model.layers.34.mlp.shared_expert_gate",
      "model.language_model.layers.34.post_attention_layernorm",
      "model.language_model.layers.34.mlp.gate",
      "model.language_model.layers.34.linear_attn.A_log",
      "model.language_model.layers.34.linear_attn.conv1d",
      "model.language_model.layers.34.linear_attn.dt_bias",
      "model.language_model.layers.34.linear_attn.in_proj_ba",
      "model.language_model.layers.34.linear_attn.in_proj_b",
      "model.language_model.layers.34.linear_attn.in_proj_a",
      "model.language_model.layers.34.linear_attn.norm",
      "model.language_model.layers.35.input_layernorm",
      "model.language_model.layers.35.mlp.shared_expert_gate",
      "model.language_model.layers.35.post_attention_layernorm",
      "model.language_model.layers.35.mlp.gate",
      "model.language_model.layers.35.self_attn.k_norm",
      "model.language_model.layers.35.self_attn.q_norm",
      "model.language_model.layers.36.input_layernorm",
      "model.language_model.layers.36.mlp.shared_expert_gate",
      "model.language_model.layers.36.post_attention_layernorm",
      "model.language_model.layers.36.mlp.gate",
      "model.language_model.layers.36.linear_attn.A_log",
      "model.language_model.layers.36.linear_attn.conv1d",
      "model.language_model.layers.36.linear_attn.dt_bias",
      "model.language_model.layers.36.linear_attn.in_proj_ba",
      "model.language_model.layers.36.linear_attn.in_proj_b",
      "model.language_model.layers.36.linear_attn.in_proj_a",
      "model.language_model.layers.36.linear_attn.norm",
      "model.language_model.layers.37.input_layernorm",
      "model.language_model.layers.37.mlp.shared_expert_gate",
      "model.language_model.layers.37.post_attention_layernorm",
      "model.language_model.layers.37.mlp.gate",
      "model.language_model.layers.37.linear_attn.A_log",
      "model.language_model.layers.37.linear_attn.conv1d",
      "model.language_model.layers.37.linear_attn.dt_bias",
      "model.language_model.layers.37.linear_attn.in_proj_ba",
      "model.language_model.layers.37.linear_attn.in_proj_b",
      "model.language_model.layers.37.linear_attn.in_proj_a",
      "model.language_model.layers.37.linear_attn.norm",
      "model.language_model.layers.38.input_layernorm",
      "model.language_model.layers.38.mlp.shared_expert_gate",
      "model.language_model.layers.38.post_attention_layernorm",
      "model.language_model.layers.38.mlp.gate",
      "model.language_model.layers.38.linear_attn.A_log",
      "model.language_model.layers.38.linear_attn.conv1d",
      "model.language_model.layers.38.linear_attn.dt_bias",
      "model.language_model.layers.38.linear_attn.in_proj_ba",
      "model.language_model.layers.38.linear_attn.in_proj_b",
      "model.language_model.layers.38.linear_attn.in_proj_a",
      "model.language_model.layers.38.linear_attn.norm",
      "model.language_model.layers.39.input_layernorm",
      "model.language_model.layers.39.mlp.shared_expert_gate",
      "model.language_model.layers.39.post_attention_layernorm",
      "model.language_model.layers.39.mlp.gate",
      "model.language_model.layers.39.self_attn.k_norm",
      "model.language_model.layers.39.self_attn.q_norm",
      "mtp.layers.0.input_layernorm",
      "mtp.layers.0.mlp.gate",
      "mtp.layers.0.mlp.shared_expert_gate",
      "mtp.layers.0.post_attention_layernorm",
      "mtp.layers.0.self_attn.k_norm",
      "mtp.layers.0.self_attn.q_norm",
      "mtp.fc",
      "mtp.norm",
      "mtp.pre_fc_norm_embedding",
      "mtp.pre_fc_norm_hidden"
    ],
    "weight_block_size": [
      128,
      128
    ]
  }
}