config.json
50.1 KB · 1033 lines · json Raw
1 {
2 "architectures": [
3 "Qwen3_5ForConditionalGeneration"
4 ],
5 "image_token_id": 248056,
6 "language_model_only": false,
7 "model_type": "qwen3_5",
8 "text_config": {
9 "attention_bias": false,
10 "attention_dropout": 0.0,
11 "attn_output_gate": true,
12 "bos_token_id": 248044,
13 "dtype": "bfloat16",
14 "eos_token_id": 248044,
15 "full_attention_interval": 4,
16 "head_dim": 256,
17 "hidden_act": "silu",
18 "hidden_size": 5120,
19 "initializer_range": 0.02,
20 "intermediate_size": 17408,
21 "layer_types": [
22 "linear_attention",
23 "linear_attention",
24 "linear_attention",
25 "full_attention",
26 "linear_attention",
27 "linear_attention",
28 "linear_attention",
29 "full_attention",
30 "linear_attention",
31 "linear_attention",
32 "linear_attention",
33 "full_attention",
34 "linear_attention",
35 "linear_attention",
36 "linear_attention",
37 "full_attention",
38 "linear_attention",
39 "linear_attention",
40 "linear_attention",
41 "full_attention",
42 "linear_attention",
43 "linear_attention",
44 "linear_attention",
45 "full_attention",
46 "linear_attention",
47 "linear_attention",
48 "linear_attention",
49 "full_attention",
50 "linear_attention",
51 "linear_attention",
52 "linear_attention",
53 "full_attention",
54 "linear_attention",
55 "linear_attention",
56 "linear_attention",
57 "full_attention",
58 "linear_attention",
59 "linear_attention",
60 "linear_attention",
61 "full_attention",
62 "linear_attention",
63 "linear_attention",
64 "linear_attention",
65 "full_attention",
66 "linear_attention",
67 "linear_attention",
68 "linear_attention",
69 "full_attention",
70 "linear_attention",
71 "linear_attention",
72 "linear_attention",
73 "full_attention",
74 "linear_attention",
75 "linear_attention",
76 "linear_attention",
77 "full_attention",
78 "linear_attention",
79 "linear_attention",
80 "linear_attention",
81 "full_attention",
82 "linear_attention",
83 "linear_attention",
84 "linear_attention",
85 "full_attention"
86 ],
87 "linear_conv_kernel_dim": 4,
88 "linear_key_head_dim": 128,
89 "linear_num_key_heads": 16,
90 "linear_num_value_heads": 48,
91 "linear_value_head_dim": 128,
92 "mamba_ssm_dtype": "float32",
93 "max_position_embeddings": 262144,
94 "model_type": "qwen3_5_text",
95 "mtp_num_hidden_layers": 1,
96 "mtp_use_dedicated_embeddings": false,
97 "num_attention_heads": 24,
98 "num_hidden_layers": 64,
99 "num_key_value_heads": 4,
100 "output_gate_type": "swish",
101 "pad_token_id": null,
102 "partial_rotary_factor": 0.25,
103 "rms_norm_eps": 1e-06,
104 "rope_parameters": {
105 "mrope_interleaved": true,
106 "mrope_section": [
107 11,
108 11,
109 10
110 ],
111 "partial_rotary_factor": 0.25,
112 "rope_theta": 10000000,
113 "rope_type": "default"
114 },
115 "tie_word_embeddings": false,
116 "use_cache": true,
117 "vocab_size": 248320
118 },
119 "tie_word_embeddings": false,
120 "transformers_version": "4.57.1",
121 "video_token_id": 248057,
122 "vision_config": {
123 "deepstack_visual_indexes": [],
124 "depth": 27,
125 "hidden_act": "gelu_pytorch_tanh",
126 "hidden_size": 1152,
127 "in_channels": 3,
128 "initializer_range": 0.02,
129 "intermediate_size": 4304,
130 "model_type": "qwen3_5",
131 "num_heads": 16,
132 "num_position_embeddings": 2304,
133 "out_hidden_size": 5120,
134 "patch_size": 16,
135 "spatial_merge_size": 2,
136 "temporal_patch_size": 2
137 },
138 "vision_end_token_id": 248054,
139 "vision_start_token_id": 248053,
140 "quantization_config": {
141 "activation_scheme": "dynamic",
142 "fmt": "e4m3",
143 "quant_method": "fp8",
144 "modules_to_not_convert": [
145 "model.visual.blocks.0.attn.proj",
146 "model.visual.blocks.0.attn.qkv",
147 "model.visual.blocks.0.mlp.linear_fc1",
148 "model.visual.blocks.0.mlp.linear_fc2",
149 "visual.blocks.0.attn.proj",
150 "visual.blocks.0.attn.qkv_proj",
151 "visual.blocks.0.mlp.linear_fc1",
152 "visual.blocks.0.mlp.linear_fc2",
153 "model.visual.blocks.1.attn.proj",
154 "model.visual.blocks.1.attn.qkv",
155 "model.visual.blocks.1.mlp.linear_fc1",
156 "model.visual.blocks.1.mlp.linear_fc2",
157 "visual.blocks.1.attn.proj",
158 "visual.blocks.1.attn.qkv_proj",
159 "visual.blocks.1.mlp.linear_fc1",
160 "visual.blocks.1.mlp.linear_fc2",
161 "model.visual.blocks.2.attn.proj",
162 "model.visual.blocks.2.attn.qkv",
163 "model.visual.blocks.2.mlp.linear_fc1",
164 "model.visual.blocks.2.mlp.linear_fc2",
165 "visual.blocks.2.attn.proj",
166 "visual.blocks.2.attn.qkv_proj",
167 "visual.blocks.2.mlp.linear_fc1",
168 "visual.blocks.2.mlp.linear_fc2",
169 "model.visual.blocks.3.attn.proj",
170 "model.visual.blocks.3.attn.qkv",
171 "model.visual.blocks.3.mlp.linear_fc1",
172 "model.visual.blocks.3.mlp.linear_fc2",
173 "visual.blocks.3.attn.proj",
174 "visual.blocks.3.attn.qkv_proj",
175 "visual.blocks.3.mlp.linear_fc1",
176 "visual.blocks.3.mlp.linear_fc2",
177 "model.visual.blocks.4.attn.proj",
178 "model.visual.blocks.4.attn.qkv",
179 "model.visual.blocks.4.mlp.linear_fc1",
180 "model.visual.blocks.4.mlp.linear_fc2",
181 "visual.blocks.4.attn.proj",
182 "visual.blocks.4.attn.qkv_proj",
183 "visual.blocks.4.mlp.linear_fc1",
184 "visual.blocks.4.mlp.linear_fc2",
185 "model.visual.blocks.5.attn.proj",
186 "model.visual.blocks.5.attn.qkv",
187 "model.visual.blocks.5.mlp.linear_fc1",
188 "model.visual.blocks.5.mlp.linear_fc2",
189 "visual.blocks.5.attn.proj",
190 "visual.blocks.5.attn.qkv_proj",
191 "visual.blocks.5.mlp.linear_fc1",
192 "visual.blocks.5.mlp.linear_fc2",
193 "model.visual.blocks.6.attn.proj",
194 "model.visual.blocks.6.attn.qkv",
195 "model.visual.blocks.6.mlp.linear_fc1",
196 "model.visual.blocks.6.mlp.linear_fc2",
197 "visual.blocks.6.attn.proj",
198 "visual.blocks.6.attn.qkv_proj",
199 "visual.blocks.6.mlp.linear_fc1",
200 "visual.blocks.6.mlp.linear_fc2",
201 "model.visual.blocks.7.attn.proj",
202 "model.visual.blocks.7.attn.qkv",
203 "model.visual.blocks.7.mlp.linear_fc1",
204 "model.visual.blocks.7.mlp.linear_fc2",
205 "visual.blocks.7.attn.proj",
206 "visual.blocks.7.attn.qkv_proj",
207 "visual.blocks.7.mlp.linear_fc1",
208 "visual.blocks.7.mlp.linear_fc2",
209 "model.visual.blocks.8.attn.proj",
210 "model.visual.blocks.8.attn.qkv",
211 "model.visual.blocks.8.mlp.linear_fc1",
212 "model.visual.blocks.8.mlp.linear_fc2",
213 "visual.blocks.8.attn.proj",
214 "visual.blocks.8.attn.qkv_proj",
215 "visual.blocks.8.mlp.linear_fc1",
216 "visual.blocks.8.mlp.linear_fc2",
217 "model.visual.blocks.9.attn.proj",
218 "model.visual.blocks.9.attn.qkv",
219 "model.visual.blocks.9.mlp.linear_fc1",
220 "model.visual.blocks.9.mlp.linear_fc2",
221 "visual.blocks.9.attn.proj",
222 "visual.blocks.9.attn.qkv_proj",
223 "visual.blocks.9.mlp.linear_fc1",
224 "visual.blocks.9.mlp.linear_fc2",
225 "model.visual.blocks.10.attn.proj",
226 "model.visual.blocks.10.attn.qkv",
227 "model.visual.blocks.10.mlp.linear_fc1",
228 "model.visual.blocks.10.mlp.linear_fc2",
229 "visual.blocks.10.attn.proj",
230 "visual.blocks.10.attn.qkv_proj",
231 "visual.blocks.10.mlp.linear_fc1",
232 "visual.blocks.10.mlp.linear_fc2",
233 "model.visual.blocks.11.attn.proj",
234 "model.visual.blocks.11.attn.qkv",
235 "model.visual.blocks.11.mlp.linear_fc1",
236 "model.visual.blocks.11.mlp.linear_fc2",
237 "visual.blocks.11.attn.proj",
238 "visual.blocks.11.attn.qkv_proj",
239 "visual.blocks.11.mlp.linear_fc1",
240 "visual.blocks.11.mlp.linear_fc2",
241 "model.visual.blocks.12.attn.proj",
242 "model.visual.blocks.12.attn.qkv",
243 "model.visual.blocks.12.mlp.linear_fc1",
244 "model.visual.blocks.12.mlp.linear_fc2",
245 "visual.blocks.12.attn.proj",
246 "visual.blocks.12.attn.qkv_proj",
247 "visual.blocks.12.mlp.linear_fc1",
248 "visual.blocks.12.mlp.linear_fc2",
249 "model.visual.blocks.13.attn.proj",
250 "model.visual.blocks.13.attn.qkv",
251 "model.visual.blocks.13.mlp.linear_fc1",
252 "model.visual.blocks.13.mlp.linear_fc2",
253 "visual.blocks.13.attn.proj",
254 "visual.blocks.13.attn.qkv_proj",
255 "visual.blocks.13.mlp.linear_fc1",
256 "visual.blocks.13.mlp.linear_fc2",
257 "model.visual.blocks.14.attn.proj",
258 "model.visual.blocks.14.attn.qkv",
259 "model.visual.blocks.14.mlp.linear_fc1",
260 "model.visual.blocks.14.mlp.linear_fc2",
261 "visual.blocks.14.attn.proj",
262 "visual.blocks.14.attn.qkv_proj",
263 "visual.blocks.14.mlp.linear_fc1",
264 "visual.blocks.14.mlp.linear_fc2",
265 "model.visual.blocks.15.attn.proj",
266 "model.visual.blocks.15.attn.qkv",
267 "model.visual.blocks.15.mlp.linear_fc1",
268 "model.visual.blocks.15.mlp.linear_fc2",
269 "visual.blocks.15.attn.proj",
270 "visual.blocks.15.attn.qkv_proj",
271 "visual.blocks.15.mlp.linear_fc1",
272 "visual.blocks.15.mlp.linear_fc2",
273 "model.visual.blocks.16.attn.proj",
274 "model.visual.blocks.16.attn.qkv",
275 "model.visual.blocks.16.mlp.linear_fc1",
276 "model.visual.blocks.16.mlp.linear_fc2",
277 "visual.blocks.16.attn.proj",
278 "visual.blocks.16.attn.qkv_proj",
279 "visual.blocks.16.mlp.linear_fc1",
280 "visual.blocks.16.mlp.linear_fc2",
281 "model.visual.blocks.17.attn.proj",
282 "model.visual.blocks.17.attn.qkv",
283 "model.visual.blocks.17.mlp.linear_fc1",
284 "model.visual.blocks.17.mlp.linear_fc2",
285 "visual.blocks.17.attn.proj",
286 "visual.blocks.17.attn.qkv_proj",
287 "visual.blocks.17.mlp.linear_fc1",
288 "visual.blocks.17.mlp.linear_fc2",
289 "model.visual.blocks.18.attn.proj",
290 "model.visual.blocks.18.attn.qkv",
291 "model.visual.blocks.18.mlp.linear_fc1",
292 "model.visual.blocks.18.mlp.linear_fc2",
293 "visual.blocks.18.attn.proj",
294 "visual.blocks.18.attn.qkv_proj",
295 "visual.blocks.18.mlp.linear_fc1",
296 "visual.blocks.18.mlp.linear_fc2",
297 "model.visual.blocks.19.attn.proj",
298 "model.visual.blocks.19.attn.qkv",
299 "model.visual.blocks.19.mlp.linear_fc1",
300 "model.visual.blocks.19.mlp.linear_fc2",
301 "visual.blocks.19.attn.proj",
302 "visual.blocks.19.attn.qkv_proj",
303 "visual.blocks.19.mlp.linear_fc1",
304 "visual.blocks.19.mlp.linear_fc2",
305 "model.visual.blocks.20.attn.proj",
306 "model.visual.blocks.20.attn.qkv",
307 "model.visual.blocks.20.mlp.linear_fc1",
308 "model.visual.blocks.20.mlp.linear_fc2",
309 "visual.blocks.20.attn.proj",
310 "visual.blocks.20.attn.qkv_proj",
311 "visual.blocks.20.mlp.linear_fc1",
312 "visual.blocks.20.mlp.linear_fc2",
313 "model.visual.blocks.21.attn.proj",
314 "model.visual.blocks.21.attn.qkv",
315 "model.visual.blocks.21.mlp.linear_fc1",
316 "model.visual.blocks.21.mlp.linear_fc2",
317 "visual.blocks.21.attn.proj",
318 "visual.blocks.21.attn.qkv_proj",
319 "visual.blocks.21.mlp.linear_fc1",
320 "visual.blocks.21.mlp.linear_fc2",
321 "model.visual.blocks.22.attn.proj",
322 "model.visual.blocks.22.attn.qkv",
323 "model.visual.blocks.22.mlp.linear_fc1",
324 "model.visual.blocks.22.mlp.linear_fc2",
325 "visual.blocks.22.attn.proj",
326 "visual.blocks.22.attn.qkv_proj",
327 "visual.blocks.22.mlp.linear_fc1",
328 "visual.blocks.22.mlp.linear_fc2",
329 "model.visual.blocks.23.attn.proj",
330 "model.visual.blocks.23.attn.qkv",
331 "model.visual.blocks.23.mlp.linear_fc1",
332 "model.visual.blocks.23.mlp.linear_fc2",
333 "visual.blocks.23.attn.proj",
334 "visual.blocks.23.attn.qkv_proj",
335 "visual.blocks.23.mlp.linear_fc1",
336 "visual.blocks.23.mlp.linear_fc2",
337 "model.visual.blocks.24.attn.proj",
338 "model.visual.blocks.24.attn.qkv",
339 "model.visual.blocks.24.mlp.linear_fc1",
340 "model.visual.blocks.24.mlp.linear_fc2",
341 "visual.blocks.24.attn.proj",
342 "visual.blocks.24.attn.qkv_proj",
343 "visual.blocks.24.mlp.linear_fc1",
344 "visual.blocks.24.mlp.linear_fc2",
345 "model.visual.blocks.25.attn.proj",
346 "model.visual.blocks.25.attn.qkv",
347 "model.visual.blocks.25.mlp.linear_fc1",
348 "model.visual.blocks.25.mlp.linear_fc2",
349 "visual.blocks.25.attn.proj",
350 "visual.blocks.25.attn.qkv_proj",
351 "visual.blocks.25.mlp.linear_fc1",
352 "visual.blocks.25.mlp.linear_fc2",
353 "model.visual.blocks.26.attn.proj",
354 "model.visual.blocks.26.attn.qkv",
355 "model.visual.blocks.26.mlp.linear_fc1",
356 "model.visual.blocks.26.mlp.linear_fc2",
357 "visual.blocks.26.attn.proj",
358 "visual.blocks.26.attn.qkv_proj",
359 "visual.blocks.26.mlp.linear_fc1",
360 "visual.blocks.26.mlp.linear_fc2",
361 "model.visual.deepstack_merger_list.0.linear_fc1",
362 "model.visual.deepstack_merger_list.0.linear_fc2",
363 "model.visual.deepstack_merger_list.0.norm",
364 "visual.deepstack_merger_list.0.linear_fc1",
365 "visual.deepstack_merger_list.0.linear_fc2",
366 "visual.deepstack_merger_list.0.norm",
367 "model.visual.deepstack_merger_list.1.linear_fc1",
368 "model.visual.deepstack_merger_list.1.linear_fc2",
369 "model.visual.deepstack_merger_list.1.norm",
370 "visual.deepstack_merger_list.1.linear_fc1",
371 "visual.deepstack_merger_list.1.linear_fc2",
372 "visual.deepstack_merger_list.1.norm",
373 "model.visual.deepstack_merger_list.2.linear_fc1",
374 "model.visual.deepstack_merger_list.2.linear_fc2",
375 "model.visual.deepstack_merger_list.2.norm",
376 "visual.deepstack_merger_list.2.linear_fc1",
377 "visual.deepstack_merger_list.2.linear_fc2",
378 "visual.deepstack_merger_list.2.norm",
379 "model.visual.merger.linear_fc1",
380 "model.visual.merger.linear_fc2",
381 "model.visual.merger.norm",
382 "model.visual.patch_embed.proj",
383 "model.visual.pos_embed",
384 "visual.merger.linear_fc1",
385 "visual.merger.linear_fc2",
386 "visual.merger.norm",
387 "visual.patch_embed.proj",
388 "visual.pos_embed",
389 "visual",
390 "model.visual",
391 "lm_head",
392 "model.embed_tokens",
393 "model.language_model.layers.0.input_layernorm",
394 "model.language_model.layers.0.mlp.shared_expert_gate",
395 "model.language_model.layers.0.post_attention_layernorm",
396 "model.language_model.layers.0.mlp.gate",
397 "model.language_model.layers.0.linear_attn.A_log",
398 "model.language_model.layers.0.linear_attn.conv1d",
399 "model.language_model.layers.0.linear_attn.dt_bias",
400 "model.language_model.layers.0.linear_attn.in_proj_ba",
401 "model.language_model.layers.0.linear_attn.in_proj_b",
402 "model.language_model.layers.0.linear_attn.in_proj_a",
403 "model.language_model.layers.0.linear_attn.norm",
404 "model.language_model.layers.1.input_layernorm",
405 "model.language_model.layers.1.mlp.shared_expert_gate",
406 "model.language_model.layers.1.post_attention_layernorm",
407 "model.language_model.layers.1.mlp.gate",
408 "model.language_model.layers.1.linear_attn.A_log",
409 "model.language_model.layers.1.linear_attn.conv1d",
410 "model.language_model.layers.1.linear_attn.dt_bias",
411 "model.language_model.layers.1.linear_attn.in_proj_ba",
412 "model.language_model.layers.1.linear_attn.in_proj_b",
413 "model.language_model.layers.1.linear_attn.in_proj_a",
414 "model.language_model.layers.1.linear_attn.norm",
415 "model.language_model.layers.2.input_layernorm",
416 "model.language_model.layers.2.mlp.shared_expert_gate",
417 "model.language_model.layers.2.post_attention_layernorm",
418 "model.language_model.layers.2.mlp.gate",
419 "model.language_model.layers.2.linear_attn.A_log",
420 "model.language_model.layers.2.linear_attn.conv1d",
421 "model.language_model.layers.2.linear_attn.dt_bias",
422 "model.language_model.layers.2.linear_attn.in_proj_ba",
423 "model.language_model.layers.2.linear_attn.in_proj_b",
424 "model.language_model.layers.2.linear_attn.in_proj_a",
425 "model.language_model.layers.2.linear_attn.norm",
426 "model.language_model.layers.3.input_layernorm",
427 "model.language_model.layers.3.mlp.shared_expert_gate",
428 "model.language_model.layers.3.post_attention_layernorm",
429 "model.language_model.layers.3.mlp.gate",
430 "model.language_model.layers.3.self_attn.k_norm",
431 "model.language_model.layers.3.self_attn.q_norm",
432 "model.language_model.layers.4.input_layernorm",
433 "model.language_model.layers.4.mlp.shared_expert_gate",
434 "model.language_model.layers.4.post_attention_layernorm",
435 "model.language_model.layers.4.mlp.gate",
436 "model.language_model.layers.4.linear_attn.A_log",
437 "model.language_model.layers.4.linear_attn.conv1d",
438 "model.language_model.layers.4.linear_attn.dt_bias",
439 "model.language_model.layers.4.linear_attn.in_proj_ba",
440 "model.language_model.layers.4.linear_attn.in_proj_b",
441 "model.language_model.layers.4.linear_attn.in_proj_a",
442 "model.language_model.layers.4.linear_attn.norm",
443 "model.language_model.layers.5.input_layernorm",
444 "model.language_model.layers.5.mlp.shared_expert_gate",
445 "model.language_model.layers.5.post_attention_layernorm",
446 "model.language_model.layers.5.mlp.gate",
447 "model.language_model.layers.5.linear_attn.A_log",
448 "model.language_model.layers.5.linear_attn.conv1d",
449 "model.language_model.layers.5.linear_attn.dt_bias",
450 "model.language_model.layers.5.linear_attn.in_proj_ba",
451 "model.language_model.layers.5.linear_attn.in_proj_b",
452 "model.language_model.layers.5.linear_attn.in_proj_a",
453 "model.language_model.layers.5.linear_attn.norm",
454 "model.language_model.layers.6.input_layernorm",
455 "model.language_model.layers.6.mlp.shared_expert_gate",
456 "model.language_model.layers.6.post_attention_layernorm",
457 "model.language_model.layers.6.mlp.gate",
458 "model.language_model.layers.6.linear_attn.A_log",
459 "model.language_model.layers.6.linear_attn.conv1d",
460 "model.language_model.layers.6.linear_attn.dt_bias",
461 "model.language_model.layers.6.linear_attn.in_proj_ba",
462 "model.language_model.layers.6.linear_attn.in_proj_b",
463 "model.language_model.layers.6.linear_attn.in_proj_a",
464 "model.language_model.layers.6.linear_attn.norm",
465 "model.language_model.layers.7.input_layernorm",
466 "model.language_model.layers.7.mlp.shared_expert_gate",
467 "model.language_model.layers.7.post_attention_layernorm",
468 "model.language_model.layers.7.mlp.gate",
469 "model.language_model.layers.7.self_attn.k_norm",
470 "model.language_model.layers.7.self_attn.q_norm",
471 "model.language_model.layers.8.input_layernorm",
472 "model.language_model.layers.8.mlp.shared_expert_gate",
473 "model.language_model.layers.8.post_attention_layernorm",
474 "model.language_model.layers.8.mlp.gate",
475 "model.language_model.layers.8.linear_attn.A_log",
476 "model.language_model.layers.8.linear_attn.conv1d",
477 "model.language_model.layers.8.linear_attn.dt_bias",
478 "model.language_model.layers.8.linear_attn.in_proj_ba",
479 "model.language_model.layers.8.linear_attn.in_proj_b",
480 "model.language_model.layers.8.linear_attn.in_proj_a",
481 "model.language_model.layers.8.linear_attn.norm",
482 "model.language_model.layers.9.input_layernorm",
483 "model.language_model.layers.9.mlp.shared_expert_gate",
484 "model.language_model.layers.9.post_attention_layernorm",
485 "model.language_model.layers.9.mlp.gate",
486 "model.language_model.layers.9.linear_attn.A_log",
487 "model.language_model.layers.9.linear_attn.conv1d",
488 "model.language_model.layers.9.linear_attn.dt_bias",
489 "model.language_model.layers.9.linear_attn.in_proj_ba",
490 "model.language_model.layers.9.linear_attn.in_proj_b",
491 "model.language_model.layers.9.linear_attn.in_proj_a",
492 "model.language_model.layers.9.linear_attn.norm",
493 "model.language_model.layers.10.input_layernorm",
494 "model.language_model.layers.10.mlp.shared_expert_gate",
495 "model.language_model.layers.10.post_attention_layernorm",
496 "model.language_model.layers.10.mlp.gate",
497 "model.language_model.layers.10.linear_attn.A_log",
498 "model.language_model.layers.10.linear_attn.conv1d",
499 "model.language_model.layers.10.linear_attn.dt_bias",
500 "model.language_model.layers.10.linear_attn.in_proj_ba",
501 "model.language_model.layers.10.linear_attn.in_proj_b",
502 "model.language_model.layers.10.linear_attn.in_proj_a",
503 "model.language_model.layers.10.linear_attn.norm",
504 "model.language_model.layers.11.input_layernorm",
505 "model.language_model.layers.11.mlp.shared_expert_gate",
506 "model.language_model.layers.11.post_attention_layernorm",
507 "model.language_model.layers.11.mlp.gate",
508 "model.language_model.layers.11.self_attn.k_norm",
509 "model.language_model.layers.11.self_attn.q_norm",
510 "model.language_model.layers.12.input_layernorm",
511 "model.language_model.layers.12.mlp.shared_expert_gate",
512 "model.language_model.layers.12.post_attention_layernorm",
513 "model.language_model.layers.12.mlp.gate",
514 "model.language_model.layers.12.linear_attn.A_log",
515 "model.language_model.layers.12.linear_attn.conv1d",
516 "model.language_model.layers.12.linear_attn.dt_bias",
517 "model.language_model.layers.12.linear_attn.in_proj_ba",
518 "model.language_model.layers.12.linear_attn.in_proj_b",
519 "model.language_model.layers.12.linear_attn.in_proj_a",
520 "model.language_model.layers.12.linear_attn.norm",
521 "model.language_model.layers.13.input_layernorm",
522 "model.language_model.layers.13.mlp.shared_expert_gate",
523 "model.language_model.layers.13.post_attention_layernorm",
524 "model.language_model.layers.13.mlp.gate",
525 "model.language_model.layers.13.linear_attn.A_log",
526 "model.language_model.layers.13.linear_attn.conv1d",
527 "model.language_model.layers.13.linear_attn.dt_bias",
528 "model.language_model.layers.13.linear_attn.in_proj_ba",
529 "model.language_model.layers.13.linear_attn.in_proj_b",
530 "model.language_model.layers.13.linear_attn.in_proj_a",
531 "model.language_model.layers.13.linear_attn.norm",
532 "model.language_model.layers.14.input_layernorm",
533 "model.language_model.layers.14.mlp.shared_expert_gate",
534 "model.language_model.layers.14.post_attention_layernorm",
535 "model.language_model.layers.14.mlp.gate",
536 "model.language_model.layers.14.linear_attn.A_log",
537 "model.language_model.layers.14.linear_attn.conv1d",
538 "model.language_model.layers.14.linear_attn.dt_bias",
539 "model.language_model.layers.14.linear_attn.in_proj_ba",
540 "model.language_model.layers.14.linear_attn.in_proj_b",
541 "model.language_model.layers.14.linear_attn.in_proj_a",
542 "model.language_model.layers.14.linear_attn.norm",
543 "model.language_model.layers.15.input_layernorm",
544 "model.language_model.layers.15.mlp.shared_expert_gate",
545 "model.language_model.layers.15.post_attention_layernorm",
546 "model.language_model.layers.15.mlp.gate",
547 "model.language_model.layers.15.self_attn.k_norm",
548 "model.language_model.layers.15.self_attn.q_norm",
549 "model.language_model.layers.16.input_layernorm",
550 "model.language_model.layers.16.mlp.shared_expert_gate",
551 "model.language_model.layers.16.post_attention_layernorm",
552 "model.language_model.layers.16.mlp.gate",
553 "model.language_model.layers.16.linear_attn.A_log",
554 "model.language_model.layers.16.linear_attn.conv1d",
555 "model.language_model.layers.16.linear_attn.dt_bias",
556 "model.language_model.layers.16.linear_attn.in_proj_ba",
557 "model.language_model.layers.16.linear_attn.in_proj_b",
558 "model.language_model.layers.16.linear_attn.in_proj_a",
559 "model.language_model.layers.16.linear_attn.norm",
560 "model.language_model.layers.17.input_layernorm",
561 "model.language_model.layers.17.mlp.shared_expert_gate",
562 "model.language_model.layers.17.post_attention_layernorm",
563 "model.language_model.layers.17.mlp.gate",
564 "model.language_model.layers.17.linear_attn.A_log",
565 "model.language_model.layers.17.linear_attn.conv1d",
566 "model.language_model.layers.17.linear_attn.dt_bias",
567 "model.language_model.layers.17.linear_attn.in_proj_ba",
568 "model.language_model.layers.17.linear_attn.in_proj_b",
569 "model.language_model.layers.17.linear_attn.in_proj_a",
570 "model.language_model.layers.17.linear_attn.norm",
571 "model.language_model.layers.18.input_layernorm",
572 "model.language_model.layers.18.mlp.shared_expert_gate",
573 "model.language_model.layers.18.post_attention_layernorm",
574 "model.language_model.layers.18.mlp.gate",
575 "model.language_model.layers.18.linear_attn.A_log",
576 "model.language_model.layers.18.linear_attn.conv1d",
577 "model.language_model.layers.18.linear_attn.dt_bias",
578 "model.language_model.layers.18.linear_attn.in_proj_ba",
579 "model.language_model.layers.18.linear_attn.in_proj_b",
580 "model.language_model.layers.18.linear_attn.in_proj_a",
581 "model.language_model.layers.18.linear_attn.norm",
582 "model.language_model.layers.19.input_layernorm",
583 "model.language_model.layers.19.mlp.shared_expert_gate",
584 "model.language_model.layers.19.post_attention_layernorm",
585 "model.language_model.layers.19.mlp.gate",
586 "model.language_model.layers.19.self_attn.k_norm",
587 "model.language_model.layers.19.self_attn.q_norm",
588 "model.language_model.layers.20.input_layernorm",
589 "model.language_model.layers.20.mlp.shared_expert_gate",
590 "model.language_model.layers.20.post_attention_layernorm",
591 "model.language_model.layers.20.mlp.gate",
592 "model.language_model.layers.20.linear_attn.A_log",
593 "model.language_model.layers.20.linear_attn.conv1d",
594 "model.language_model.layers.20.linear_attn.dt_bias",
595 "model.language_model.layers.20.linear_attn.in_proj_ba",
596 "model.language_model.layers.20.linear_attn.in_proj_b",
597 "model.language_model.layers.20.linear_attn.in_proj_a",
598 "model.language_model.layers.20.linear_attn.norm",
599 "model.language_model.layers.21.input_layernorm",
600 "model.language_model.layers.21.mlp.shared_expert_gate",
601 "model.language_model.layers.21.post_attention_layernorm",
602 "model.language_model.layers.21.mlp.gate",
603 "model.language_model.layers.21.linear_attn.A_log",
604 "model.language_model.layers.21.linear_attn.conv1d",
605 "model.language_model.layers.21.linear_attn.dt_bias",
606 "model.language_model.layers.21.linear_attn.in_proj_ba",
607 "model.language_model.layers.21.linear_attn.in_proj_b",
608 "model.language_model.layers.21.linear_attn.in_proj_a",
609 "model.language_model.layers.21.linear_attn.norm",
610 "model.language_model.layers.22.input_layernorm",
611 "model.language_model.layers.22.mlp.shared_expert_gate",
612 "model.language_model.layers.22.post_attention_layernorm",
613 "model.language_model.layers.22.mlp.gate",
614 "model.language_model.layers.22.linear_attn.A_log",
615 "model.language_model.layers.22.linear_attn.conv1d",
616 "model.language_model.layers.22.linear_attn.dt_bias",
617 "model.language_model.layers.22.linear_attn.in_proj_ba",
618 "model.language_model.layers.22.linear_attn.in_proj_b",
619 "model.language_model.layers.22.linear_attn.in_proj_a",
620 "model.language_model.layers.22.linear_attn.norm",
621 "model.language_model.layers.23.input_layernorm",
622 "model.language_model.layers.23.mlp.shared_expert_gate",
623 "model.language_model.layers.23.post_attention_layernorm",
624 "model.language_model.layers.23.mlp.gate",
625 "model.language_model.layers.23.self_attn.k_norm",
626 "model.language_model.layers.23.self_attn.q_norm",
627 "model.language_model.layers.24.input_layernorm",
628 "model.language_model.layers.24.mlp.shared_expert_gate",
629 "model.language_model.layers.24.post_attention_layernorm",
630 "model.language_model.layers.24.mlp.gate",
631 "model.language_model.layers.24.linear_attn.A_log",
632 "model.language_model.layers.24.linear_attn.conv1d",
633 "model.language_model.layers.24.linear_attn.dt_bias",
634 "model.language_model.layers.24.linear_attn.in_proj_ba",
635 "model.language_model.layers.24.linear_attn.in_proj_b",
636 "model.language_model.layers.24.linear_attn.in_proj_a",
637 "model.language_model.layers.24.linear_attn.norm",
638 "model.language_model.layers.25.input_layernorm",
639 "model.language_model.layers.25.mlp.shared_expert_gate",
640 "model.language_model.layers.25.post_attention_layernorm",
641 "model.language_model.layers.25.mlp.gate",
642 "model.language_model.layers.25.linear_attn.A_log",
643 "model.language_model.layers.25.linear_attn.conv1d",
644 "model.language_model.layers.25.linear_attn.dt_bias",
645 "model.language_model.layers.25.linear_attn.in_proj_ba",
646 "model.language_model.layers.25.linear_attn.in_proj_b",
647 "model.language_model.layers.25.linear_attn.in_proj_a",
648 "model.language_model.layers.25.linear_attn.norm",
649 "model.language_model.layers.26.input_layernorm",
650 "model.language_model.layers.26.mlp.shared_expert_gate",
651 "model.language_model.layers.26.post_attention_layernorm",
652 "model.language_model.layers.26.mlp.gate",
653 "model.language_model.layers.26.linear_attn.A_log",
654 "model.language_model.layers.26.linear_attn.conv1d",
655 "model.language_model.layers.26.linear_attn.dt_bias",
656 "model.language_model.layers.26.linear_attn.in_proj_ba",
657 "model.language_model.layers.26.linear_attn.in_proj_b",
658 "model.language_model.layers.26.linear_attn.in_proj_a",
659 "model.language_model.layers.26.linear_attn.norm",
660 "model.language_model.layers.27.input_layernorm",
661 "model.language_model.layers.27.mlp.shared_expert_gate",
662 "model.language_model.layers.27.post_attention_layernorm",
663 "model.language_model.layers.27.mlp.gate",
664 "model.language_model.layers.27.self_attn.k_norm",
665 "model.language_model.layers.27.self_attn.q_norm",
666 "model.language_model.layers.28.input_layernorm",
667 "model.language_model.layers.28.mlp.shared_expert_gate",
668 "model.language_model.layers.28.post_attention_layernorm",
669 "model.language_model.layers.28.mlp.gate",
670 "model.language_model.layers.28.linear_attn.A_log",
671 "model.language_model.layers.28.linear_attn.conv1d",
672 "model.language_model.layers.28.linear_attn.dt_bias",
673 "model.language_model.layers.28.linear_attn.in_proj_ba",
674 "model.language_model.layers.28.linear_attn.in_proj_b",
675 "model.language_model.layers.28.linear_attn.in_proj_a",
676 "model.language_model.layers.28.linear_attn.norm",
677 "model.language_model.layers.29.input_layernorm",
678 "model.language_model.layers.29.mlp.shared_expert_gate",
679 "model.language_model.layers.29.post_attention_layernorm",
680 "model.language_model.layers.29.mlp.gate",
681 "model.language_model.layers.29.linear_attn.A_log",
682 "model.language_model.layers.29.linear_attn.conv1d",
683 "model.language_model.layers.29.linear_attn.dt_bias",
684 "model.language_model.layers.29.linear_attn.in_proj_ba",
685 "model.language_model.layers.29.linear_attn.in_proj_b",
686 "model.language_model.layers.29.linear_attn.in_proj_a",
687 "model.language_model.layers.29.linear_attn.norm",
688 "model.language_model.layers.30.input_layernorm",
689 "model.language_model.layers.30.mlp.shared_expert_gate",
690 "model.language_model.layers.30.post_attention_layernorm",
691 "model.language_model.layers.30.mlp.gate",
692 "model.language_model.layers.30.linear_attn.A_log",
693 "model.language_model.layers.30.linear_attn.conv1d",
694 "model.language_model.layers.30.linear_attn.dt_bias",
695 "model.language_model.layers.30.linear_attn.in_proj_ba",
696 "model.language_model.layers.30.linear_attn.in_proj_b",
697 "model.language_model.layers.30.linear_attn.in_proj_a",
698 "model.language_model.layers.30.linear_attn.norm",
699 "model.language_model.layers.31.input_layernorm",
700 "model.language_model.layers.31.mlp.shared_expert_gate",
701 "model.language_model.layers.31.post_attention_layernorm",
702 "model.language_model.layers.31.mlp.gate",
703 "model.language_model.layers.31.self_attn.k_norm",
704 "model.language_model.layers.31.self_attn.q_norm",
705 "model.language_model.layers.32.input_layernorm",
706 "model.language_model.layers.32.mlp.shared_expert_gate",
707 "model.language_model.layers.32.post_attention_layernorm",
708 "model.language_model.layers.32.mlp.gate",
709 "model.language_model.layers.32.linear_attn.A_log",
710 "model.language_model.layers.32.linear_attn.conv1d",
711 "model.language_model.layers.32.linear_attn.dt_bias",
712 "model.language_model.layers.32.linear_attn.in_proj_ba",
713 "model.language_model.layers.32.linear_attn.in_proj_b",
714 "model.language_model.layers.32.linear_attn.in_proj_a",
715 "model.language_model.layers.32.linear_attn.norm",
716 "model.language_model.layers.33.input_layernorm",
717 "model.language_model.layers.33.mlp.shared_expert_gate",
718 "model.language_model.layers.33.post_attention_layernorm",
719 "model.language_model.layers.33.mlp.gate",
720 "model.language_model.layers.33.linear_attn.A_log",
721 "model.language_model.layers.33.linear_attn.conv1d",
722 "model.language_model.layers.33.linear_attn.dt_bias",
723 "model.language_model.layers.33.linear_attn.in_proj_ba",
724 "model.language_model.layers.33.linear_attn.in_proj_b",
725 "model.language_model.layers.33.linear_attn.in_proj_a",
726 "model.language_model.layers.33.linear_attn.norm",
727 "model.language_model.layers.34.input_layernorm",
728 "model.language_model.layers.34.mlp.shared_expert_gate",
729 "model.language_model.layers.34.post_attention_layernorm",
730 "model.language_model.layers.34.mlp.gate",
731 "model.language_model.layers.34.linear_attn.A_log",
732 "model.language_model.layers.34.linear_attn.conv1d",
733 "model.language_model.layers.34.linear_attn.dt_bias",
734 "model.language_model.layers.34.linear_attn.in_proj_ba",
735 "model.language_model.layers.34.linear_attn.in_proj_b",
736 "model.language_model.layers.34.linear_attn.in_proj_a",
737 "model.language_model.layers.34.linear_attn.norm",
738 "model.language_model.layers.35.input_layernorm",
739 "model.language_model.layers.35.mlp.shared_expert_gate",
740 "model.language_model.layers.35.post_attention_layernorm",
741 "model.language_model.layers.35.mlp.gate",
742 "model.language_model.layers.35.self_attn.k_norm",
743 "model.language_model.layers.35.self_attn.q_norm",
744 "model.language_model.layers.36.input_layernorm",
745 "model.language_model.layers.36.mlp.shared_expert_gate",
746 "model.language_model.layers.36.post_attention_layernorm",
747 "model.language_model.layers.36.mlp.gate",
748 "model.language_model.layers.36.linear_attn.A_log",
749 "model.language_model.layers.36.linear_attn.conv1d",
750 "model.language_model.layers.36.linear_attn.dt_bias",
751 "model.language_model.layers.36.linear_attn.in_proj_ba",
752 "model.language_model.layers.36.linear_attn.in_proj_b",
753 "model.language_model.layers.36.linear_attn.in_proj_a",
754 "model.language_model.layers.36.linear_attn.norm",
755 "model.language_model.layers.37.input_layernorm",
756 "model.language_model.layers.37.mlp.shared_expert_gate",
757 "model.language_model.layers.37.post_attention_layernorm",
758 "model.language_model.layers.37.mlp.gate",
759 "model.language_model.layers.37.linear_attn.A_log",
760 "model.language_model.layers.37.linear_attn.conv1d",
761 "model.language_model.layers.37.linear_attn.dt_bias",
762 "model.language_model.layers.37.linear_attn.in_proj_ba",
763 "model.language_model.layers.37.linear_attn.in_proj_b",
764 "model.language_model.layers.37.linear_attn.in_proj_a",
765 "model.language_model.layers.37.linear_attn.norm",
766 "model.language_model.layers.38.input_layernorm",
767 "model.language_model.layers.38.mlp.shared_expert_gate",
768 "model.language_model.layers.38.post_attention_layernorm",
769 "model.language_model.layers.38.mlp.gate",
770 "model.language_model.layers.38.linear_attn.A_log",
771 "model.language_model.layers.38.linear_attn.conv1d",
772 "model.language_model.layers.38.linear_attn.dt_bias",
773 "model.language_model.layers.38.linear_attn.in_proj_ba",
774 "model.language_model.layers.38.linear_attn.in_proj_b",
775 "model.language_model.layers.38.linear_attn.in_proj_a",
776 "model.language_model.layers.38.linear_attn.norm",
777 "model.language_model.layers.39.input_layernorm",
778 "model.language_model.layers.39.mlp.shared_expert_gate",
779 "model.language_model.layers.39.post_attention_layernorm",
780 "model.language_model.layers.39.mlp.gate",
781 "model.language_model.layers.39.self_attn.k_norm",
782 "model.language_model.layers.39.self_attn.q_norm",
783 "model.language_model.layers.40.input_layernorm",
784 "model.language_model.layers.40.mlp.shared_expert_gate",
785 "model.language_model.layers.40.post_attention_layernorm",
786 "model.language_model.layers.40.mlp.gate",
787 "model.language_model.layers.40.linear_attn.A_log",
788 "model.language_model.layers.40.linear_attn.conv1d",
789 "model.language_model.layers.40.linear_attn.dt_bias",
790 "model.language_model.layers.40.linear_attn.in_proj_ba",
791 "model.language_model.layers.40.linear_attn.in_proj_b",
792 "model.language_model.layers.40.linear_attn.in_proj_a",
793 "model.language_model.layers.40.linear_attn.norm",
794 "model.language_model.layers.41.input_layernorm",
795 "model.language_model.layers.41.mlp.shared_expert_gate",
796 "model.language_model.layers.41.post_attention_layernorm",
797 "model.language_model.layers.41.mlp.gate",
798 "model.language_model.layers.41.linear_attn.A_log",
799 "model.language_model.layers.41.linear_attn.conv1d",
800 "model.language_model.layers.41.linear_attn.dt_bias",
801 "model.language_model.layers.41.linear_attn.in_proj_ba",
802 "model.language_model.layers.41.linear_attn.in_proj_b",
803 "model.language_model.layers.41.linear_attn.in_proj_a",
804 "model.language_model.layers.41.linear_attn.norm",
805 "model.language_model.layers.42.input_layernorm",
806 "model.language_model.layers.42.mlp.shared_expert_gate",
807 "model.language_model.layers.42.post_attention_layernorm",
808 "model.language_model.layers.42.mlp.gate",
809 "model.language_model.layers.42.linear_attn.A_log",
810 "model.language_model.layers.42.linear_attn.conv1d",
811 "model.language_model.layers.42.linear_attn.dt_bias",
812 "model.language_model.layers.42.linear_attn.in_proj_ba",
813 "model.language_model.layers.42.linear_attn.in_proj_b",
814 "model.language_model.layers.42.linear_attn.in_proj_a",
815 "model.language_model.layers.42.linear_attn.norm",
816 "model.language_model.layers.43.input_layernorm",
817 "model.language_model.layers.43.mlp.shared_expert_gate",
818 "model.language_model.layers.43.post_attention_layernorm",
819 "model.language_model.layers.43.mlp.gate",
820 "model.language_model.layers.43.self_attn.k_norm",
821 "model.language_model.layers.43.self_attn.q_norm",
822 "model.language_model.layers.44.input_layernorm",
823 "model.language_model.layers.44.mlp.shared_expert_gate",
824 "model.language_model.layers.44.post_attention_layernorm",
825 "model.language_model.layers.44.mlp.gate",
826 "model.language_model.layers.44.linear_attn.A_log",
827 "model.language_model.layers.44.linear_attn.conv1d",
828 "model.language_model.layers.44.linear_attn.dt_bias",
829 "model.language_model.layers.44.linear_attn.in_proj_ba",
830 "model.language_model.layers.44.linear_attn.in_proj_b",
831 "model.language_model.layers.44.linear_attn.in_proj_a",
832 "model.language_model.layers.44.linear_attn.norm",
833 "model.language_model.layers.45.input_layernorm",
834 "model.language_model.layers.45.mlp.shared_expert_gate",
835 "model.language_model.layers.45.post_attention_layernorm",
836 "model.language_model.layers.45.mlp.gate",
837 "model.language_model.layers.45.linear_attn.A_log",
838 "model.language_model.layers.45.linear_attn.conv1d",
839 "model.language_model.layers.45.linear_attn.dt_bias",
840 "model.language_model.layers.45.linear_attn.in_proj_ba",
841 "model.language_model.layers.45.linear_attn.in_proj_b",
842 "model.language_model.layers.45.linear_attn.in_proj_a",
843 "model.language_model.layers.45.linear_attn.norm",
844 "model.language_model.layers.46.input_layernorm",
845 "model.language_model.layers.46.mlp.shared_expert_gate",
846 "model.language_model.layers.46.post_attention_layernorm",
847 "model.language_model.layers.46.mlp.gate",
848 "model.language_model.layers.46.linear_attn.A_log",
849 "model.language_model.layers.46.linear_attn.conv1d",
850 "model.language_model.layers.46.linear_attn.dt_bias",
851 "model.language_model.layers.46.linear_attn.in_proj_ba",
852 "model.language_model.layers.46.linear_attn.in_proj_b",
853 "model.language_model.layers.46.linear_attn.in_proj_a",
854 "model.language_model.layers.46.linear_attn.norm",
855 "model.language_model.layers.47.input_layernorm",
856 "model.language_model.layers.47.mlp.shared_expert_gate",
857 "model.language_model.layers.47.post_attention_layernorm",
858 "model.language_model.layers.47.mlp.gate",
859 "model.language_model.layers.47.self_attn.k_norm",
860 "model.language_model.layers.47.self_attn.q_norm",
861 "model.language_model.layers.48.input_layernorm",
862 "model.language_model.layers.48.mlp.shared_expert_gate",
863 "model.language_model.layers.48.post_attention_layernorm",
864 "model.language_model.layers.48.mlp.gate",
865 "model.language_model.layers.48.linear_attn.A_log",
866 "model.language_model.layers.48.linear_attn.conv1d",
867 "model.language_model.layers.48.linear_attn.dt_bias",
868 "model.language_model.layers.48.linear_attn.in_proj_ba",
869 "model.language_model.layers.48.linear_attn.in_proj_b",
870 "model.language_model.layers.48.linear_attn.in_proj_a",
871 "model.language_model.layers.48.linear_attn.norm",
872 "model.language_model.layers.49.input_layernorm",
873 "model.language_model.layers.49.mlp.shared_expert_gate",
874 "model.language_model.layers.49.post_attention_layernorm",
875 "model.language_model.layers.49.mlp.gate",
876 "model.language_model.layers.49.linear_attn.A_log",
877 "model.language_model.layers.49.linear_attn.conv1d",
878 "model.language_model.layers.49.linear_attn.dt_bias",
879 "model.language_model.layers.49.linear_attn.in_proj_ba",
880 "model.language_model.layers.49.linear_attn.in_proj_b",
881 "model.language_model.layers.49.linear_attn.in_proj_a",
882 "model.language_model.layers.49.linear_attn.norm",
883 "model.language_model.layers.50.input_layernorm",
884 "model.language_model.layers.50.mlp.shared_expert_gate",
885 "model.language_model.layers.50.post_attention_layernorm",
886 "model.language_model.layers.50.mlp.gate",
887 "model.language_model.layers.50.linear_attn.A_log",
888 "model.language_model.layers.50.linear_attn.conv1d",
889 "model.language_model.layers.50.linear_attn.dt_bias",
890 "model.language_model.layers.50.linear_attn.in_proj_ba",
891 "model.language_model.layers.50.linear_attn.in_proj_b",
892 "model.language_model.layers.50.linear_attn.in_proj_a",
893 "model.language_model.layers.50.linear_attn.norm",
894 "model.language_model.layers.51.input_layernorm",
895 "model.language_model.layers.51.mlp.shared_expert_gate",
896 "model.language_model.layers.51.post_attention_layernorm",
897 "model.language_model.layers.51.mlp.gate",
898 "model.language_model.layers.51.self_attn.k_norm",
899 "model.language_model.layers.51.self_attn.q_norm",
900 "model.language_model.layers.52.input_layernorm",
901 "model.language_model.layers.52.mlp.shared_expert_gate",
902 "model.language_model.layers.52.post_attention_layernorm",
903 "model.language_model.layers.52.mlp.gate",
904 "model.language_model.layers.52.linear_attn.A_log",
905 "model.language_model.layers.52.linear_attn.conv1d",
906 "model.language_model.layers.52.linear_attn.dt_bias",
907 "model.language_model.layers.52.linear_attn.in_proj_ba",
908 "model.language_model.layers.52.linear_attn.in_proj_b",
909 "model.language_model.layers.52.linear_attn.in_proj_a",
910 "model.language_model.layers.52.linear_attn.norm",
911 "model.language_model.layers.53.input_layernorm",
912 "model.language_model.layers.53.mlp.shared_expert_gate",
913 "model.language_model.layers.53.post_attention_layernorm",
914 "model.language_model.layers.53.mlp.gate",
915 "model.language_model.layers.53.linear_attn.A_log",
916 "model.language_model.layers.53.linear_attn.conv1d",
917 "model.language_model.layers.53.linear_attn.dt_bias",
918 "model.language_model.layers.53.linear_attn.in_proj_ba",
919 "model.language_model.layers.53.linear_attn.in_proj_b",
920 "model.language_model.layers.53.linear_attn.in_proj_a",
921 "model.language_model.layers.53.linear_attn.norm",
922 "model.language_model.layers.54.input_layernorm",
923 "model.language_model.layers.54.mlp.shared_expert_gate",
924 "model.language_model.layers.54.post_attention_layernorm",
925 "model.language_model.layers.54.mlp.gate",
926 "model.language_model.layers.54.linear_attn.A_log",
927 "model.language_model.layers.54.linear_attn.conv1d",
928 "model.language_model.layers.54.linear_attn.dt_bias",
929 "model.language_model.layers.54.linear_attn.in_proj_ba",
930 "model.language_model.layers.54.linear_attn.in_proj_b",
931 "model.language_model.layers.54.linear_attn.in_proj_a",
932 "model.language_model.layers.54.linear_attn.norm",
933 "model.language_model.layers.55.input_layernorm",
934 "model.language_model.layers.55.mlp.shared_expert_gate",
935 "model.language_model.layers.55.post_attention_layernorm",
936 "model.language_model.layers.55.mlp.gate",
937 "model.language_model.layers.55.self_attn.k_norm",
938 "model.language_model.layers.55.self_attn.q_norm",
939 "model.language_model.layers.56.input_layernorm",
940 "model.language_model.layers.56.mlp.shared_expert_gate",
941 "model.language_model.layers.56.post_attention_layernorm",
942 "model.language_model.layers.56.mlp.gate",
943 "model.language_model.layers.56.linear_attn.A_log",
944 "model.language_model.layers.56.linear_attn.conv1d",
945 "model.language_model.layers.56.linear_attn.dt_bias",
946 "model.language_model.layers.56.linear_attn.in_proj_ba",
947 "model.language_model.layers.56.linear_attn.in_proj_b",
948 "model.language_model.layers.56.linear_attn.in_proj_a",
949 "model.language_model.layers.56.linear_attn.norm",
950 "model.language_model.layers.57.input_layernorm",
951 "model.language_model.layers.57.mlp.shared_expert_gate",
952 "model.language_model.layers.57.post_attention_layernorm",
953 "model.language_model.layers.57.mlp.gate",
954 "model.language_model.layers.57.linear_attn.A_log",
955 "model.language_model.layers.57.linear_attn.conv1d",
956 "model.language_model.layers.57.linear_attn.dt_bias",
957 "model.language_model.layers.57.linear_attn.in_proj_ba",
958 "model.language_model.layers.57.linear_attn.in_proj_b",
959 "model.language_model.layers.57.linear_attn.in_proj_a",
960 "model.language_model.layers.57.linear_attn.norm",
961 "model.language_model.layers.58.input_layernorm",
962 "model.language_model.layers.58.mlp.shared_expert_gate",
963 "model.language_model.layers.58.post_attention_layernorm",
964 "model.language_model.layers.58.mlp.gate",
965 "model.language_model.layers.58.linear_attn.A_log",
966 "model.language_model.layers.58.linear_attn.conv1d",
967 "model.language_model.layers.58.linear_attn.dt_bias",
968 "model.language_model.layers.58.linear_attn.in_proj_ba",
969 "model.language_model.layers.58.linear_attn.in_proj_b",
970 "model.language_model.layers.58.linear_attn.in_proj_a",
971 "model.language_model.layers.58.linear_attn.norm",
972 "model.language_model.layers.59.input_layernorm",
973 "model.language_model.layers.59.mlp.shared_expert_gate",
974 "model.language_model.layers.59.post_attention_layernorm",
975 "model.language_model.layers.59.mlp.gate",
976 "model.language_model.layers.59.self_attn.k_norm",
977 "model.language_model.layers.59.self_attn.q_norm",
978 "model.language_model.layers.60.input_layernorm",
979 "model.language_model.layers.60.mlp.shared_expert_gate",
980 "model.language_model.layers.60.post_attention_layernorm",
981 "model.language_model.layers.60.mlp.gate",
982 "model.language_model.layers.60.linear_attn.A_log",
983 "model.language_model.layers.60.linear_attn.conv1d",
984 "model.language_model.layers.60.linear_attn.dt_bias",
985 "model.language_model.layers.60.linear_attn.in_proj_ba",
986 "model.language_model.layers.60.linear_attn.in_proj_b",
987 "model.language_model.layers.60.linear_attn.in_proj_a",
988 "model.language_model.layers.60.linear_attn.norm",
989 "model.language_model.layers.61.input_layernorm",
990 "model.language_model.layers.61.mlp.shared_expert_gate",
991 "model.language_model.layers.61.post_attention_layernorm",
992 "model.language_model.layers.61.mlp.gate",
993 "model.language_model.layers.61.linear_attn.A_log",
994 "model.language_model.layers.61.linear_attn.conv1d",
995 "model.language_model.layers.61.linear_attn.dt_bias",
996 "model.language_model.layers.61.linear_attn.in_proj_ba",
997 "model.language_model.layers.61.linear_attn.in_proj_b",
998 "model.language_model.layers.61.linear_attn.in_proj_a",
999 "model.language_model.layers.61.linear_attn.norm",
1000 "model.language_model.layers.62.input_layernorm",
1001 "model.language_model.layers.62.mlp.shared_expert_gate",
1002 "model.language_model.layers.62.post_attention_layernorm",
1003 "model.language_model.layers.62.mlp.gate",
1004 "model.language_model.layers.62.linear_attn.A_log",
1005 "model.language_model.layers.62.linear_attn.conv1d",
1006 "model.language_model.layers.62.linear_attn.dt_bias",
1007 "model.language_model.layers.62.linear_attn.in_proj_ba",
1008 "model.language_model.layers.62.linear_attn.in_proj_b",
1009 "model.language_model.layers.62.linear_attn.in_proj_a",
1010 "model.language_model.layers.62.linear_attn.norm",
1011 "model.language_model.layers.63.input_layernorm",
1012 "model.language_model.layers.63.mlp.shared_expert_gate",
1013 "model.language_model.layers.63.post_attention_layernorm",
1014 "model.language_model.layers.63.mlp.gate",
1015 "model.language_model.layers.63.self_attn.k_norm",
1016 "model.language_model.layers.63.self_attn.q_norm",
1017 "mtp.layers.0.input_layernorm",
1018 "mtp.layers.0.mlp.gate",
1019 "mtp.layers.0.mlp.shared_expert_gate",
1020 "mtp.layers.0.post_attention_layernorm",
1021 "mtp.layers.0.self_attn.k_norm",
1022 "mtp.layers.0.self_attn.q_norm",
1023 "mtp.fc",
1024 "mtp.norm",
1025 "mtp.pre_fc_norm_embedding",
1026 "mtp.pre_fc_norm_hidden"
1027 ],
1028 "weight_block_size": [
1029 128,
1030 128
1031 ]
1032 }
1033 }