config.json
35.0 KB · 783 lines · json Raw
1 {
2 "architectures": [
3 "GlmMoeDsaForCausalLM"
4 ],
5 "attention_bias": false,
6 "attention_dropout": 0.0,
7 "dtype": "bfloat16",
8 "eos_token_id": [
9 154820,
10 154827,
11 154829
12 ],
13 "ep_size": 1,
14 "first_k_dense_replace": 3,
15 "hidden_act": "silu",
16 "head_dim": 64,
17 "hidden_size": 6144,
18 "index_head_dim": 128,
19 "index_n_heads": 32,
20 "index_topk": 2048,
21 "indexer_rope_interleave": true,
22 "initializer_range": 0.02,
23 "intermediate_size": 12288,
24 "kv_lora_rank": 512,
25 "max_position_embeddings": 202752,
26 "moe_intermediate_size": 2048,
27 "moe_layer_freq": 1,
28 "model_type": "glm_moe_dsa",
29 "n_group": 1,
30 "n_routed_experts": 256,
31 "n_shared_experts": 1,
32 "norm_topk_prob": true,
33 "num_attention_heads": 64,
34 "num_experts_per_tok": 8,
35 "num_hidden_layers": 78,
36 "num_key_value_heads": 64,
37 "num_nextn_predict_layers": 1,
38 "pad_token_id": 154820,
39 "pretraining_tp": 1,
40 "q_lora_rank": 2048,
41 "qk_head_dim": 256,
42 "qk_nope_head_dim": 192,
43 "qk_rope_head_dim": 64,
44 "rms_norm_eps": 1e-05,
45 "rope_interleave": true,
46 "rope_parameters": {
47 "rope_theta": 1000000,
48 "rope_type": "default"
49 },
50 "routed_scaling_factor": 2.5,
51 "scoring_func": "sigmoid",
52 "tie_word_embeddings": false,
53 "topk_group": 1,
54 "topk_method": "noaux_tc",
55 "transformers_version": "5.0.2.dev0",
56 "use_cache": true,
57 "v_head_dim": 256,
58 "vocab_size": 154880,
59 "quantization_config": {
60 "activation_scheme": "dynamic",
61 "fmt": "e4m3",
62 "quant_method": "fp8",
63 "weight_block_size": [
64 128,
65 128
66 ],
67 "modules_to_not_convert": [
68 "lm_head",
69 "model.embed_tokens",
70 "model.layers.0.input_layernorm",
71 "model.layers.0.post_attention_layernorm",
72 "model.layers.0.self_attn.indexer.k_norm",
73 "model.layers.0.self_attn.indexer.k_norm.bias",
74 "model.layers.0.self_attn.indexers_proj",
75 "model.layers.0.self_attn.kv_a_layernorm",
76 "model.layers.0.self_attn.q_a_layernorm",
77 "model.layers.1.input_layernorm",
78 "model.layers.1.post_attention_layernorm",
79 "model.layers.1.self_attn.indexer.k_norm",
80 "model.layers.1.self_attn.indexer.k_norm.bias",
81 "model.layers.1.self_attn.indexers_proj",
82 "model.layers.1.self_attn.kv_a_layernorm",
83 "model.layers.1.self_attn.q_a_layernorm",
84 "model.layers.2.input_layernorm",
85 "model.layers.2.post_attention_layernorm",
86 "model.layers.2.self_attn.indexer.k_norm",
87 "model.layers.2.self_attn.indexer.k_norm.bias",
88 "model.layers.2.self_attn.indexers_proj",
89 "model.layers.2.self_attn.kv_a_layernorm",
90 "model.layers.2.self_attn.q_a_layernorm",
91 "model.layers.3.input_layernorm",
92 "model.layers.3.mlp.gate",
93 "model.layers.3.mlp.gate.e_score_correction_bias",
94 "model.layers.3.post_attention_layernorm",
95 "model.layers.3.self_attn.indexer.k_norm",
96 "model.layers.3.self_attn.indexer.k_norm.bias",
97 "model.layers.3.self_attn.indexers_proj",
98 "model.layers.3.self_attn.kv_a_layernorm",
99 "model.layers.3.self_attn.q_a_layernorm",
100 "model.layers.4.input_layernorm",
101 "model.layers.4.mlp.gate",
102 "model.layers.4.mlp.gate.e_score_correction_bias",
103 "model.layers.4.post_attention_layernorm",
104 "model.layers.4.self_attn.indexer.k_norm",
105 "model.layers.4.self_attn.indexer.k_norm.bias",
106 "model.layers.4.self_attn.indexers_proj",
107 "model.layers.4.self_attn.kv_a_layernorm",
108 "model.layers.4.self_attn.q_a_layernorm",
109 "model.layers.5.input_layernorm",
110 "model.layers.5.mlp.gate",
111 "model.layers.5.mlp.gate.e_score_correction_bias",
112 "model.layers.5.post_attention_layernorm",
113 "model.layers.5.self_attn.indexer.k_norm",
114 "model.layers.5.self_attn.indexer.k_norm.bias",
115 "model.layers.5.self_attn.indexers_proj",
116 "model.layers.5.self_attn.kv_a_layernorm",
117 "model.layers.5.self_attn.q_a_layernorm",
118 "model.layers.6.input_layernorm",
119 "model.layers.6.mlp.gate",
120 "model.layers.6.mlp.gate.e_score_correction_bias",
121 "model.layers.6.post_attention_layernorm",
122 "model.layers.6.self_attn.indexer.k_norm",
123 "model.layers.6.self_attn.indexer.k_norm.bias",
124 "model.layers.6.self_attn.indexers_proj",
125 "model.layers.6.self_attn.kv_a_layernorm",
126 "model.layers.6.self_attn.q_a_layernorm",
127 "model.layers.7.input_layernorm",
128 "model.layers.7.mlp.gate",
129 "model.layers.7.mlp.gate.e_score_correction_bias",
130 "model.layers.7.post_attention_layernorm",
131 "model.layers.7.self_attn.indexer.k_norm",
132 "model.layers.7.self_attn.indexer.k_norm.bias",
133 "model.layers.7.self_attn.indexers_proj",
134 "model.layers.7.self_attn.kv_a_layernorm",
135 "model.layers.7.self_attn.q_a_layernorm",
136 "model.layers.8.input_layernorm",
137 "model.layers.8.mlp.gate",
138 "model.layers.8.mlp.gate.e_score_correction_bias",
139 "model.layers.8.post_attention_layernorm",
140 "model.layers.8.self_attn.indexer.k_norm",
141 "model.layers.8.self_attn.indexer.k_norm.bias",
142 "model.layers.8.self_attn.indexers_proj",
143 "model.layers.8.self_attn.kv_a_layernorm",
144 "model.layers.8.self_attn.q_a_layernorm",
145 "model.layers.9.input_layernorm",
146 "model.layers.9.mlp.gate",
147 "model.layers.9.mlp.gate.e_score_correction_bias",
148 "model.layers.9.post_attention_layernorm",
149 "model.layers.9.self_attn.indexer.k_norm",
150 "model.layers.9.self_attn.indexer.k_norm.bias",
151 "model.layers.9.self_attn.indexers_proj",
152 "model.layers.9.self_attn.kv_a_layernorm",
153 "model.layers.9.self_attn.q_a_layernorm",
154 "model.layers.10.input_layernorm",
155 "model.layers.10.mlp.gate",
156 "model.layers.10.mlp.gate.e_score_correction_bias",
157 "model.layers.10.post_attention_layernorm",
158 "model.layers.10.self_attn.indexer.k_norm",
159 "model.layers.10.self_attn.indexer.k_norm.bias",
160 "model.layers.10.self_attn.indexers_proj",
161 "model.layers.10.self_attn.kv_a_layernorm",
162 "model.layers.10.self_attn.q_a_layernorm",
163 "model.layers.11.input_layernorm",
164 "model.layers.11.mlp.gate",
165 "model.layers.11.mlp.gate.e_score_correction_bias",
166 "model.layers.11.post_attention_layernorm",
167 "model.layers.11.self_attn.indexer.k_norm",
168 "model.layers.11.self_attn.indexer.k_norm.bias",
169 "model.layers.11.self_attn.indexers_proj",
170 "model.layers.11.self_attn.kv_a_layernorm",
171 "model.layers.11.self_attn.q_a_layernorm",
172 "model.layers.12.input_layernorm",
173 "model.layers.12.mlp.gate",
174 "model.layers.12.mlp.gate.e_score_correction_bias",
175 "model.layers.12.post_attention_layernorm",
176 "model.layers.12.self_attn.indexer.k_norm",
177 "model.layers.12.self_attn.indexer.k_norm.bias",
178 "model.layers.12.self_attn.indexers_proj",
179 "model.layers.12.self_attn.kv_a_layernorm",
180 "model.layers.12.self_attn.q_a_layernorm",
181 "model.layers.13.input_layernorm",
182 "model.layers.13.mlp.gate",
183 "model.layers.13.mlp.gate.e_score_correction_bias",
184 "model.layers.13.post_attention_layernorm",
185 "model.layers.13.self_attn.indexer.k_norm",
186 "model.layers.13.self_attn.indexer.k_norm.bias",
187 "model.layers.13.self_attn.indexers_proj",
188 "model.layers.13.self_attn.kv_a_layernorm",
189 "model.layers.13.self_attn.q_a_layernorm",
190 "model.layers.14.input_layernorm",
191 "model.layers.14.mlp.gate",
192 "model.layers.14.mlp.gate.e_score_correction_bias",
193 "model.layers.14.post_attention_layernorm",
194 "model.layers.14.self_attn.indexer.k_norm",
195 "model.layers.14.self_attn.indexer.k_norm.bias",
196 "model.layers.14.self_attn.indexers_proj",
197 "model.layers.14.self_attn.kv_a_layernorm",
198 "model.layers.14.self_attn.q_a_layernorm",
199 "model.layers.15.input_layernorm",
200 "model.layers.15.mlp.gate",
201 "model.layers.15.mlp.gate.e_score_correction_bias",
202 "model.layers.15.post_attention_layernorm",
203 "model.layers.15.self_attn.indexer.k_norm",
204 "model.layers.15.self_attn.indexer.k_norm.bias",
205 "model.layers.15.self_attn.indexers_proj",
206 "model.layers.15.self_attn.kv_a_layernorm",
207 "model.layers.15.self_attn.q_a_layernorm",
208 "model.layers.16.input_layernorm",
209 "model.layers.16.mlp.gate",
210 "model.layers.16.mlp.gate.e_score_correction_bias",
211 "model.layers.16.post_attention_layernorm",
212 "model.layers.16.self_attn.indexer.k_norm",
213 "model.layers.16.self_attn.indexer.k_norm.bias",
214 "model.layers.16.self_attn.indexers_proj",
215 "model.layers.16.self_attn.kv_a_layernorm",
216 "model.layers.16.self_attn.q_a_layernorm",
217 "model.layers.17.input_layernorm",
218 "model.layers.17.mlp.gate",
219 "model.layers.17.mlp.gate.e_score_correction_bias",
220 "model.layers.17.post_attention_layernorm",
221 "model.layers.17.self_attn.indexer.k_norm",
222 "model.layers.17.self_attn.indexer.k_norm.bias",
223 "model.layers.17.self_attn.indexers_proj",
224 "model.layers.17.self_attn.kv_a_layernorm",
225 "model.layers.17.self_attn.q_a_layernorm",
226 "model.layers.18.input_layernorm",
227 "model.layers.18.mlp.gate",
228 "model.layers.18.mlp.gate.e_score_correction_bias",
229 "model.layers.18.post_attention_layernorm",
230 "model.layers.18.self_attn.indexer.k_norm",
231 "model.layers.18.self_attn.indexer.k_norm.bias",
232 "model.layers.18.self_attn.indexers_proj",
233 "model.layers.18.self_attn.kv_a_layernorm",
234 "model.layers.18.self_attn.q_a_layernorm",
235 "model.layers.19.input_layernorm",
236 "model.layers.19.mlp.gate",
237 "model.layers.19.mlp.gate.e_score_correction_bias",
238 "model.layers.19.post_attention_layernorm",
239 "model.layers.19.self_attn.indexer.k_norm",
240 "model.layers.19.self_attn.indexer.k_norm.bias",
241 "model.layers.19.self_attn.indexers_proj",
242 "model.layers.19.self_attn.kv_a_layernorm",
243 "model.layers.19.self_attn.q_a_layernorm",
244 "model.layers.20.input_layernorm",
245 "model.layers.20.mlp.gate",
246 "model.layers.20.mlp.gate.e_score_correction_bias",
247 "model.layers.20.post_attention_layernorm",
248 "model.layers.20.self_attn.indexer.k_norm",
249 "model.layers.20.self_attn.indexer.k_norm.bias",
250 "model.layers.20.self_attn.indexers_proj",
251 "model.layers.20.self_attn.kv_a_layernorm",
252 "model.layers.20.self_attn.q_a_layernorm",
253 "model.layers.21.input_layernorm",
254 "model.layers.21.mlp.gate",
255 "model.layers.21.mlp.gate.e_score_correction_bias",
256 "model.layers.21.post_attention_layernorm",
257 "model.layers.21.self_attn.indexer.k_norm",
258 "model.layers.21.self_attn.indexer.k_norm.bias",
259 "model.layers.21.self_attn.indexers_proj",
260 "model.layers.21.self_attn.kv_a_layernorm",
261 "model.layers.21.self_attn.q_a_layernorm",
262 "model.layers.22.input_layernorm",
263 "model.layers.22.mlp.gate",
264 "model.layers.22.mlp.gate.e_score_correction_bias",
265 "model.layers.22.post_attention_layernorm",
266 "model.layers.22.self_attn.indexer.k_norm",
267 "model.layers.22.self_attn.indexer.k_norm.bias",
268 "model.layers.22.self_attn.indexers_proj",
269 "model.layers.22.self_attn.kv_a_layernorm",
270 "model.layers.22.self_attn.q_a_layernorm",
271 "model.layers.23.input_layernorm",
272 "model.layers.23.mlp.gate",
273 "model.layers.23.mlp.gate.e_score_correction_bias",
274 "model.layers.23.post_attention_layernorm",
275 "model.layers.23.self_attn.indexer.k_norm",
276 "model.layers.23.self_attn.indexer.k_norm.bias",
277 "model.layers.23.self_attn.indexers_proj",
278 "model.layers.23.self_attn.kv_a_layernorm",
279 "model.layers.23.self_attn.q_a_layernorm",
280 "model.layers.24.input_layernorm",
281 "model.layers.24.mlp.gate",
282 "model.layers.24.mlp.gate.e_score_correction_bias",
283 "model.layers.24.post_attention_layernorm",
284 "model.layers.24.self_attn.indexer.k_norm",
285 "model.layers.24.self_attn.indexer.k_norm.bias",
286 "model.layers.24.self_attn.indexers_proj",
287 "model.layers.24.self_attn.kv_a_layernorm",
288 "model.layers.24.self_attn.q_a_layernorm",
289 "model.layers.25.input_layernorm",
290 "model.layers.25.mlp.gate",
291 "model.layers.25.mlp.gate.e_score_correction_bias",
292 "model.layers.25.post_attention_layernorm",
293 "model.layers.25.self_attn.indexer.k_norm",
294 "model.layers.25.self_attn.indexer.k_norm.bias",
295 "model.layers.25.self_attn.indexers_proj",
296 "model.layers.25.self_attn.kv_a_layernorm",
297 "model.layers.25.self_attn.q_a_layernorm",
298 "model.layers.26.input_layernorm",
299 "model.layers.26.mlp.gate",
300 "model.layers.26.mlp.gate.e_score_correction_bias",
301 "model.layers.26.post_attention_layernorm",
302 "model.layers.26.self_attn.indexer.k_norm",
303 "model.layers.26.self_attn.indexer.k_norm.bias",
304 "model.layers.26.self_attn.indexers_proj",
305 "model.layers.26.self_attn.kv_a_layernorm",
306 "model.layers.26.self_attn.q_a_layernorm",
307 "model.layers.27.input_layernorm",
308 "model.layers.27.mlp.gate",
309 "model.layers.27.mlp.gate.e_score_correction_bias",
310 "model.layers.27.post_attention_layernorm",
311 "model.layers.27.self_attn.indexer.k_norm",
312 "model.layers.27.self_attn.indexer.k_norm.bias",
313 "model.layers.27.self_attn.indexers_proj",
314 "model.layers.27.self_attn.kv_a_layernorm",
315 "model.layers.27.self_attn.q_a_layernorm",
316 "model.layers.28.input_layernorm",
317 "model.layers.28.mlp.gate",
318 "model.layers.28.mlp.gate.e_score_correction_bias",
319 "model.layers.28.post_attention_layernorm",
320 "model.layers.28.self_attn.indexer.k_norm",
321 "model.layers.28.self_attn.indexer.k_norm.bias",
322 "model.layers.28.self_attn.indexers_proj",
323 "model.layers.28.self_attn.kv_a_layernorm",
324 "model.layers.28.self_attn.q_a_layernorm",
325 "model.layers.29.input_layernorm",
326 "model.layers.29.mlp.gate",
327 "model.layers.29.mlp.gate.e_score_correction_bias",
328 "model.layers.29.post_attention_layernorm",
329 "model.layers.29.self_attn.indexer.k_norm",
330 "model.layers.29.self_attn.indexer.k_norm.bias",
331 "model.layers.29.self_attn.indexers_proj",
332 "model.layers.29.self_attn.kv_a_layernorm",
333 "model.layers.29.self_attn.q_a_layernorm",
334 "model.layers.30.input_layernorm",
335 "model.layers.30.mlp.gate",
336 "model.layers.30.mlp.gate.e_score_correction_bias",
337 "model.layers.30.post_attention_layernorm",
338 "model.layers.30.self_attn.indexer.k_norm",
339 "model.layers.30.self_attn.indexer.k_norm.bias",
340 "model.layers.30.self_attn.indexers_proj",
341 "model.layers.30.self_attn.kv_a_layernorm",
342 "model.layers.30.self_attn.q_a_layernorm",
343 "model.layers.31.input_layernorm",
344 "model.layers.31.mlp.gate",
345 "model.layers.31.mlp.gate.e_score_correction_bias",
346 "model.layers.31.post_attention_layernorm",
347 "model.layers.31.self_attn.indexer.k_norm",
348 "model.layers.31.self_attn.indexer.k_norm.bias",
349 "model.layers.31.self_attn.indexers_proj",
350 "model.layers.31.self_attn.kv_a_layernorm",
351 "model.layers.31.self_attn.q_a_layernorm",
352 "model.layers.32.input_layernorm",
353 "model.layers.32.mlp.gate",
354 "model.layers.32.mlp.gate.e_score_correction_bias",
355 "model.layers.32.post_attention_layernorm",
356 "model.layers.32.self_attn.indexer.k_norm",
357 "model.layers.32.self_attn.indexer.k_norm.bias",
358 "model.layers.32.self_attn.indexers_proj",
359 "model.layers.32.self_attn.kv_a_layernorm",
360 "model.layers.32.self_attn.q_a_layernorm",
361 "model.layers.33.input_layernorm",
362 "model.layers.33.mlp.gate",
363 "model.layers.33.mlp.gate.e_score_correction_bias",
364 "model.layers.33.post_attention_layernorm",
365 "model.layers.33.self_attn.indexer.k_norm",
366 "model.layers.33.self_attn.indexer.k_norm.bias",
367 "model.layers.33.self_attn.indexers_proj",
368 "model.layers.33.self_attn.kv_a_layernorm",
369 "model.layers.33.self_attn.q_a_layernorm",
370 "model.layers.34.input_layernorm",
371 "model.layers.34.mlp.gate",
372 "model.layers.34.mlp.gate.e_score_correction_bias",
373 "model.layers.34.post_attention_layernorm",
374 "model.layers.34.self_attn.indexer.k_norm",
375 "model.layers.34.self_attn.indexer.k_norm.bias",
376 "model.layers.34.self_attn.indexers_proj",
377 "model.layers.34.self_attn.kv_a_layernorm",
378 "model.layers.34.self_attn.q_a_layernorm",
379 "model.layers.35.input_layernorm",
380 "model.layers.35.mlp.gate",
381 "model.layers.35.mlp.gate.e_score_correction_bias",
382 "model.layers.35.post_attention_layernorm",
383 "model.layers.35.self_attn.indexer.k_norm",
384 "model.layers.35.self_attn.indexer.k_norm.bias",
385 "model.layers.35.self_attn.indexers_proj",
386 "model.layers.35.self_attn.kv_a_layernorm",
387 "model.layers.35.self_attn.q_a_layernorm",
388 "model.layers.36.input_layernorm",
389 "model.layers.36.mlp.gate",
390 "model.layers.36.mlp.gate.e_score_correction_bias",
391 "model.layers.36.post_attention_layernorm",
392 "model.layers.36.self_attn.indexer.k_norm",
393 "model.layers.36.self_attn.indexer.k_norm.bias",
394 "model.layers.36.self_attn.indexers_proj",
395 "model.layers.36.self_attn.kv_a_layernorm",
396 "model.layers.36.self_attn.q_a_layernorm",
397 "model.layers.37.input_layernorm",
398 "model.layers.37.mlp.gate",
399 "model.layers.37.mlp.gate.e_score_correction_bias",
400 "model.layers.37.post_attention_layernorm",
401 "model.layers.37.self_attn.indexer.k_norm",
402 "model.layers.37.self_attn.indexer.k_norm.bias",
403 "model.layers.37.self_attn.indexers_proj",
404 "model.layers.37.self_attn.kv_a_layernorm",
405 "model.layers.37.self_attn.q_a_layernorm",
406 "model.layers.38.input_layernorm",
407 "model.layers.38.mlp.gate",
408 "model.layers.38.mlp.gate.e_score_correction_bias",
409 "model.layers.38.post_attention_layernorm",
410 "model.layers.38.self_attn.indexer.k_norm",
411 "model.layers.38.self_attn.indexer.k_norm.bias",
412 "model.layers.38.self_attn.indexers_proj",
413 "model.layers.38.self_attn.kv_a_layernorm",
414 "model.layers.38.self_attn.q_a_layernorm",
415 "model.layers.39.input_layernorm",
416 "model.layers.39.mlp.gate",
417 "model.layers.39.mlp.gate.e_score_correction_bias",
418 "model.layers.39.post_attention_layernorm",
419 "model.layers.39.self_attn.indexer.k_norm",
420 "model.layers.39.self_attn.indexer.k_norm.bias",
421 "model.layers.39.self_attn.indexers_proj",
422 "model.layers.39.self_attn.kv_a_layernorm",
423 "model.layers.39.self_attn.q_a_layernorm",
424 "model.layers.40.input_layernorm",
425 "model.layers.40.mlp.gate",
426 "model.layers.40.mlp.gate.e_score_correction_bias",
427 "model.layers.40.post_attention_layernorm",
428 "model.layers.40.self_attn.indexer.k_norm",
429 "model.layers.40.self_attn.indexer.k_norm.bias",
430 "model.layers.40.self_attn.indexers_proj",
431 "model.layers.40.self_attn.kv_a_layernorm",
432 "model.layers.40.self_attn.q_a_layernorm",
433 "model.layers.41.input_layernorm",
434 "model.layers.41.mlp.gate",
435 "model.layers.41.mlp.gate.e_score_correction_bias",
436 "model.layers.41.post_attention_layernorm",
437 "model.layers.41.self_attn.indexer.k_norm",
438 "model.layers.41.self_attn.indexer.k_norm.bias",
439 "model.layers.41.self_attn.indexers_proj",
440 "model.layers.41.self_attn.kv_a_layernorm",
441 "model.layers.41.self_attn.q_a_layernorm",
442 "model.layers.42.input_layernorm",
443 "model.layers.42.mlp.gate",
444 "model.layers.42.mlp.gate.e_score_correction_bias",
445 "model.layers.42.post_attention_layernorm",
446 "model.layers.42.self_attn.indexer.k_norm",
447 "model.layers.42.self_attn.indexer.k_norm.bias",
448 "model.layers.42.self_attn.indexers_proj",
449 "model.layers.42.self_attn.kv_a_layernorm",
450 "model.layers.42.self_attn.q_a_layernorm",
451 "model.layers.43.input_layernorm",
452 "model.layers.43.mlp.gate",
453 "model.layers.43.mlp.gate.e_score_correction_bias",
454 "model.layers.43.post_attention_layernorm",
455 "model.layers.43.self_attn.indexer.k_norm",
456 "model.layers.43.self_attn.indexer.k_norm.bias",
457 "model.layers.43.self_attn.indexers_proj",
458 "model.layers.43.self_attn.kv_a_layernorm",
459 "model.layers.43.self_attn.q_a_layernorm",
460 "model.layers.44.input_layernorm",
461 "model.layers.44.mlp.gate",
462 "model.layers.44.mlp.gate.e_score_correction_bias",
463 "model.layers.44.post_attention_layernorm",
464 "model.layers.44.self_attn.indexer.k_norm",
465 "model.layers.44.self_attn.indexer.k_norm.bias",
466 "model.layers.44.self_attn.indexers_proj",
467 "model.layers.44.self_attn.kv_a_layernorm",
468 "model.layers.44.self_attn.q_a_layernorm",
469 "model.layers.45.input_layernorm",
470 "model.layers.45.mlp.gate",
471 "model.layers.45.mlp.gate.e_score_correction_bias",
472 "model.layers.45.post_attention_layernorm",
473 "model.layers.45.self_attn.indexer.k_norm",
474 "model.layers.45.self_attn.indexer.k_norm.bias",
475 "model.layers.45.self_attn.indexers_proj",
476 "model.layers.45.self_attn.kv_a_layernorm",
477 "model.layers.45.self_attn.q_a_layernorm",
478 "model.layers.46.input_layernorm",
479 "model.layers.46.mlp.gate",
480 "model.layers.46.mlp.gate.e_score_correction_bias",
481 "model.layers.46.post_attention_layernorm",
482 "model.layers.46.self_attn.indexer.k_norm",
483 "model.layers.46.self_attn.indexer.k_norm.bias",
484 "model.layers.46.self_attn.indexers_proj",
485 "model.layers.46.self_attn.kv_a_layernorm",
486 "model.layers.46.self_attn.q_a_layernorm",
487 "model.layers.47.input_layernorm",
488 "model.layers.47.mlp.gate",
489 "model.layers.47.mlp.gate.e_score_correction_bias",
490 "model.layers.47.post_attention_layernorm",
491 "model.layers.47.self_attn.indexer.k_norm",
492 "model.layers.47.self_attn.indexer.k_norm.bias",
493 "model.layers.47.self_attn.indexers_proj",
494 "model.layers.47.self_attn.kv_a_layernorm",
495 "model.layers.47.self_attn.q_a_layernorm",
496 "model.layers.48.input_layernorm",
497 "model.layers.48.mlp.gate",
498 "model.layers.48.mlp.gate.e_score_correction_bias",
499 "model.layers.48.post_attention_layernorm",
500 "model.layers.48.self_attn.indexer.k_norm",
501 "model.layers.48.self_attn.indexer.k_norm.bias",
502 "model.layers.48.self_attn.indexers_proj",
503 "model.layers.48.self_attn.kv_a_layernorm",
504 "model.layers.48.self_attn.q_a_layernorm",
505 "model.layers.49.input_layernorm",
506 "model.layers.49.mlp.gate",
507 "model.layers.49.mlp.gate.e_score_correction_bias",
508 "model.layers.49.post_attention_layernorm",
509 "model.layers.49.self_attn.indexer.k_norm",
510 "model.layers.49.self_attn.indexer.k_norm.bias",
511 "model.layers.49.self_attn.indexers_proj",
512 "model.layers.49.self_attn.kv_a_layernorm",
513 "model.layers.49.self_attn.q_a_layernorm",
514 "model.layers.50.input_layernorm",
515 "model.layers.50.mlp.gate",
516 "model.layers.50.mlp.gate.e_score_correction_bias",
517 "model.layers.50.post_attention_layernorm",
518 "model.layers.50.self_attn.indexer.k_norm",
519 "model.layers.50.self_attn.indexer.k_norm.bias",
520 "model.layers.50.self_attn.indexers_proj",
521 "model.layers.50.self_attn.kv_a_layernorm",
522 "model.layers.50.self_attn.q_a_layernorm",
523 "model.layers.51.input_layernorm",
524 "model.layers.51.mlp.gate",
525 "model.layers.51.mlp.gate.e_score_correction_bias",
526 "model.layers.51.post_attention_layernorm",
527 "model.layers.51.self_attn.indexer.k_norm",
528 "model.layers.51.self_attn.indexer.k_norm.bias",
529 "model.layers.51.self_attn.indexers_proj",
530 "model.layers.51.self_attn.kv_a_layernorm",
531 "model.layers.51.self_attn.q_a_layernorm",
532 "model.layers.52.input_layernorm",
533 "model.layers.52.mlp.gate",
534 "model.layers.52.mlp.gate.e_score_correction_bias",
535 "model.layers.52.post_attention_layernorm",
536 "model.layers.52.self_attn.indexer.k_norm",
537 "model.layers.52.self_attn.indexer.k_norm.bias",
538 "model.layers.52.self_attn.indexers_proj",
539 "model.layers.52.self_attn.kv_a_layernorm",
540 "model.layers.52.self_attn.q_a_layernorm",
541 "model.layers.53.input_layernorm",
542 "model.layers.53.mlp.gate",
543 "model.layers.53.mlp.gate.e_score_correction_bias",
544 "model.layers.53.post_attention_layernorm",
545 "model.layers.53.self_attn.indexer.k_norm",
546 "model.layers.53.self_attn.indexer.k_norm.bias",
547 "model.layers.53.self_attn.indexers_proj",
548 "model.layers.53.self_attn.kv_a_layernorm",
549 "model.layers.53.self_attn.q_a_layernorm",
550 "model.layers.54.input_layernorm",
551 "model.layers.54.mlp.gate",
552 "model.layers.54.mlp.gate.e_score_correction_bias",
553 "model.layers.54.post_attention_layernorm",
554 "model.layers.54.self_attn.indexer.k_norm",
555 "model.layers.54.self_attn.indexer.k_norm.bias",
556 "model.layers.54.self_attn.indexers_proj",
557 "model.layers.54.self_attn.kv_a_layernorm",
558 "model.layers.54.self_attn.q_a_layernorm",
559 "model.layers.55.input_layernorm",
560 "model.layers.55.mlp.gate",
561 "model.layers.55.mlp.gate.e_score_correction_bias",
562 "model.layers.55.post_attention_layernorm",
563 "model.layers.55.self_attn.indexer.k_norm",
564 "model.layers.55.self_attn.indexer.k_norm.bias",
565 "model.layers.55.self_attn.indexers_proj",
566 "model.layers.55.self_attn.kv_a_layernorm",
567 "model.layers.55.self_attn.q_a_layernorm",
568 "model.layers.56.input_layernorm",
569 "model.layers.56.mlp.gate",
570 "model.layers.56.mlp.gate.e_score_correction_bias",
571 "model.layers.56.post_attention_layernorm",
572 "model.layers.56.self_attn.indexer.k_norm",
573 "model.layers.56.self_attn.indexer.k_norm.bias",
574 "model.layers.56.self_attn.indexers_proj",
575 "model.layers.56.self_attn.kv_a_layernorm",
576 "model.layers.56.self_attn.q_a_layernorm",
577 "model.layers.57.input_layernorm",
578 "model.layers.57.mlp.gate",
579 "model.layers.57.mlp.gate.e_score_correction_bias",
580 "model.layers.57.post_attention_layernorm",
581 "model.layers.57.self_attn.indexer.k_norm",
582 "model.layers.57.self_attn.indexer.k_norm.bias",
583 "model.layers.57.self_attn.indexers_proj",
584 "model.layers.57.self_attn.kv_a_layernorm",
585 "model.layers.57.self_attn.q_a_layernorm",
586 "model.layers.58.input_layernorm",
587 "model.layers.58.mlp.gate",
588 "model.layers.58.mlp.gate.e_score_correction_bias",
589 "model.layers.58.post_attention_layernorm",
590 "model.layers.58.self_attn.indexer.k_norm",
591 "model.layers.58.self_attn.indexer.k_norm.bias",
592 "model.layers.58.self_attn.indexers_proj",
593 "model.layers.58.self_attn.kv_a_layernorm",
594 "model.layers.58.self_attn.q_a_layernorm",
595 "model.layers.59.input_layernorm",
596 "model.layers.59.mlp.gate",
597 "model.layers.59.mlp.gate.e_score_correction_bias",
598 "model.layers.59.post_attention_layernorm",
599 "model.layers.59.self_attn.indexer.k_norm",
600 "model.layers.59.self_attn.indexer.k_norm.bias",
601 "model.layers.59.self_attn.indexers_proj",
602 "model.layers.59.self_attn.kv_a_layernorm",
603 "model.layers.59.self_attn.q_a_layernorm",
604 "model.layers.60.input_layernorm",
605 "model.layers.60.mlp.gate",
606 "model.layers.60.mlp.gate.e_score_correction_bias",
607 "model.layers.60.post_attention_layernorm",
608 "model.layers.60.self_attn.indexer.k_norm",
609 "model.layers.60.self_attn.indexer.k_norm.bias",
610 "model.layers.60.self_attn.indexers_proj",
611 "model.layers.60.self_attn.kv_a_layernorm",
612 "model.layers.60.self_attn.q_a_layernorm",
613 "model.layers.61.input_layernorm",
614 "model.layers.61.mlp.gate",
615 "model.layers.61.mlp.gate.e_score_correction_bias",
616 "model.layers.61.post_attention_layernorm",
617 "model.layers.61.self_attn.indexer.k_norm",
618 "model.layers.61.self_attn.indexer.k_norm.bias",
619 "model.layers.61.self_attn.indexers_proj",
620 "model.layers.61.self_attn.kv_a_layernorm",
621 "model.layers.61.self_attn.q_a_layernorm",
622 "model.layers.62.input_layernorm",
623 "model.layers.62.mlp.gate",
624 "model.layers.62.mlp.gate.e_score_correction_bias",
625 "model.layers.62.post_attention_layernorm",
626 "model.layers.62.self_attn.indexer.k_norm",
627 "model.layers.62.self_attn.indexer.k_norm.bias",
628 "model.layers.62.self_attn.indexers_proj",
629 "model.layers.62.self_attn.kv_a_layernorm",
630 "model.layers.62.self_attn.q_a_layernorm",
631 "model.layers.63.input_layernorm",
632 "model.layers.63.mlp.gate",
633 "model.layers.63.mlp.gate.e_score_correction_bias",
634 "model.layers.63.post_attention_layernorm",
635 "model.layers.63.self_attn.indexer.k_norm",
636 "model.layers.63.self_attn.indexer.k_norm.bias",
637 "model.layers.63.self_attn.indexers_proj",
638 "model.layers.63.self_attn.kv_a_layernorm",
639 "model.layers.63.self_attn.q_a_layernorm",
640 "model.layers.64.input_layernorm",
641 "model.layers.64.mlp.gate",
642 "model.layers.64.mlp.gate.e_score_correction_bias",
643 "model.layers.64.post_attention_layernorm",
644 "model.layers.64.self_attn.indexer.k_norm",
645 "model.layers.64.self_attn.indexer.k_norm.bias",
646 "model.layers.64.self_attn.indexers_proj",
647 "model.layers.64.self_attn.kv_a_layernorm",
648 "model.layers.64.self_attn.q_a_layernorm",
649 "model.layers.65.input_layernorm",
650 "model.layers.65.mlp.gate",
651 "model.layers.65.mlp.gate.e_score_correction_bias",
652 "model.layers.65.post_attention_layernorm",
653 "model.layers.65.self_attn.indexer.k_norm",
654 "model.layers.65.self_attn.indexer.k_norm.bias",
655 "model.layers.65.self_attn.indexers_proj",
656 "model.layers.65.self_attn.kv_a_layernorm",
657 "model.layers.65.self_attn.q_a_layernorm",
658 "model.layers.66.input_layernorm",
659 "model.layers.66.mlp.gate",
660 "model.layers.66.mlp.gate.e_score_correction_bias",
661 "model.layers.66.post_attention_layernorm",
662 "model.layers.66.self_attn.indexer.k_norm",
663 "model.layers.66.self_attn.indexer.k_norm.bias",
664 "model.layers.66.self_attn.indexers_proj",
665 "model.layers.66.self_attn.kv_a_layernorm",
666 "model.layers.66.self_attn.q_a_layernorm",
667 "model.layers.67.input_layernorm",
668 "model.layers.67.mlp.gate",
669 "model.layers.67.mlp.gate.e_score_correction_bias",
670 "model.layers.67.post_attention_layernorm",
671 "model.layers.67.self_attn.indexer.k_norm",
672 "model.layers.67.self_attn.indexer.k_norm.bias",
673 "model.layers.67.self_attn.indexers_proj",
674 "model.layers.67.self_attn.kv_a_layernorm",
675 "model.layers.67.self_attn.q_a_layernorm",
676 "model.layers.68.input_layernorm",
677 "model.layers.68.mlp.gate",
678 "model.layers.68.mlp.gate.e_score_correction_bias",
679 "model.layers.68.post_attention_layernorm",
680 "model.layers.68.self_attn.indexer.k_norm",
681 "model.layers.68.self_attn.indexer.k_norm.bias",
682 "model.layers.68.self_attn.indexers_proj",
683 "model.layers.68.self_attn.kv_a_layernorm",
684 "model.layers.68.self_attn.q_a_layernorm",
685 "model.layers.69.input_layernorm",
686 "model.layers.69.mlp.gate",
687 "model.layers.69.mlp.gate.e_score_correction_bias",
688 "model.layers.69.post_attention_layernorm",
689 "model.layers.69.self_attn.indexer.k_norm",
690 "model.layers.69.self_attn.indexer.k_norm.bias",
691 "model.layers.69.self_attn.indexers_proj",
692 "model.layers.69.self_attn.kv_a_layernorm",
693 "model.layers.69.self_attn.q_a_layernorm",
694 "model.layers.70.input_layernorm",
695 "model.layers.70.mlp.gate",
696 "model.layers.70.mlp.gate.e_score_correction_bias",
697 "model.layers.70.post_attention_layernorm",
698 "model.layers.70.self_attn.indexer.k_norm",
699 "model.layers.70.self_attn.indexer.k_norm.bias",
700 "model.layers.70.self_attn.indexers_proj",
701 "model.layers.70.self_attn.kv_a_layernorm",
702 "model.layers.70.self_attn.q_a_layernorm",
703 "model.layers.71.input_layernorm",
704 "model.layers.71.mlp.gate",
705 "model.layers.71.mlp.gate.e_score_correction_bias",
706 "model.layers.71.post_attention_layernorm",
707 "model.layers.71.self_attn.indexer.k_norm",
708 "model.layers.71.self_attn.indexer.k_norm.bias",
709 "model.layers.71.self_attn.indexers_proj",
710 "model.layers.71.self_attn.kv_a_layernorm",
711 "model.layers.71.self_attn.q_a_layernorm",
712 "model.layers.72.input_layernorm",
713 "model.layers.72.mlp.gate",
714 "model.layers.72.mlp.gate.e_score_correction_bias",
715 "model.layers.72.post_attention_layernorm",
716 "model.layers.72.self_attn.indexer.k_norm",
717 "model.layers.72.self_attn.indexer.k_norm.bias",
718 "model.layers.72.self_attn.indexers_proj",
719 "model.layers.72.self_attn.kv_a_layernorm",
720 "model.layers.72.self_attn.q_a_layernorm",
721 "model.layers.73.input_layernorm",
722 "model.layers.73.mlp.gate",
723 "model.layers.73.mlp.gate.e_score_correction_bias",
724 "model.layers.73.post_attention_layernorm",
725 "model.layers.73.self_attn.indexer.k_norm",
726 "model.layers.73.self_attn.indexer.k_norm.bias",
727 "model.layers.73.self_attn.indexers_proj",
728 "model.layers.73.self_attn.kv_a_layernorm",
729 "model.layers.73.self_attn.q_a_layernorm",
730 "model.layers.74.input_layernorm",
731 "model.layers.74.mlp.gate",
732 "model.layers.74.mlp.gate.e_score_correction_bias",
733 "model.layers.74.post_attention_layernorm",
734 "model.layers.74.self_attn.indexer.k_norm",
735 "model.layers.74.self_attn.indexer.k_norm.bias",
736 "model.layers.74.self_attn.indexers_proj",
737 "model.layers.74.self_attn.kv_a_layernorm",
738 "model.layers.74.self_attn.q_a_layernorm",
739 "model.layers.75.input_layernorm",
740 "model.layers.75.mlp.gate",
741 "model.layers.75.mlp.gate.e_score_correction_bias",
742 "model.layers.75.post_attention_layernorm",
743 "model.layers.75.self_attn.indexer.k_norm",
744 "model.layers.75.self_attn.indexer.k_norm.bias",
745 "model.layers.75.self_attn.indexers_proj",
746 "model.layers.75.self_attn.kv_a_layernorm",
747 "model.layers.75.self_attn.q_a_layernorm",
748 "model.layers.76.input_layernorm",
749 "model.layers.76.mlp.gate",
750 "model.layers.76.mlp.gate.e_score_correction_bias",
751 "model.layers.76.post_attention_layernorm",
752 "model.layers.76.self_attn.indexer.k_norm",
753 "model.layers.76.self_attn.indexer.k_norm.bias",
754 "model.layers.76.self_attn.indexers_proj",
755 "model.layers.76.self_attn.kv_a_layernorm",
756 "model.layers.76.self_attn.q_a_layernorm",
757 "model.layers.77.input_layernorm",
758 "model.layers.77.mlp.gate",
759 "model.layers.77.mlp.gate.e_score_correction_bias",
760 "model.layers.77.post_attention_layernorm",
761 "model.layers.77.self_attn.indexer.k_norm",
762 "model.layers.77.self_attn.indexer.k_norm.bias",
763 "model.layers.77.self_attn.indexers_proj",
764 "model.layers.77.self_attn.kv_a_layernorm",
765 "model.layers.77.self_attn.q_a_layernorm",
766 "model.layers.78.eh_proj",
767 "model.layers.78.enorm",
768 "model.layers.78.hnorm",
769 "model.layers.78.input_layernorm",
770 "model.layers.78.mlp.gate",
771 "model.layers.78.mlp.gate.e_score_correction_bias",
772 "model.layers.78.post_attention_layernorm",
773 "model.layers.78.self_attn.indexer.k_norm",
774 "model.layers.78.self_attn.indexer.k_norm.bias",
775 "model.layers.78.self_attn.indexers_proj",
776 "model.layers.78.self_attn.kv_a_layernorm",
777 "model.layers.78.self_attn.q_a_layernorm",
778 "model.layers.78.shared_head.norm",
779 "model.norm"
780 ]
781 }
782 }
783