config.json · GLM-5-FP8

1

{

2

"architectures": [

3

"GlmMoeDsaForCausalLM"

4

],

5

"attention_bias": false,

6

"attention_dropout": 0.0,

7

"dtype": "bfloat16",

8

"eos_token_id": [

9

154820,

10

154827,

11

154829

12

],

13

"ep_size": 1,

14

"first_k_dense_replace": 3,

15

"hidden_act": "silu",

16

"head_dim": 64,

17

"hidden_size": 6144,

18

"index_head_dim": 128,

19

"index_n_heads": 32,

20

"index_topk": 2048,

21

"indexer_rope_interleave": true,

22

"initializer_range": 0.02,

23

"intermediate_size": 12288,

24

"kv_lora_rank": 512,

25

"max_position_embeddings": 202752,

26

"moe_intermediate_size": 2048,

27

"moe_layer_freq": 1,

28

"model_type": "glm_moe_dsa",

29

"n_group": 1,

30

"n_routed_experts": 256,

31

"n_shared_experts": 1,

32

"norm_topk_prob": true,

33

"num_attention_heads": 64,

34

"num_experts_per_tok": 8,

35

"num_hidden_layers": 78,

36

"num_key_value_heads": 64,

37

"num_nextn_predict_layers": 1,

38

"pad_token_id": 154820,

39

"pretraining_tp": 1,

40

"q_lora_rank": 2048,

41

"qk_head_dim": 256,

42

"qk_nope_head_dim": 192,

43

"qk_rope_head_dim": 64,

44

"rms_norm_eps": 1e-05,

45

"rope_interleave": true,

46

"rope_parameters": {

47

"rope_theta": 1000000,

48

"rope_type": "default"

49

},

50

"routed_scaling_factor": 2.5,

51

"scoring_func": "sigmoid",

52

"tie_word_embeddings": false,

53

"topk_group": 1,

54

"topk_method": "noaux_tc",

55

"transformers_version": "5.0.2.dev0",

56

"use_cache": true,

57

"v_head_dim": 256,

58

"vocab_size": 154880,

59

"quantization_config": {

60

"activation_scheme": "dynamic",

61

"fmt": "e4m3",

62

"quant_method": "fp8",

63

"weight_block_size": [

64

128,

65

128

66

],

67

"modules_to_not_convert": [

68

"lm_head",

69

"model.embed_tokens",

70

"model.layers.0.input_layernorm",

71

"model.layers.0.post_attention_layernorm",

72

"model.layers.0.self_attn.indexer.k_norm",

73

"model.layers.0.self_attn.indexer.k_norm.bias",

74

"model.layers.0.self_attn.indexers_proj",

75

"model.layers.0.self_attn.kv_a_layernorm",

76

"model.layers.0.self_attn.q_a_layernorm",

77

"model.layers.1.input_layernorm",

78

"model.layers.1.post_attention_layernorm",

79

"model.layers.1.self_attn.indexer.k_norm",

80

"model.layers.1.self_attn.indexer.k_norm.bias",

81

"model.layers.1.self_attn.indexers_proj",

82

"model.layers.1.self_attn.kv_a_layernorm",

83

"model.layers.1.self_attn.q_a_layernorm",

84

"model.layers.2.input_layernorm",

85

"model.layers.2.post_attention_layernorm",

86

"model.layers.2.self_attn.indexer.k_norm",

87

"model.layers.2.self_attn.indexer.k_norm.bias",

88

"model.layers.2.self_attn.indexers_proj",

89

"model.layers.2.self_attn.kv_a_layernorm",

90

"model.layers.2.self_attn.q_a_layernorm",

91

"model.layers.3.input_layernorm",

92

"model.layers.3.mlp.gate",

93

"model.layers.3.mlp.gate.e_score_correction_bias",

94

"model.layers.3.post_attention_layernorm",

95

"model.layers.3.self_attn.indexer.k_norm",

96

"model.layers.3.self_attn.indexer.k_norm.bias",

97

"model.layers.3.self_attn.indexers_proj",

98

"model.layers.3.self_attn.kv_a_layernorm",

99

"model.layers.3.self_attn.q_a_layernorm",

100

"model.layers.4.input_layernorm",

101

"model.layers.4.mlp.gate",

102

"model.layers.4.mlp.gate.e_score_correction_bias",

103

"model.layers.4.post_attention_layernorm",

104

"model.layers.4.self_attn.indexer.k_norm",

105

"model.layers.4.self_attn.indexer.k_norm.bias",

106

"model.layers.4.self_attn.indexers_proj",

107

"model.layers.4.self_attn.kv_a_layernorm",

108

"model.layers.4.self_attn.q_a_layernorm",

109

"model.layers.5.input_layernorm",

110

"model.layers.5.mlp.gate",

111

"model.layers.5.mlp.gate.e_score_correction_bias",

112

"model.layers.5.post_attention_layernorm",

113

"model.layers.5.self_attn.indexer.k_norm",

114

"model.layers.5.self_attn.indexer.k_norm.bias",

115

"model.layers.5.self_attn.indexers_proj",

116

"model.layers.5.self_attn.kv_a_layernorm",

117

"model.layers.5.self_attn.q_a_layernorm",

118

"model.layers.6.input_layernorm",

119

"model.layers.6.mlp.gate",

120

"model.layers.6.mlp.gate.e_score_correction_bias",

121

"model.layers.6.post_attention_layernorm",

122

"model.layers.6.self_attn.indexer.k_norm",

123

"model.layers.6.self_attn.indexer.k_norm.bias",

124

"model.layers.6.self_attn.indexers_proj",

125

"model.layers.6.self_attn.kv_a_layernorm",

126

"model.layers.6.self_attn.q_a_layernorm",

127

"model.layers.7.input_layernorm",

128

"model.layers.7.mlp.gate",

129

"model.layers.7.mlp.gate.e_score_correction_bias",

130

"model.layers.7.post_attention_layernorm",

131

"model.layers.7.self_attn.indexer.k_norm",

132

"model.layers.7.self_attn.indexer.k_norm.bias",

133

"model.layers.7.self_attn.indexers_proj",

134

"model.layers.7.self_attn.kv_a_layernorm",

135

"model.layers.7.self_attn.q_a_layernorm",

136

"model.layers.8.input_layernorm",

137

"model.layers.8.mlp.gate",

138

"model.layers.8.mlp.gate.e_score_correction_bias",

139

"model.layers.8.post_attention_layernorm",

140

"model.layers.8.self_attn.indexer.k_norm",

141

"model.layers.8.self_attn.indexer.k_norm.bias",

142

"model.layers.8.self_attn.indexers_proj",

143

"model.layers.8.self_attn.kv_a_layernorm",

144

"model.layers.8.self_attn.q_a_layernorm",

145

"model.layers.9.input_layernorm",

146

"model.layers.9.mlp.gate",

147

"model.layers.9.mlp.gate.e_score_correction_bias",

148

"model.layers.9.post_attention_layernorm",

149

"model.layers.9.self_attn.indexer.k_norm",

150

"model.layers.9.self_attn.indexer.k_norm.bias",

151

"model.layers.9.self_attn.indexers_proj",

152

"model.layers.9.self_attn.kv_a_layernorm",

153

"model.layers.9.self_attn.q_a_layernorm",

154

"model.layers.10.input_layernorm",

155

"model.layers.10.mlp.gate",

156

"model.layers.10.mlp.gate.e_score_correction_bias",

157

"model.layers.10.post_attention_layernorm",

158

"model.layers.10.self_attn.indexer.k_norm",

159

"model.layers.10.self_attn.indexer.k_norm.bias",

160

"model.layers.10.self_attn.indexers_proj",

161

"model.layers.10.self_attn.kv_a_layernorm",

162

"model.layers.10.self_attn.q_a_layernorm",

163

"model.layers.11.input_layernorm",

164

"model.layers.11.mlp.gate",

165

"model.layers.11.mlp.gate.e_score_correction_bias",

166

"model.layers.11.post_attention_layernorm",

167

"model.layers.11.self_attn.indexer.k_norm",

168

"model.layers.11.self_attn.indexer.k_norm.bias",

169

"model.layers.11.self_attn.indexers_proj",

170

"model.layers.11.self_attn.kv_a_layernorm",

171

"model.layers.11.self_attn.q_a_layernorm",

172

"model.layers.12.input_layernorm",

173

"model.layers.12.mlp.gate",

174

"model.layers.12.mlp.gate.e_score_correction_bias",

175

"model.layers.12.post_attention_layernorm",

176

"model.layers.12.self_attn.indexer.k_norm",

177

"model.layers.12.self_attn.indexer.k_norm.bias",

178

"model.layers.12.self_attn.indexers_proj",

179

"model.layers.12.self_attn.kv_a_layernorm",

180

"model.layers.12.self_attn.q_a_layernorm",

181

"model.layers.13.input_layernorm",

182

"model.layers.13.mlp.gate",

183

"model.layers.13.mlp.gate.e_score_correction_bias",

184

"model.layers.13.post_attention_layernorm",

185

"model.layers.13.self_attn.indexer.k_norm",

186

"model.layers.13.self_attn.indexer.k_norm.bias",

187

"model.layers.13.self_attn.indexers_proj",

188

"model.layers.13.self_attn.kv_a_layernorm",

189

"model.layers.13.self_attn.q_a_layernorm",

190

"model.layers.14.input_layernorm",

191

"model.layers.14.mlp.gate",

192

"model.layers.14.mlp.gate.e_score_correction_bias",

193

"model.layers.14.post_attention_layernorm",

194

"model.layers.14.self_attn.indexer.k_norm",

195

"model.layers.14.self_attn.indexer.k_norm.bias",

196

"model.layers.14.self_attn.indexers_proj",

197

"model.layers.14.self_attn.kv_a_layernorm",

198

"model.layers.14.self_attn.q_a_layernorm",

199

"model.layers.15.input_layernorm",

200

"model.layers.15.mlp.gate",

201

"model.layers.15.mlp.gate.e_score_correction_bias",

202

"model.layers.15.post_attention_layernorm",

203

"model.layers.15.self_attn.indexer.k_norm",

204

"model.layers.15.self_attn.indexer.k_norm.bias",

205

"model.layers.15.self_attn.indexers_proj",

206

"model.layers.15.self_attn.kv_a_layernorm",

207

"model.layers.15.self_attn.q_a_layernorm",

208

"model.layers.16.input_layernorm",

209

"model.layers.16.mlp.gate",

210

"model.layers.16.mlp.gate.e_score_correction_bias",

211

"model.layers.16.post_attention_layernorm",

212

"model.layers.16.self_attn.indexer.k_norm",

213

"model.layers.16.self_attn.indexer.k_norm.bias",

214

"model.layers.16.self_attn.indexers_proj",

215

"model.layers.16.self_attn.kv_a_layernorm",

216

"model.layers.16.self_attn.q_a_layernorm",

217

"model.layers.17.input_layernorm",

218

"model.layers.17.mlp.gate",

219

"model.layers.17.mlp.gate.e_score_correction_bias",

220

"model.layers.17.post_attention_layernorm",

221

"model.layers.17.self_attn.indexer.k_norm",

222

"model.layers.17.self_attn.indexer.k_norm.bias",

223

"model.layers.17.self_attn.indexers_proj",

224

"model.layers.17.self_attn.kv_a_layernorm",

225

"model.layers.17.self_attn.q_a_layernorm",

226

"model.layers.18.input_layernorm",

227

"model.layers.18.mlp.gate",

228

"model.layers.18.mlp.gate.e_score_correction_bias",

229

"model.layers.18.post_attention_layernorm",

230

"model.layers.18.self_attn.indexer.k_norm",

231

"model.layers.18.self_attn.indexer.k_norm.bias",

232

"model.layers.18.self_attn.indexers_proj",

233

"model.layers.18.self_attn.kv_a_layernorm",

234

"model.layers.18.self_attn.q_a_layernorm",

235

"model.layers.19.input_layernorm",

236

"model.layers.19.mlp.gate",

237

"model.layers.19.mlp.gate.e_score_correction_bias",

238

"model.layers.19.post_attention_layernorm",

239

"model.layers.19.self_attn.indexer.k_norm",

240

"model.layers.19.self_attn.indexer.k_norm.bias",

241

"model.layers.19.self_attn.indexers_proj",

242

"model.layers.19.self_attn.kv_a_layernorm",

243

"model.layers.19.self_attn.q_a_layernorm",

244

"model.layers.20.input_layernorm",

245

"model.layers.20.mlp.gate",

246

"model.layers.20.mlp.gate.e_score_correction_bias",

247

"model.layers.20.post_attention_layernorm",

248

"model.layers.20.self_attn.indexer.k_norm",

249

"model.layers.20.self_attn.indexer.k_norm.bias",

250

"model.layers.20.self_attn.indexers_proj",

251

"model.layers.20.self_attn.kv_a_layernorm",

252

"model.layers.20.self_attn.q_a_layernorm",

253

"model.layers.21.input_layernorm",

254

"model.layers.21.mlp.gate",

255

"model.layers.21.mlp.gate.e_score_correction_bias",

256

"model.layers.21.post_attention_layernorm",

257

"model.layers.21.self_attn.indexer.k_norm",

258

"model.layers.21.self_attn.indexer.k_norm.bias",

259

"model.layers.21.self_attn.indexers_proj",

260

"model.layers.21.self_attn.kv_a_layernorm",

261

"model.layers.21.self_attn.q_a_layernorm",

262

"model.layers.22.input_layernorm",

263

"model.layers.22.mlp.gate",

264

"model.layers.22.mlp.gate.e_score_correction_bias",

265

"model.layers.22.post_attention_layernorm",

266

"model.layers.22.self_attn.indexer.k_norm",

267

"model.layers.22.self_attn.indexer.k_norm.bias",

268

"model.layers.22.self_attn.indexers_proj",

269

"model.layers.22.self_attn.kv_a_layernorm",

270

"model.layers.22.self_attn.q_a_layernorm",

271

"model.layers.23.input_layernorm",

272

"model.layers.23.mlp.gate",

273

"model.layers.23.mlp.gate.e_score_correction_bias",

274

"model.layers.23.post_attention_layernorm",

275

"model.layers.23.self_attn.indexer.k_norm",

276

"model.layers.23.self_attn.indexer.k_norm.bias",

277

"model.layers.23.self_attn.indexers_proj",

278

"model.layers.23.self_attn.kv_a_layernorm",

279

"model.layers.23.self_attn.q_a_layernorm",

280

"model.layers.24.input_layernorm",

281

"model.layers.24.mlp.gate",

282

"model.layers.24.mlp.gate.e_score_correction_bias",

283

"model.layers.24.post_attention_layernorm",

284

"model.layers.24.self_attn.indexer.k_norm",

285

"model.layers.24.self_attn.indexer.k_norm.bias",

286

"model.layers.24.self_attn.indexers_proj",

287

"model.layers.24.self_attn.kv_a_layernorm",

288

"model.layers.24.self_attn.q_a_layernorm",

289

"model.layers.25.input_layernorm",

290

"model.layers.25.mlp.gate",

291

"model.layers.25.mlp.gate.e_score_correction_bias",

292

"model.layers.25.post_attention_layernorm",

293

"model.layers.25.self_attn.indexer.k_norm",

294

"model.layers.25.self_attn.indexer.k_norm.bias",

295

"model.layers.25.self_attn.indexers_proj",

296

"model.layers.25.self_attn.kv_a_layernorm",

297

"model.layers.25.self_attn.q_a_layernorm",

298

"model.layers.26.input_layernorm",

299

"model.layers.26.mlp.gate",

300

"model.layers.26.mlp.gate.e_score_correction_bias",

301

"model.layers.26.post_attention_layernorm",

302

"model.layers.26.self_attn.indexer.k_norm",

303

"model.layers.26.self_attn.indexer.k_norm.bias",

304

"model.layers.26.self_attn.indexers_proj",

305

"model.layers.26.self_attn.kv_a_layernorm",

306

"model.layers.26.self_attn.q_a_layernorm",

307

"model.layers.27.input_layernorm",

308

"model.layers.27.mlp.gate",

309

"model.layers.27.mlp.gate.e_score_correction_bias",

310

"model.layers.27.post_attention_layernorm",

311

"model.layers.27.self_attn.indexer.k_norm",

312

"model.layers.27.self_attn.indexer.k_norm.bias",

313

"model.layers.27.self_attn.indexers_proj",

314

"model.layers.27.self_attn.kv_a_layernorm",

315

"model.layers.27.self_attn.q_a_layernorm",

316

"model.layers.28.input_layernorm",

317

"model.layers.28.mlp.gate",

318

"model.layers.28.mlp.gate.e_score_correction_bias",

319

"model.layers.28.post_attention_layernorm",

320

"model.layers.28.self_attn.indexer.k_norm",

321

"model.layers.28.self_attn.indexer.k_norm.bias",

322

"model.layers.28.self_attn.indexers_proj",

323

"model.layers.28.self_attn.kv_a_layernorm",

324

"model.layers.28.self_attn.q_a_layernorm",

325

"model.layers.29.input_layernorm",

326

"model.layers.29.mlp.gate",

327

"model.layers.29.mlp.gate.e_score_correction_bias",

328

"model.layers.29.post_attention_layernorm",

329

"model.layers.29.self_attn.indexer.k_norm",

330

"model.layers.29.self_attn.indexer.k_norm.bias",

331

"model.layers.29.self_attn.indexers_proj",

332

"model.layers.29.self_attn.kv_a_layernorm",

333

"model.layers.29.self_attn.q_a_layernorm",

334

"model.layers.30.input_layernorm",

335

"model.layers.30.mlp.gate",

336

"model.layers.30.mlp.gate.e_score_correction_bias",

337

"model.layers.30.post_attention_layernorm",

338

"model.layers.30.self_attn.indexer.k_norm",

339

"model.layers.30.self_attn.indexer.k_norm.bias",

340

"model.layers.30.self_attn.indexers_proj",

341

"model.layers.30.self_attn.kv_a_layernorm",

342

"model.layers.30.self_attn.q_a_layernorm",

343

"model.layers.31.input_layernorm",

344

"model.layers.31.mlp.gate",

345

"model.layers.31.mlp.gate.e_score_correction_bias",

346

"model.layers.31.post_attention_layernorm",

347

"model.layers.31.self_attn.indexer.k_norm",

348

"model.layers.31.self_attn.indexer.k_norm.bias",

349

"model.layers.31.self_attn.indexers_proj",

350

"model.layers.31.self_attn.kv_a_layernorm",

351

"model.layers.31.self_attn.q_a_layernorm",

352

"model.layers.32.input_layernorm",

353

"model.layers.32.mlp.gate",

354

"model.layers.32.mlp.gate.e_score_correction_bias",

355

"model.layers.32.post_attention_layernorm",

356

"model.layers.32.self_attn.indexer.k_norm",

357

"model.layers.32.self_attn.indexer.k_norm.bias",

358

"model.layers.32.self_attn.indexers_proj",

359

"model.layers.32.self_attn.kv_a_layernorm",

360

"model.layers.32.self_attn.q_a_layernorm",

361

"model.layers.33.input_layernorm",

362

"model.layers.33.mlp.gate",

363

"model.layers.33.mlp.gate.e_score_correction_bias",

364

"model.layers.33.post_attention_layernorm",

365

"model.layers.33.self_attn.indexer.k_norm",

366

"model.layers.33.self_attn.indexer.k_norm.bias",

367

"model.layers.33.self_attn.indexers_proj",

368

"model.layers.33.self_attn.kv_a_layernorm",

369

"model.layers.33.self_attn.q_a_layernorm",

370

"model.layers.34.input_layernorm",

371

"model.layers.34.mlp.gate",

372

"model.layers.34.mlp.gate.e_score_correction_bias",

373

"model.layers.34.post_attention_layernorm",

374

"model.layers.34.self_attn.indexer.k_norm",

375

"model.layers.34.self_attn.indexer.k_norm.bias",

376

"model.layers.34.self_attn.indexers_proj",

377

"model.layers.34.self_attn.kv_a_layernorm",

378

"model.layers.34.self_attn.q_a_layernorm",

379

"model.layers.35.input_layernorm",

380

"model.layers.35.mlp.gate",

381

"model.layers.35.mlp.gate.e_score_correction_bias",

382

"model.layers.35.post_attention_layernorm",

383

"model.layers.35.self_attn.indexer.k_norm",

384

"model.layers.35.self_attn.indexer.k_norm.bias",

385

"model.layers.35.self_attn.indexers_proj",

386

"model.layers.35.self_attn.kv_a_layernorm",

387

"model.layers.35.self_attn.q_a_layernorm",

388

"model.layers.36.input_layernorm",

389

"model.layers.36.mlp.gate",

390

"model.layers.36.mlp.gate.e_score_correction_bias",

391

"model.layers.36.post_attention_layernorm",

392

"model.layers.36.self_attn.indexer.k_norm",

393

"model.layers.36.self_attn.indexer.k_norm.bias",

394

"model.layers.36.self_attn.indexers_proj",

395

"model.layers.36.self_attn.kv_a_layernorm",

396

"model.layers.36.self_attn.q_a_layernorm",

397

"model.layers.37.input_layernorm",

398

"model.layers.37.mlp.gate",

399

"model.layers.37.mlp.gate.e_score_correction_bias",

400

"model.layers.37.post_attention_layernorm",

401

"model.layers.37.self_attn.indexer.k_norm",

402

"model.layers.37.self_attn.indexer.k_norm.bias",

403

"model.layers.37.self_attn.indexers_proj",

404

"model.layers.37.self_attn.kv_a_layernorm",

405

"model.layers.37.self_attn.q_a_layernorm",

406

"model.layers.38.input_layernorm",

407

"model.layers.38.mlp.gate",

408

"model.layers.38.mlp.gate.e_score_correction_bias",

409

"model.layers.38.post_attention_layernorm",

410

"model.layers.38.self_attn.indexer.k_norm",

411

"model.layers.38.self_attn.indexer.k_norm.bias",

412

"model.layers.38.self_attn.indexers_proj",

413

"model.layers.38.self_attn.kv_a_layernorm",

414

"model.layers.38.self_attn.q_a_layernorm",

415

"model.layers.39.input_layernorm",

416

"model.layers.39.mlp.gate",

417

"model.layers.39.mlp.gate.e_score_correction_bias",

418

"model.layers.39.post_attention_layernorm",

419

"model.layers.39.self_attn.indexer.k_norm",

420

"model.layers.39.self_attn.indexer.k_norm.bias",

421

"model.layers.39.self_attn.indexers_proj",

422

"model.layers.39.self_attn.kv_a_layernorm",

423

"model.layers.39.self_attn.q_a_layernorm",

424

"model.layers.40.input_layernorm",

425

"model.layers.40.mlp.gate",

426

"model.layers.40.mlp.gate.e_score_correction_bias",

427

"model.layers.40.post_attention_layernorm",

428

"model.layers.40.self_attn.indexer.k_norm",

429

"model.layers.40.self_attn.indexer.k_norm.bias",

430

"model.layers.40.self_attn.indexers_proj",

431

"model.layers.40.self_attn.kv_a_layernorm",

432

"model.layers.40.self_attn.q_a_layernorm",

433

"model.layers.41.input_layernorm",

434

"model.layers.41.mlp.gate",

435

"model.layers.41.mlp.gate.e_score_correction_bias",

436

"model.layers.41.post_attention_layernorm",

437

"model.layers.41.self_attn.indexer.k_norm",

438

"model.layers.41.self_attn.indexer.k_norm.bias",

439

"model.layers.41.self_attn.indexers_proj",

440

"model.layers.41.self_attn.kv_a_layernorm",

441

"model.layers.41.self_attn.q_a_layernorm",

442

"model.layers.42.input_layernorm",

443

"model.layers.42.mlp.gate",

444

"model.layers.42.mlp.gate.e_score_correction_bias",

445

"model.layers.42.post_attention_layernorm",

446

"model.layers.42.self_attn.indexer.k_norm",

447

"model.layers.42.self_attn.indexer.k_norm.bias",

448

"model.layers.42.self_attn.indexers_proj",

449

"model.layers.42.self_attn.kv_a_layernorm",

450

"model.layers.42.self_attn.q_a_layernorm",

451

"model.layers.43.input_layernorm",

452

"model.layers.43.mlp.gate",

453

"model.layers.43.mlp.gate.e_score_correction_bias",

454

"model.layers.43.post_attention_layernorm",

455

"model.layers.43.self_attn.indexer.k_norm",

456

"model.layers.43.self_attn.indexer.k_norm.bias",

457

"model.layers.43.self_attn.indexers_proj",

458

"model.layers.43.self_attn.kv_a_layernorm",

459

"model.layers.43.self_attn.q_a_layernorm",

460

"model.layers.44.input_layernorm",

461

"model.layers.44.mlp.gate",

462

"model.layers.44.mlp.gate.e_score_correction_bias",

463

"model.layers.44.post_attention_layernorm",

464

"model.layers.44.self_attn.indexer.k_norm",

465

"model.layers.44.self_attn.indexer.k_norm.bias",

466

"model.layers.44.self_attn.indexers_proj",

467

"model.layers.44.self_attn.kv_a_layernorm",

468

"model.layers.44.self_attn.q_a_layernorm",

469

"model.layers.45.input_layernorm",

470

"model.layers.45.mlp.gate",

471

"model.layers.45.mlp.gate.e_score_correction_bias",

472

"model.layers.45.post_attention_layernorm",

473

"model.layers.45.self_attn.indexer.k_norm",

474

"model.layers.45.self_attn.indexer.k_norm.bias",

475

"model.layers.45.self_attn.indexers_proj",

476

"model.layers.45.self_attn.kv_a_layernorm",

477

"model.layers.45.self_attn.q_a_layernorm",

478

"model.layers.46.input_layernorm",

479

"model.layers.46.mlp.gate",

480

"model.layers.46.mlp.gate.e_score_correction_bias",

481

"model.layers.46.post_attention_layernorm",

482

"model.layers.46.self_attn.indexer.k_norm",

483

"model.layers.46.self_attn.indexer.k_norm.bias",

484

"model.layers.46.self_attn.indexers_proj",

485

"model.layers.46.self_attn.kv_a_layernorm",

486

"model.layers.46.self_attn.q_a_layernorm",

487

"model.layers.47.input_layernorm",

488

"model.layers.47.mlp.gate",

489

"model.layers.47.mlp.gate.e_score_correction_bias",

490

"model.layers.47.post_attention_layernorm",

491

"model.layers.47.self_attn.indexer.k_norm",

492

"model.layers.47.self_attn.indexer.k_norm.bias",

493

"model.layers.47.self_attn.indexers_proj",

494

"model.layers.47.self_attn.kv_a_layernorm",

495

"model.layers.47.self_attn.q_a_layernorm",

496

"model.layers.48.input_layernorm",

497

"model.layers.48.mlp.gate",

498

"model.layers.48.mlp.gate.e_score_correction_bias",

499

"model.layers.48.post_attention_layernorm",

500

"model.layers.48.self_attn.indexer.k_norm",

501

"model.layers.48.self_attn.indexer.k_norm.bias",

502

"model.layers.48.self_attn.indexers_proj",

503

"model.layers.48.self_attn.kv_a_layernorm",

504

"model.layers.48.self_attn.q_a_layernorm",

505

"model.layers.49.input_layernorm",

506

"model.layers.49.mlp.gate",

507

"model.layers.49.mlp.gate.e_score_correction_bias",

508

"model.layers.49.post_attention_layernorm",

509

"model.layers.49.self_attn.indexer.k_norm",

510

"model.layers.49.self_attn.indexer.k_norm.bias",

511

"model.layers.49.self_attn.indexers_proj",

512

"model.layers.49.self_attn.kv_a_layernorm",

513

"model.layers.49.self_attn.q_a_layernorm",

514

"model.layers.50.input_layernorm",

515

"model.layers.50.mlp.gate",

516

"model.layers.50.mlp.gate.e_score_correction_bias",

517

"model.layers.50.post_attention_layernorm",

518

"model.layers.50.self_attn.indexer.k_norm",

519

"model.layers.50.self_attn.indexer.k_norm.bias",

520

"model.layers.50.self_attn.indexers_proj",

521

"model.layers.50.self_attn.kv_a_layernorm",

522

"model.layers.50.self_attn.q_a_layernorm",

523

"model.layers.51.input_layernorm",

524

"model.layers.51.mlp.gate",

525

"model.layers.51.mlp.gate.e_score_correction_bias",

526

"model.layers.51.post_attention_layernorm",

527

"model.layers.51.self_attn.indexer.k_norm",

528

"model.layers.51.self_attn.indexer.k_norm.bias",

529

"model.layers.51.self_attn.indexers_proj",

530

"model.layers.51.self_attn.kv_a_layernorm",

531

"model.layers.51.self_attn.q_a_layernorm",

532

"model.layers.52.input_layernorm",

533

"model.layers.52.mlp.gate",

534

"model.layers.52.mlp.gate.e_score_correction_bias",

535

"model.layers.52.post_attention_layernorm",

536

"model.layers.52.self_attn.indexer.k_norm",

537

"model.layers.52.self_attn.indexer.k_norm.bias",

538

"model.layers.52.self_attn.indexers_proj",

539

"model.layers.52.self_attn.kv_a_layernorm",

540

"model.layers.52.self_attn.q_a_layernorm",

541

"model.layers.53.input_layernorm",

542

"model.layers.53.mlp.gate",

543

"model.layers.53.mlp.gate.e_score_correction_bias",

544

"model.layers.53.post_attention_layernorm",

545

"model.layers.53.self_attn.indexer.k_norm",

546

"model.layers.53.self_attn.indexer.k_norm.bias",

547

"model.layers.53.self_attn.indexers_proj",

548

"model.layers.53.self_attn.kv_a_layernorm",

549

"model.layers.53.self_attn.q_a_layernorm",

550

"model.layers.54.input_layernorm",

551

"model.layers.54.mlp.gate",

552

"model.layers.54.mlp.gate.e_score_correction_bias",

553

"model.layers.54.post_attention_layernorm",

554

"model.layers.54.self_attn.indexer.k_norm",

555

"model.layers.54.self_attn.indexer.k_norm.bias",

556

"model.layers.54.self_attn.indexers_proj",

557

"model.layers.54.self_attn.kv_a_layernorm",

558

"model.layers.54.self_attn.q_a_layernorm",

559

"model.layers.55.input_layernorm",

560

"model.layers.55.mlp.gate",

561

"model.layers.55.mlp.gate.e_score_correction_bias",

562

"model.layers.55.post_attention_layernorm",

563

"model.layers.55.self_attn.indexer.k_norm",

564

"model.layers.55.self_attn.indexer.k_norm.bias",

565

"model.layers.55.self_attn.indexers_proj",

566

"model.layers.55.self_attn.kv_a_layernorm",

567

"model.layers.55.self_attn.q_a_layernorm",

568

"model.layers.56.input_layernorm",

569

"model.layers.56.mlp.gate",

570

"model.layers.56.mlp.gate.e_score_correction_bias",

571

"model.layers.56.post_attention_layernorm",

572

"model.layers.56.self_attn.indexer.k_norm",

573

"model.layers.56.self_attn.indexer.k_norm.bias",

574

"model.layers.56.self_attn.indexers_proj",

575

"model.layers.56.self_attn.kv_a_layernorm",

576

"model.layers.56.self_attn.q_a_layernorm",

577

"model.layers.57.input_layernorm",

578

"model.layers.57.mlp.gate",

579

"model.layers.57.mlp.gate.e_score_correction_bias",

580

"model.layers.57.post_attention_layernorm",

581

"model.layers.57.self_attn.indexer.k_norm",

582

"model.layers.57.self_attn.indexer.k_norm.bias",

583

"model.layers.57.self_attn.indexers_proj",

584

"model.layers.57.self_attn.kv_a_layernorm",

585

"model.layers.57.self_attn.q_a_layernorm",

586

"model.layers.58.input_layernorm",

587

"model.layers.58.mlp.gate",

588

"model.layers.58.mlp.gate.e_score_correction_bias",

589

"model.layers.58.post_attention_layernorm",

590

"model.layers.58.self_attn.indexer.k_norm",

591

"model.layers.58.self_attn.indexer.k_norm.bias",

592

"model.layers.58.self_attn.indexers_proj",

593

"model.layers.58.self_attn.kv_a_layernorm",

594

"model.layers.58.self_attn.q_a_layernorm",

595

"model.layers.59.input_layernorm",

596

"model.layers.59.mlp.gate",

597

"model.layers.59.mlp.gate.e_score_correction_bias",

598

"model.layers.59.post_attention_layernorm",

599

"model.layers.59.self_attn.indexer.k_norm",

600

"model.layers.59.self_attn.indexer.k_norm.bias",

601

"model.layers.59.self_attn.indexers_proj",

602

"model.layers.59.self_attn.kv_a_layernorm",

603

"model.layers.59.self_attn.q_a_layernorm",

604

"model.layers.60.input_layernorm",

605

"model.layers.60.mlp.gate",

606

"model.layers.60.mlp.gate.e_score_correction_bias",

607

"model.layers.60.post_attention_layernorm",

608

"model.layers.60.self_attn.indexer.k_norm",

609

"model.layers.60.self_attn.indexer.k_norm.bias",

610

"model.layers.60.self_attn.indexers_proj",

611

"model.layers.60.self_attn.kv_a_layernorm",

612

"model.layers.60.self_attn.q_a_layernorm",

613

"model.layers.61.input_layernorm",

614

"model.layers.61.mlp.gate",

615

"model.layers.61.mlp.gate.e_score_correction_bias",

616

"model.layers.61.post_attention_layernorm",

617

"model.layers.61.self_attn.indexer.k_norm",

618

"model.layers.61.self_attn.indexer.k_norm.bias",

619

"model.layers.61.self_attn.indexers_proj",

620

"model.layers.61.self_attn.kv_a_layernorm",

621

"model.layers.61.self_attn.q_a_layernorm",

622

"model.layers.62.input_layernorm",

623

"model.layers.62.mlp.gate",

624

"model.layers.62.mlp.gate.e_score_correction_bias",

625

"model.layers.62.post_attention_layernorm",

626

"model.layers.62.self_attn.indexer.k_norm",

627

"model.layers.62.self_attn.indexer.k_norm.bias",

628

"model.layers.62.self_attn.indexers_proj",

629

"model.layers.62.self_attn.kv_a_layernorm",

630

"model.layers.62.self_attn.q_a_layernorm",

631

"model.layers.63.input_layernorm",

632

"model.layers.63.mlp.gate",

633

"model.layers.63.mlp.gate.e_score_correction_bias",

634

"model.layers.63.post_attention_layernorm",

635

"model.layers.63.self_attn.indexer.k_norm",

636

"model.layers.63.self_attn.indexer.k_norm.bias",

637

"model.layers.63.self_attn.indexers_proj",

638

"model.layers.63.self_attn.kv_a_layernorm",

639

"model.layers.63.self_attn.q_a_layernorm",

640

"model.layers.64.input_layernorm",

641

"model.layers.64.mlp.gate",

642

"model.layers.64.mlp.gate.e_score_correction_bias",

643

"model.layers.64.post_attention_layernorm",

644

"model.layers.64.self_attn.indexer.k_norm",

645

"model.layers.64.self_attn.indexer.k_norm.bias",

646

"model.layers.64.self_attn.indexers_proj",

647

"model.layers.64.self_attn.kv_a_layernorm",

648

"model.layers.64.self_attn.q_a_layernorm",

649

"model.layers.65.input_layernorm",

650

"model.layers.65.mlp.gate",

651

"model.layers.65.mlp.gate.e_score_correction_bias",

652

"model.layers.65.post_attention_layernorm",

653

"model.layers.65.self_attn.indexer.k_norm",

654

"model.layers.65.self_attn.indexer.k_norm.bias",

655

"model.layers.65.self_attn.indexers_proj",

656

"model.layers.65.self_attn.kv_a_layernorm",

657

"model.layers.65.self_attn.q_a_layernorm",

658

"model.layers.66.input_layernorm",

659

"model.layers.66.mlp.gate",

660

"model.layers.66.mlp.gate.e_score_correction_bias",

661

"model.layers.66.post_attention_layernorm",

662

"model.layers.66.self_attn.indexer.k_norm",

663

"model.layers.66.self_attn.indexer.k_norm.bias",

664

"model.layers.66.self_attn.indexers_proj",

665

"model.layers.66.self_attn.kv_a_layernorm",

666

"model.layers.66.self_attn.q_a_layernorm",

667

"model.layers.67.input_layernorm",

668

"model.layers.67.mlp.gate",

669

"model.layers.67.mlp.gate.e_score_correction_bias",

670

"model.layers.67.post_attention_layernorm",

671

"model.layers.67.self_attn.indexer.k_norm",

672

"model.layers.67.self_attn.indexer.k_norm.bias",

673

"model.layers.67.self_attn.indexers_proj",

674

"model.layers.67.self_attn.kv_a_layernorm",

675

"model.layers.67.self_attn.q_a_layernorm",

676

"model.layers.68.input_layernorm",

677

"model.layers.68.mlp.gate",

678

"model.layers.68.mlp.gate.e_score_correction_bias",

679

"model.layers.68.post_attention_layernorm",

680

"model.layers.68.self_attn.indexer.k_norm",

681

"model.layers.68.self_attn.indexer.k_norm.bias",

682

"model.layers.68.self_attn.indexers_proj",

683

"model.layers.68.self_attn.kv_a_layernorm",

684

"model.layers.68.self_attn.q_a_layernorm",

685

"model.layers.69.input_layernorm",

686

"model.layers.69.mlp.gate",

687

"model.layers.69.mlp.gate.e_score_correction_bias",

688

"model.layers.69.post_attention_layernorm",

689

"model.layers.69.self_attn.indexer.k_norm",

690

"model.layers.69.self_attn.indexer.k_norm.bias",

691

"model.layers.69.self_attn.indexers_proj",

692

"model.layers.69.self_attn.kv_a_layernorm",

693

"model.layers.69.self_attn.q_a_layernorm",

694

"model.layers.70.input_layernorm",

695

"model.layers.70.mlp.gate",

696

"model.layers.70.mlp.gate.e_score_correction_bias",

697

"model.layers.70.post_attention_layernorm",

698

"model.layers.70.self_attn.indexer.k_norm",

699

"model.layers.70.self_attn.indexer.k_norm.bias",

700

"model.layers.70.self_attn.indexers_proj",

701

"model.layers.70.self_attn.kv_a_layernorm",

702

"model.layers.70.self_attn.q_a_layernorm",

703

"model.layers.71.input_layernorm",

704

"model.layers.71.mlp.gate",

705

"model.layers.71.mlp.gate.e_score_correction_bias",

706

"model.layers.71.post_attention_layernorm",

707

"model.layers.71.self_attn.indexer.k_norm",

708

"model.layers.71.self_attn.indexer.k_norm.bias",

709

"model.layers.71.self_attn.indexers_proj",

710

"model.layers.71.self_attn.kv_a_layernorm",

711

"model.layers.71.self_attn.q_a_layernorm",

712

"model.layers.72.input_layernorm",

713

"model.layers.72.mlp.gate",

714

"model.layers.72.mlp.gate.e_score_correction_bias",

715

"model.layers.72.post_attention_layernorm",

716

"model.layers.72.self_attn.indexer.k_norm",

717

"model.layers.72.self_attn.indexer.k_norm.bias",

718

"model.layers.72.self_attn.indexers_proj",

719

"model.layers.72.self_attn.kv_a_layernorm",

720

"model.layers.72.self_attn.q_a_layernorm",

721

"model.layers.73.input_layernorm",

722

"model.layers.73.mlp.gate",

723

"model.layers.73.mlp.gate.e_score_correction_bias",

724

"model.layers.73.post_attention_layernorm",

725

"model.layers.73.self_attn.indexer.k_norm",

726

"model.layers.73.self_attn.indexer.k_norm.bias",

727

"model.layers.73.self_attn.indexers_proj",

728

"model.layers.73.self_attn.kv_a_layernorm",

729

"model.layers.73.self_attn.q_a_layernorm",

730

"model.layers.74.input_layernorm",

731

"model.layers.74.mlp.gate",

732

"model.layers.74.mlp.gate.e_score_correction_bias",

733

"model.layers.74.post_attention_layernorm",

734

"model.layers.74.self_attn.indexer.k_norm",

735

"model.layers.74.self_attn.indexer.k_norm.bias",

736

"model.layers.74.self_attn.indexers_proj",

737

"model.layers.74.self_attn.kv_a_layernorm",

738

"model.layers.74.self_attn.q_a_layernorm",

739

"model.layers.75.input_layernorm",

740

"model.layers.75.mlp.gate",

741

"model.layers.75.mlp.gate.e_score_correction_bias",

742

"model.layers.75.post_attention_layernorm",

743

"model.layers.75.self_attn.indexer.k_norm",

744

"model.layers.75.self_attn.indexer.k_norm.bias",

745

"model.layers.75.self_attn.indexers_proj",

746

"model.layers.75.self_attn.kv_a_layernorm",

747

"model.layers.75.self_attn.q_a_layernorm",

748

"model.layers.76.input_layernorm",

749

"model.layers.76.mlp.gate",

750

"model.layers.76.mlp.gate.e_score_correction_bias",

751

"model.layers.76.post_attention_layernorm",

752

"model.layers.76.self_attn.indexer.k_norm",

753

"model.layers.76.self_attn.indexer.k_norm.bias",

754

"model.layers.76.self_attn.indexers_proj",

755

"model.layers.76.self_attn.kv_a_layernorm",

756

"model.layers.76.self_attn.q_a_layernorm",

757

"model.layers.77.input_layernorm",

758

"model.layers.77.mlp.gate",

759

"model.layers.77.mlp.gate.e_score_correction_bias",

760

"model.layers.77.post_attention_layernorm",

761

"model.layers.77.self_attn.indexer.k_norm",

762

"model.layers.77.self_attn.indexer.k_norm.bias",

763

"model.layers.77.self_attn.indexers_proj",

764

"model.layers.77.self_attn.kv_a_layernorm",

765

"model.layers.77.self_attn.q_a_layernorm",

766

"model.layers.78.eh_proj",

767

"model.layers.78.enorm",

768

"model.layers.78.hnorm",

769

"model.layers.78.input_layernorm",

770

"model.layers.78.mlp.gate",

771

"model.layers.78.mlp.gate.e_score_correction_bias",

772

"model.layers.78.post_attention_layernorm",

773

"model.layers.78.self_attn.indexer.k_norm",

774

"model.layers.78.self_attn.indexer.k_norm.bias",

775

"model.layers.78.self_attn.indexers_proj",

776

"model.layers.78.self_attn.kv_a_layernorm",

777

"model.layers.78.self_attn.q_a_layernorm",

778

"model.layers.78.shared_head.norm",

779

"model.norm"

780

]

781

}

782

}

783