config.json
1.2 KB · 49 lines · json Raw
1 {
2 "architectures": [
3 "Glm4MoeForCausalLM"
4 ],
5 "attention_bias": true,
6 "attention_dropout": 0.0,
7 "auto_map": {
8 "AutoConfig": "configuration_glm4_moe.Glm4MoeConfig",
9 "AutoModel": "modeling_glm4_moe.Glm4MoeModel",
10 "AutoModelForCausalLM": "modeling_glm4_moe.Glm4MoeForCausalLM"
11 },
12 "dtype": "bfloat16",
13 "eos_token_id": [
14 151334,
15 151329
16 ],
17 "first_k_dense_replace": 1,
18 "head_dim": 128,
19 "hidden_act": "silu",
20 "hidden_size": 4096,
21 "initializer_range": 0.02,
22 "intermediate_size": 10944,
23 "max_position_embeddings": 131072,
24 "model_type": "glm4_moe",
25 "moe_intermediate_size": 1408,
26 "n_group": 1,
27 "n_routed_experts": 128,
28 "n_shared_experts": 1,
29 "norm_topk_prob": true,
30 "num_attention_heads": 96,
31 "num_experts_per_tok": 8,
32 "num_hidden_layers": 46,
33 "num_key_value_heads": 8,
34 "num_nextn_predict_layers": 1,
35 "pad_token_id": 151329,
36 "partial_rotary_factor": 0.5,
37 "rms_norm_eps": 1e-05,
38 "rope_scaling": null,
39 "rope_theta": 1000000,
40 "routed_scaling_factor": 1.0,
41 "tie_word_embeddings": false,
42 "topk_group": 1,
43 "transformers_version": "4.56.1",
44 "use_cache": false,
45 "use_grouped_mm": true,
46 "use_qk_norm": false,
47 "vocab_size": 151552
48 }
49