config.json
{
  "architectures": [
    "GptOssForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "eos_token_id": 200002,
  "experts_per_token": 4,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2880,
  "initial_context_length": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 2880,
  "layer_types": [
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention"
  ],
  "max_position_embeddings": 131072,
  "model_type": "gpt_oss",
  "num_attention_heads": 64,
  "num_experts_per_tok": 4,
  "num_hidden_layers": 24,
  "num_key_value_heads": 8,
  "num_local_experts": 32,
  "output_router_logits": false,
  "pad_token_id": 199999,
  "quantization_config": {
    "modules_to_not_convert": [
      "model.layers.*.self_attn",
      "model.layers.*.mlp.router",
      "model.embed_tokens",
      "lm_head"
    ],
    "quant_method": "mxfp4"
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "beta_fast": 32.0,
    "beta_slow": 1.0,
    "factor": 32.0,
    "original_max_position_embeddings": 4096,
    "rope_type": "yarn",
    "truncate": false
  },
  "rope_theta": 150000,
  "router_aux_loss_coef": 0.9,
  "sliding_window": 128,
  "swiglu_limit": 7.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.55.0.dev0",
  "use_cache": true,
  "vocab_size": 201088
}
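
For reference, a minimal sketch of inspecting this configuration with the Hugging Face transformers library. The checkpoint path is a placeholder for whatever directory holds this config.json (the hyperparameters here, 24 layers and 32 local experts, are consistent with the published openai/gpt-oss-20b checkpoint, but that id is an assumption, not stated in the file):

from transformers import AutoConfig

# Load the config from a local checkpoint directory containing this config.json
# (a hub id such as "openai/gpt-oss-20b" would also work; hypothetical path below).
config = AutoConfig.from_pretrained("./gpt-oss-checkpoint")

# A few of the fields above, as exposed on the config object:
print(config.model_type)               # "gpt_oss"
print(config.num_hidden_layers)        # 24
print(config.num_local_experts)        # 32
print(config.num_experts_per_tok)      # 4 experts routed per token
print(config.max_position_embeddings)  # 131072 (YaRN-extended from 4096)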