config.json
{
  "architectures": [
    "GptOssForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "eos_token_id": 200002,
  "experts_per_token": 4,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2880,
  "initial_context_length": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 2880,
  "layer_types": [
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention"
  ],
  "max_position_embeddings": 131072,
  "model_type": "gpt_oss",
  "num_attention_heads": 64,
  "num_experts_per_tok": 4,
  "num_hidden_layers": 36,
  "num_key_value_heads": 8,
  "num_local_experts": 128,
  "output_router_logits": false,
  "pad_token_id": 199999,
  "quantization_config": {
    "modules_to_not_convert": [
      "model.layers.*.self_attn",
      "model.layers.*.mlp.router",
      "model.embed_tokens",
      "lm_head"
    ],
    "quant_method": "mxfp4"
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "beta_fast": 32.0,
    "beta_slow": 1.0,
    "factor": 32.0,
    "original_max_position_embeddings": 4096,
    "rope_type": "yarn",
    "truncate": false
  },
  "rope_theta": 150000,
  "router_aux_loss_coef": 0.9,
  "sliding_window": 128,
  "swiglu_limit": 7.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.55.0.dev0",
  "use_cache": true,
  "vocab_size": 201088
}
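Taken together, the file describes a 36-layer mixture-of-experts decoder (gpt_oss) that alternates sliding-window and full attention, routes each token to 4 of 128 experts, and extends a 4,096-token pretraining context to 131,072 tokens via YaRN RoPE scaling. The fields are internally consistent, and that can be checked with nothing but the standard library. The snippet below is a minimal sketch; it assumes a local copy of the file saved as config.json.

import json

# Load the raw config; assumes a local copy named config.json.
with open("config.json") as f:
    cfg = json.load(f)

# One attention-type entry per transformer layer.
assert len(cfg["layer_types"]) == cfg["num_hidden_layers"] == 36

# Layers alternate sliding-window and full attention, starting with sliding.
assert all(
    t == ("sliding_attention" if i % 2 == 0 else "full_attention")
    for i, t in enumerate(cfg["layer_types"])
)

# YaRN stretches the original 4,096-token window by a factor of 32,
# which matches the advertised 131,072-token maximum context.
rs = cfg["rope_scaling"]
assert rs["original_max_position_embeddings"] * rs["factor"] == cfg["max_position_embeddings"]

# Sparse MoE routing: 4 of 128 experts are active per token.
assert cfg["num_experts_per_tok"] <= cfg["num_local_experts"]
print(cfg["num_experts_per_tok"] / cfg["num_local_experts"])  # 0.03125

Note that the quantization_config keeps the attention blocks, the MoE routers, the embeddings, and the lm_head out of MXFP4 conversion, so only the expert weights are stored in the 4-bit format.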
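Because model_type is "gpt_oss", the file can also be instantiated through transformers (version 4.55 or later, per the transformers_version field). A minimal sketch, assuming this config.json sits in a hypothetical local checkpoint directory named ./checkpoint:

from transformers import AutoConfig

# Assumes ./checkpoint/ contains this config.json; requires transformers >= 4.55.
config = AutoConfig.from_pretrained("./checkpoint")
print(config.model_type)         # "gpt_oss"
print(config.num_hidden_layers)  # 36
print(config.sliding_window)     # 128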