config.json
{
  "architectures": [
    "GptOssForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "eos_token_id": 200002,
  "experts_per_token": 4,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2880,
  "initial_context_length": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 2880,
  "layer_types": [
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention"
  ],
  "max_position_embeddings": 131072,
  "model_type": "gpt_oss",
  "num_attention_heads": 64,
  "num_experts_per_tok": 4,
  "num_hidden_layers": 24,
  "num_key_value_heads": 8,
  "num_local_experts": 32,
  "output_router_logits": false,
  "pad_token_id": 199999,
  "quantization_config": {
    "modules_to_not_convert": [
      "model.layers.*.self_attn",
      "model.layers.*.mlp.router",
      "model.embed_tokens",
      "lm_head"
    ],
    "quant_method": "mxfp4"
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "beta_fast": 32.0,
    "beta_slow": 1.0,
    "factor": 32.0,
    "original_max_position_embeddings": 4096,
    "rope_type": "yarn",
    "truncate": false
  },
  "rope_theta": 150000,
  "router_aux_loss_coef": 0.9,
  "sliding_window": 128,
  "swiglu_limit": 7.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.55.0.dev0",
  "use_cache": true,
  "vocab_size": 201088
}
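
For reference, a minimal sketch of inspecting this configuration with the Hugging Face transformers library. The checkpoint path is a placeholder for whatever directory holds this config.json (the hyperparameters here, 24 layers and 32 local experts, are consistent with the published openai/gpt-oss-20b checkpoint, but that id is an assumption, not stated in the file):

from transformers import AutoConfig

# Load the config from a local checkpoint directory containing this config.json
# (a hub id such as "openai/gpt-oss-20b" would also work; hypothetical path below).
config = AutoConfig.from_pretrained("./gpt-oss-checkpoint")

# A few of the fields above, as exposed on the config object:
print(config.model_type)               # "gpt_oss"
print(config.num_hidden_layers)        # 24
print(config.num_local_experts)        # 32
print(config.num_experts_per_tok)      # 4 experts routed per token
print(config.max_position_embeddings)  # 131072 (YaRN-extended from 4096)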