{
  "_sliding_window_pattern": 1,
  "architectures": [
    "Gemma3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": null,
  "bos_token_id": 128000,
  "cache_implementation": "hybrid",
  "dtype": "bfloat16",
  "eos_token_id": 128001,
  "final_logit_softcapping": 30.0,
  "head_dim": 128,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 16384,
  "layer_type": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 32768,
  "model_type": "gemma3_text",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128001,
  "query_pre_attn_scalar": 128,
  "rms_norm_eps": 1e-06,
  "rope_local_base_freq": 10000,
  "rope_scaling": {
    "attn_factor": 1.0,
    "beta_fast": 64.0,
    "beta_slow": 1.0,
    "extrapolation_factor": 1.0,
    "factor": 4.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "yarn"
  },
  "rope_theta": 10000,
  "sliding_window": 32768,
  "sliding_window_pattern": 1,
  "transformers_version": "4.57.6",
  "use_bidirectional_attention": false,
  "use_cache": false,
  "vocab_size": 128256
}