config.json
6.0 KB · 286 lines · json Raw
1 {
2 "model_version": "HunyuanImage-3.0",
3 "add_classification_head": false,
4 "anyres_pooling_size": 2,
5 "anyres_vit_max_image_size": null,
6 "anyres_vit_two_views": false,
7 "architectures": [
8 "HunyuanImage3ForCausalMM"
9 ],
10 "auto_map": {
11 "AutoConfig": "configuration_hunyuan.HunyuanImage3Config",
12 "AutoModel": "hunyuan.HunyuanImage3Model",
13 "AutoModelForCausalLM": "hunyuan.HunyuanImage3ForCausalMM"
14 },
15 "attention_bias": false,
16 "attention_dropout": 0.0,
17 "attention_head_dim": 128,
18 "bos_token_id": 127958,
19 "cla_share_factor": 2,
20 "class_num": 0,
21 "dense_list": [
22 4096,
23 0
24 ],
25 "eod_token_id": 3,
26 "eos_token_id": 127957,
27 "group_limited_greedy": false,
28 "hidden_act": "silu",
29 "hidden_size": 4096,
30 "im_end_id": 128001,
31 "im_newline_id": 11,
32 "im_start_id": 128000,
33 "image_token_id": 128006,
34 "initializer_range": 0.02,
35 "intermediate_size": 3072,
36 "kv_lora_rank": null,
37 "mask_init_id": 12,
38 "max_position_embeddings": 22800,
39 "mlp_bias": false,
40 "model_type": "hunyuan_image_3_moe",
41 "moe_drop_tokens": false,
42 "moe_intermediate_size": [
43 3072,
44 3072,
45 3072,
46 3072,
47 3072,
48 3072,
49 3072,
50 3072,
51 3072,
52 3072,
53 3072,
54 3072,
55 3072,
56 3072,
57 3072,
58 3072,
59 3072,
60 3072,
61 3072,
62 3072,
63 3072,
64 3072,
65 3072,
66 3072,
67 3072,
68 3072,
69 3072,
70 3072,
71 3072,
72 3072,
73 3072,
74 3072
75 ],
76 "moe_layer_num_skipped": 0,
77 "moe_random_routing_dropped_token": false,
78 "moe_topk": [
79 8,
80 8,
81 8,
82 8,
83 8,
84 8,
85 8,
86 8,
87 8,
88 8,
89 8,
90 8,
91 8,
92 8,
93 8,
94 8,
95 8,
96 8,
97 8,
98 8,
99 8,
100 8,
101 8,
102 8,
103 8,
104 8,
105 8,
106 8,
107 8,
108 8,
109 8,
110 8
111 ],
112 "n_group": false,
113 "norm_topk_prob": true,
114 "norm_type": "rms",
115 "num_attention_heads": 32,
116 "num_experts": 64,
117 "num_hidden_layers": 32,
118 "num_key_value_heads": 8,
119 "num_media_embeds": 257,
120 "num_shared_expert": [
121 1,
122 1,
123 1,
124 1,
125 1,
126 1,
127 1,
128 1,
129 1,
130 1,
131 1,
132 1,
133 1,
134 1,
135 1,
136 1,
137 1,
138 1,
139 1,
140 1,
141 1,
142 1,
143 1,
144 1,
145 1,
146 1,
147 1,
148 1,
149 1,
150 1,
151 1,
152 1
153 ],
154 "pad_id": 128009,
155 "pad_token_id": 128009,
156 "pool_type": "last",
157 "position_embedding_xdrope": false,
158 "pretraining_tp": 1,
159 "q_lora_rank": null,
160 "qk_nope_head_dim": null,
161 "qk_rope_head_dim": null,
162 "rms_norm_eps": 1e-05,
163 "rope_scaling": {
164 "alpha": 1.0,
165 "beta_fast": 32,
166 "beta_slow": 1,
167 "factor": 1.0,
168 "mscale": 1.0,
169 "mscale_all_dim": 1.0,
170 "type": "custom"
171 },
172 "rope_theta": 10000.0,
173 "routed_scaling_factor": false,
174 "skip_cls_token": false,
175 "text_end_id": 7,
176 "text_start_id": 6,
177 "tie_word_embeddings": false,
178 "topk_group": false,
179 "torch_dtype": "bfloat16",
180 "transformers_version": "4.50.0",
181 "use_cache": true,
182 "use_cla": false,
183 "use_mixed_mlp_moe": true,
184 "use_mla": false,
185 "use_qk_norm": true,
186 "use_rotary_pos_emb": true,
187 "v_head_dim": null,
188 "video_end_id": 10,
189 "video_start_id": 9,
190 "vit_add_patchemb_bias": false,
191 "vit_input_resolution": 224,
192 "vit_mapping_type": "resampler",
193 "vit_norm_type": "fused",
194 "vit_patch": 1,
195 "vit_path": null,
196 "vit_remove_prenorm": false,
197 "vit_token": 64,
198 "vit_type": "siglip2-so400m-patch16-naflex",
199 "vit_used_rms_norm": false,
200 "vocab_size": 133120,
201 "xdrope_section": null,
202 "head_dim": 128,
203 "rope_type": "2d",
204 "vae_downsample_factor": [
205 16,
206 16
207 ],
208 "vit_downsample_factor": [
209 16,
210 16
211 ],
212 "cond_token_attn_type": "joint_full",
213 "cond_image_type": "vae_vit",
214 "vae_type": "hunyuan-image-vae-v1",
215 "vae_dtype": "float32",
216 "vae_autocast_dtype": "float16",
217 "vae": {
218 "_class_name": "AutoencoderKLConv3D",
219 "block_out_channels": [
220 128,
221 256,
222 512,
223 1024,
224 1024
225 ],
226 "in_channels": 3,
227 "out_channels": 3,
228 "latent_channels": 32,
229 "layers_per_block": 2,
230 "ffactor_spatial": 16,
231 "ffactor_temporal": 4,
232 "sample_size": 384,
233 "sample_tsize": 96,
234 "downsample_match_channel": true,
235 "upsample_match_channel": true,
236 "scaling_factor": 0.562679178327931
237 },
238 "vit": {
239 "_attn_implementation": "sdpa",
240 "attention_dropout": 0.0,
241 "hidden_act": "gelu_pytorch_tanh",
242 "hidden_size": 1152,
243 "intermediate_size": 4304,
244 "layer_norm_eps": 1e-06,
245 "num_attention_heads": 16,
246 "num_channels": 3,
247 "num_hidden_layers": 27,
248 "num_patches": 256,
249 "patch_size": 16,
250 "torch_dtype": "float32",
251 "output_attentions": false,
252 "output_hidden_states": false,
253 "use_return_dict": true
254 },
255 "vit_processor": {
256 "do_convert_rgb": null,
257 "do_normalize": true,
258 "do_rescale": true,
259 "do_resize": true,
260 "image_mean": [
261 0.5,
262 0.5,
263 0.5
264 ],
265 "image_processor_type": "Siglip2ImageProcessorFast",
266 "image_std": [
267 0.5,
268 0.5,
269 0.5
270 ],
271 "max_num_patches": 1024,
272 "patch_size": 16,
273 "processor_class": "Siglip2Processor",
274 "resample": 2,
275 "rescale_factor": 0.00392156862745098
276 },
277 "vit_aligner": {
278 "projector_type": "mlp_gelu",
279 "input_dim": 1152,
280 "n_embed": 4096,
281 "depth": 2,
282 "torch_dtype": "float32"
283 }
284 }
285
286