config.json
| 1 | { |
| 2 | "model_version": "HunyuanImage-3.0", |
| 3 | "add_classification_head": false, |
| 4 | "anyres_pooling_size": 2, |
| 5 | "anyres_vit_max_image_size": null, |
| 6 | "anyres_vit_two_views": false, |
| 7 | "architectures": [ |
| 8 | "HunyuanImage3ForCausalMM" |
| 9 | ], |
| 10 | "auto_map": { |
| 11 | "AutoConfig": "configuration_hunyuan.HunyuanImage3Config", |
| 12 | "AutoModel": "hunyuan.HunyuanImage3Model", |
| 13 | "AutoModelForCausalLM": "hunyuan.HunyuanImage3ForCausalMM" |
| 14 | }, |
| 15 | "attention_bias": false, |
| 16 | "attention_dropout": 0.0, |
| 17 | "attention_head_dim": 128, |
| 18 | "bos_token_id": 127958, |
| 19 | "cla_share_factor": 2, |
| 20 | "class_num": 0, |
| 21 | "dense_list": [ |
| 22 | 4096, |
| 23 | 0 |
| 24 | ], |
| 25 | "eod_token_id": 3, |
| 26 | "eos_token_id": 127957, |
| 27 | "group_limited_greedy": false, |
| 28 | "hidden_act": "silu", |
| 29 | "hidden_size": 4096, |
| 30 | "im_end_id": 128001, |
| 31 | "im_newline_id": 11, |
| 32 | "im_start_id": 128000, |
| 33 | "image_token_id": 128006, |
| 34 | "initializer_range": 0.02, |
| 35 | "intermediate_size": 3072, |
| 36 | "kv_lora_rank": null, |
| 37 | "mask_init_id": 12, |
| 38 | "max_position_embeddings": 22800, |
| 39 | "mlp_bias": false, |
| 40 | "model_type": "hunyuan_image_3_moe", |
| 41 | "moe_drop_tokens": false, |
| 42 | "moe_intermediate_size": [ |
| 43 | 3072, |
| 44 | 3072, |
| 45 | 3072, |
| 46 | 3072, |
| 47 | 3072, |
| 48 | 3072, |
| 49 | 3072, |
| 50 | 3072, |
| 51 | 3072, |
| 52 | 3072, |
| 53 | 3072, |
| 54 | 3072, |
| 55 | 3072, |
| 56 | 3072, |
| 57 | 3072, |
| 58 | 3072, |
| 59 | 3072, |
| 60 | 3072, |
| 61 | 3072, |
| 62 | 3072, |
| 63 | 3072, |
| 64 | 3072, |
| 65 | 3072, |
| 66 | 3072, |
| 67 | 3072, |
| 68 | 3072, |
| 69 | 3072, |
| 70 | 3072, |
| 71 | 3072, |
| 72 | 3072, |
| 73 | 3072, |
| 74 | 3072 |
| 75 | ], |
| 76 | "moe_layer_num_skipped": 0, |
| 77 | "moe_random_routing_dropped_token": false, |
| 78 | "moe_topk": [ |
| 79 | 8, |
| 80 | 8, |
| 81 | 8, |
| 82 | 8, |
| 83 | 8, |
| 84 | 8, |
| 85 | 8, |
| 86 | 8, |
| 87 | 8, |
| 88 | 8, |
| 89 | 8, |
| 90 | 8, |
| 91 | 8, |
| 92 | 8, |
| 93 | 8, |
| 94 | 8, |
| 95 | 8, |
| 96 | 8, |
| 97 | 8, |
| 98 | 8, |
| 99 | 8, |
| 100 | 8, |
| 101 | 8, |
| 102 | 8, |
| 103 | 8, |
| 104 | 8, |
| 105 | 8, |
| 106 | 8, |
| 107 | 8, |
| 108 | 8, |
| 109 | 8, |
| 110 | 8 |
| 111 | ], |
| 112 | "n_group": false, |
| 113 | "norm_topk_prob": true, |
| 114 | "norm_type": "rms", |
| 115 | "num_attention_heads": 32, |
| 116 | "num_experts": 64, |
| 117 | "num_hidden_layers": 32, |
| 118 | "num_key_value_heads": 8, |
| 119 | "num_media_embeds": 257, |
| 120 | "num_shared_expert": [ |
| 121 | 1, |
| 122 | 1, |
| 123 | 1, |
| 124 | 1, |
| 125 | 1, |
| 126 | 1, |
| 127 | 1, |
| 128 | 1, |
| 129 | 1, |
| 130 | 1, |
| 131 | 1, |
| 132 | 1, |
| 133 | 1, |
| 134 | 1, |
| 135 | 1, |
| 136 | 1, |
| 137 | 1, |
| 138 | 1, |
| 139 | 1, |
| 140 | 1, |
| 141 | 1, |
| 142 | 1, |
| 143 | 1, |
| 144 | 1, |
| 145 | 1, |
| 146 | 1, |
| 147 | 1, |
| 148 | 1, |
| 149 | 1, |
| 150 | 1, |
| 151 | 1, |
| 152 | 1 |
| 153 | ], |
| 154 | "pad_id": 128009, |
| 155 | "pad_token_id": 128009, |
| 156 | "pool_type": "last", |
| 157 | "position_embedding_xdrope": false, |
| 158 | "pretraining_tp": 1, |
| 159 | "q_lora_rank": null, |
| 160 | "qk_nope_head_dim": null, |
| 161 | "qk_rope_head_dim": null, |
| 162 | "rms_norm_eps": 1e-05, |
| 163 | "rope_scaling": { |
| 164 | "alpha": 1.0, |
| 165 | "beta_fast": 32, |
| 166 | "beta_slow": 1, |
| 167 | "factor": 1.0, |
| 168 | "mscale": 1.0, |
| 169 | "mscale_all_dim": 1.0, |
| 170 | "type": "custom" |
| 171 | }, |
| 172 | "rope_theta": 10000.0, |
| 173 | "routed_scaling_factor": false, |
| 174 | "skip_cls_token": false, |
| 175 | "text_end_id": 7, |
| 176 | "text_start_id": 6, |
| 177 | "tie_word_embeddings": false, |
| 178 | "topk_group": false, |
| 179 | "torch_dtype": "bfloat16", |
| 180 | "transformers_version": "4.50.0", |
| 181 | "use_cache": true, |
| 182 | "use_cla": false, |
| 183 | "use_mixed_mlp_moe": true, |
| 184 | "use_mla": false, |
| 185 | "use_qk_norm": true, |
| 186 | "use_rotary_pos_emb": true, |
| 187 | "v_head_dim": null, |
| 188 | "video_end_id": 10, |
| 189 | "video_start_id": 9, |
| 190 | "vit_add_patchemb_bias": false, |
| 191 | "vit_input_resolution": 224, |
| 192 | "vit_mapping_type": "resampler", |
| 193 | "vit_norm_type": "fused", |
| 194 | "vit_patch": 1, |
| 195 | "vit_path": null, |
| 196 | "vit_remove_prenorm": false, |
| 197 | "vit_token": 64, |
| 198 | "vit_type": "siglip2-so400m-patch16-naflex", |
| 199 | "vit_used_rms_norm": false, |
| 200 | "vocab_size": 133120, |
| 201 | "xdrope_section": null, |
| 202 | "head_dim": 128, |
| 203 | "rope_type": "2d", |
| 204 | "vae_downsample_factor": [ |
| 205 | 16, |
| 206 | 16 |
| 207 | ], |
| 208 | "vit_downsample_factor": [ |
| 209 | 16, |
| 210 | 16 |
| 211 | ], |
| 212 | "cond_token_attn_type": "joint_full", |
| 213 | "cond_image_type": "vae_vit", |
| 214 | "vae_type": "hunyuan-image-vae-v1", |
| 215 | "vae_dtype": "float32", |
| 216 | "vae_autocast_dtype": "float16", |
| 217 | "vae": { |
| 218 | "_class_name": "AutoencoderKLConv3D", |
| 219 | "block_out_channels": [ |
| 220 | 128, |
| 221 | 256, |
| 222 | 512, |
| 223 | 1024, |
| 224 | 1024 |
| 225 | ], |
| 226 | "in_channels": 3, |
| 227 | "out_channels": 3, |
| 228 | "latent_channels": 32, |
| 229 | "layers_per_block": 2, |
| 230 | "ffactor_spatial": 16, |
| 231 | "ffactor_temporal": 4, |
| 232 | "sample_size": 384, |
| 233 | "sample_tsize": 96, |
| 234 | "downsample_match_channel": true, |
| 235 | "upsample_match_channel": true, |
| 236 | "scaling_factor": 0.562679178327931 |
| 237 | }, |
| 238 | "vit": { |
| 239 | "_attn_implementation": "sdpa", |
| 240 | "attention_dropout": 0.0, |
| 241 | "hidden_act": "gelu_pytorch_tanh", |
| 242 | "hidden_size": 1152, |
| 243 | "intermediate_size": 4304, |
| 244 | "layer_norm_eps": 1e-06, |
| 245 | "num_attention_heads": 16, |
| 246 | "num_channels": 3, |
| 247 | "num_hidden_layers": 27, |
| 248 | "num_patches": 256, |
| 249 | "patch_size": 16, |
| 250 | "torch_dtype": "float32", |
| 251 | "output_attentions": false, |
| 252 | "output_hidden_states": false, |
| 253 | "use_return_dict": true |
| 254 | }, |
| 255 | "vit_processor": { |
| 256 | "do_convert_rgb": null, |
| 257 | "do_normalize": true, |
| 258 | "do_rescale": true, |
| 259 | "do_resize": true, |
| 260 | "image_mean": [ |
| 261 | 0.5, |
| 262 | 0.5, |
| 263 | 0.5 |
| 264 | ], |
| 265 | "image_processor_type": "Siglip2ImageProcessorFast", |
| 266 | "image_std": [ |
| 267 | 0.5, |
| 268 | 0.5, |
| 269 | 0.5 |
| 270 | ], |
| 271 | "max_num_patches": 1024, |
| 272 | "patch_size": 16, |
| 273 | "processor_class": "Siglip2Processor", |
| 274 | "resample": 2, |
| 275 | "rescale_factor": 0.00392156862745098 |
| 276 | }, |
| 277 | "vit_aligner": { |
| 278 | "projector_type": "mlp_gelu", |
| 279 | "input_dim": 1152, |
| 280 | "n_embed": 4096, |
| 281 | "depth": 2, |
| 282 | "torch_dtype": "float32" |
| 283 | } |
| 284 | } |
| 285 | |
| 286 | |