configuration_hunyuan.py · HunyuanImage-3.0

1

# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");

2

# you may not use this file except in compliance with the License.

3

# You may obtain a copy of the License at

4

#

5

# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE

6

#

7

# Unless required by applicable law or agreed to in writing, software

8

# distributed under the License is distributed on an "AS IS" BASIS,

9

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

10

# See the License for the specific language governing permissions and

11

# limitations under the License.

12

# ==============================================================================

13

14

from transformers.configuration_utils import PretrainedConfig

15

from transformers.utils import logging

16

from typing import List, Union

17

18

19

# Module-level logger obtained from the `transformers.utils.logging` helper imported above.
logger = logging.get_logger(__name__)

20

21

22

class HunyuanImage3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`HunyuanImage3Model`]. It is used to instantiate
    an Hunyuan model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the Hunyuan-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every argument below is stored on the instance under the same name, except `pad_token_id`, `bos_token_id`,
    `eos_token_id`, `tie_word_embeddings` and `**kwargs`, which are forwarded to [`PretrainedConfig`].


    Args:
        vocab_size (`int`, *optional*, defaults to 290943):
            Vocabulary size of the Hunyuan Image 3 model. Defines the number of different tokens that can be
            represented by the `inputs_ids` passed when calling [`HunyuanImage3Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations or shared MLP representations.
        moe_intermediate_size (`int` or `List`, *optional*):
            Dimension of the MLP representations in MoE. Use a list if you want a different size per layer.
            Defaults to `None`; this class does not substitute a fallback value.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        attention_head_dim (`int`, *optional*):
            Dimension of each attention head. If it is not specified, it is computed as
            `hidden_size // num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        eod_token_id (`int`, *optional*, defaults to 3):
            Special token id (presumably "end of document"; inferred from the name, confirm in the tokenizer).
        im_start_id (`int`, *optional*, defaults to 4):
            Special token id (presumably marks the start of an image span; confirm in the tokenizer).
        im_end_id (`int`, *optional*, defaults to 5):
            Special token id (presumably marks the end of an image span; confirm in the tokenizer).
        text_start_id (`int`, *optional*, defaults to 6):
            Special token id (presumably marks the start of a text span; confirm in the tokenizer).
        text_end_id (`int`, *optional*, defaults to 7):
            Special token id (presumably marks the end of a text span; confirm in the tokenizer).
        image_token_id (`int`, *optional*, defaults to 8):
            Special token id (presumably the image placeholder token; confirm in the tokenizer).
        video_start_id (`int`, *optional*, defaults to 9):
            Special token id (presumably marks the start of a video span; confirm in the tokenizer).
        video_end_id (`int`, *optional*, defaults to 10):
            Special token id (presumably marks the end of a video span; confirm in the tokenizer).
        im_newline_id (`int`, *optional*, defaults to 11):
            Special token id (presumably an image row separator; confirm in the tokenizer).
        mask_init_id (`int`, *optional*, defaults to 12):
            Special token id (meaning not visible from this file; confirm in the modeling code).
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
            these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in future versions. Note that this class stores the
            value without validating it.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Presumably whether the MLP projection layers use a bias (inferred from the name; confirm in the modeling
            code).
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        use_qk_norm (`bool`, *optional*, defaults to `False`):
            Whether query and key in attention use norm
        use_rotary_pos_emb (`bool`, *optional*, defaults to `True`):
            Presumably whether rotary position embeddings are applied (inferred from the name).
        use_cla (`bool`, *optional*, defaults to `False`):
            Whether to use CLA in attention
        cla_share_factor (`int`, *optional*, defaults to 1):
            The share factor of CLA
        norm_type (`str`, *optional*, defaults to `"hf_rms"`):
            Name of the normalization layer implementation; the accepted values are defined by the modeling code.
        num_experts (`int` or `List`, *optional*, defaults to 1):
            The number of experts for moe. If it is a list, it will be used as the number of experts for each layer.
        use_mixed_mlp_moe (`bool`, *optional*, defaults to `False`):
            Presumably whether a dense (shared) MLP is mixed with the MoE experts (inferred from the name).
        num_shared_expert (`int` or `List`, *optional*, defaults to 1):
            The number of shared experts for moe. If it is a list, it will be used as the number of shared experts
            for each layer.
        moe_topk (`int` or `List`, *optional*, defaults to 1):
            The topk value for moe. If it is a list, it will be used as the topk value for each layer.
        capacity_factor (Not used) (`float` or `List`, *optional*, defaults to 1.0):
            The capacity factor for moe. If it is a list, it will be used as the capacity factor for each layer.
        moe_drop_tokens (`bool`, *optional*, defaults to `False`):
            Presumably whether tokens exceeding expert capacity are dropped (inferred from the name).
        moe_random_routing_dropped_token (`bool`, *optional*, defaults to `False`):
            Presumably whether dropped tokens are re-routed randomly (inferred from the name).
        use_mla (`bool`, *optional*, defaults to `False`):
            Flag stored together with the "MLA args" below (`kv_lora_rank`, `q_lora_rank`, `qk_rope_head_dim`,
            `qk_nope_head_dim`, `v_head_dim`); how they are consumed is defined by the modeling code.
        kv_lora_rank (`int`, *optional*, defaults to 512):
            MLA argument, stored unchanged.
        q_lora_rank (`int`, *optional*, defaults to 1536):
            MLA argument, stored unchanged.
        qk_rope_head_dim (`int`, *optional*, defaults to 64):
            MLA argument, stored unchanged.
        v_head_dim (`int`, *optional*, defaults to 128):
            MLA argument, stored unchanged.
        qk_nope_head_dim (`int`, *optional*, defaults to 128):
            MLA argument, stored unchanged.
        moe_layer_num_skipped (`int`, *optional*, defaults to 0):
            First moe_layer_num_skipped layers do not use MoE.
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Presumably whether the top-k routing probabilities are re-normalized (inferred from the name).
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Presumably a scaling factor applied to the routed experts' output (inferred from the name).
        group_limited_greedy (`bool`, *optional*, defaults to `False`):
            Presumably enables group-limited greedy expert selection (inferred from the name).
        n_group (`int`, *optional*):
            Routing argument stored unchanged; presumably the number of expert groups.
        topk_group (`int`, *optional*):
            Routing argument stored unchanged; presumably the number of groups selected per token.
        add_classification_head (`bool`, *optional*, defaults to `False`):
            Presumably whether a classification head is added on top of the model (inferred from the name).
        class_num (`int`, *optional*, defaults to 0):
            Number of classes. When it is not `None`, `dense_list` is set to `[hidden_size, class_num]`; when it is
            `None`, the `dense_list` attribute is not created.
        pool_type (`str`, *optional*, defaults to `"last"`):
            Pooling strategy name, stored unchanged; the accepted values are defined by the modeling code.
        pad_id (`int`, *optional*, defaults to -1):
            Stored unchanged as `pad_id`. This is separate from `pad_token_id`, which is forwarded to
            [`PretrainedConfig`].
        moe_impl (`str`, *optional*, defaults to `"eager"`):
            Name of the MoE implementation to use; the accepted values are defined by the modeling code.
        vae_downsample_factor (`Tuple[int, int]`, *optional*, defaults to `(16, 16)`):
            VAE downsample factor given as `(h, w)`.
        img_proj_type (`str`, *optional*, defaults to `"unet"`):
            Image projection type, stored unchanged.
        patch_size (`int`, *optional*, defaults to 1):
            Patch size, stored unchanged.
        patch_embed_hidden_dim (`int`, *optional*, defaults to 1024):
            Hidden dimension of the patch embedding, stored unchanged.
        image_base_size (`int`, *optional*, defaults to 1024):
            Base image size, stored unchanged.
        vae (*optional*):
            VAE settings, stored unchanged (expected type not visible from this file).
        vit (*optional*):
            ViT settings, stored unchanged (expected type not visible from this file).
        vit_processor (*optional*):
            ViT processor settings, stored unchanged (expected type not visible from this file).
        vit_aligner (*optional*):
            ViT aligner settings, stored unchanged (expected type not visible from this file).
        kwargs:
            Additional keyword arguments forwarded to [`PretrainedConfig`].
    """

    model_type = "Hunyuan"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=290943,
        hidden_size=4096,
        intermediate_size: int = 11008,
        moe_intermediate_size: Union[int, List, None] = None,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        attention_head_dim=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        eod_token_id=3,
        im_start_id=4,
        im_end_id=5,
        text_start_id=6,
        text_end_id=7,
        image_token_id=8,
        video_start_id=9,
        video_end_id=10,
        im_newline_id=11,
        mask_init_id=12,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        mlp_bias=False,
        attention_dropout=0.0,
        use_qk_norm=False,
        use_rotary_pos_emb=True,
        use_cla=False,
        cla_share_factor=1,
        norm_type="hf_rms",
        num_experts: Union[int, List] = 1,
        use_mixed_mlp_moe=False,
        num_shared_expert: Union[int, List] = 1,
        moe_topk: Union[int, List] = 1,
        capacity_factor: Union[float, List] = 1.0,
        moe_drop_tokens=False,
        moe_random_routing_dropped_token=False,
        use_mla=False,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        moe_layer_num_skipped=0,
        norm_topk_prob=True,
        routed_scaling_factor=1.0,
        group_limited_greedy=False,
        n_group=None,
        topk_group=None,
        add_classification_head=False,
        class_num=0,
        pool_type="last",
        pad_id=-1,
        # Added
        moe_impl="eager",
        vae_downsample_factor=(16, 16),  # (h, w)
        img_proj_type="unet",
        patch_size=1,
        patch_embed_hidden_dim=1024,
        image_base_size=1024,
        vae=None,
        vit=None,
        vit_processor=None,
        vit_aligner=None,
        **kwargs,
    ):
        # Core transformer dimensions.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # MoE args
        self.moe_impl = moe_impl
        self.num_experts = num_experts
        self.use_mixed_mlp_moe = use_mixed_mlp_moe
        self.num_shared_expert = num_shared_expert
        self.moe_topk = moe_topk
        self.capacity_factor = capacity_factor
        self.moe_drop_tokens = moe_drop_tokens
        self.moe_random_routing_dropped_token = moe_random_routing_dropped_token

        # An explicit head dimension wins; otherwise split the hidden size evenly across the heads.
        if attention_head_dim is not None:
            self.attention_head_dim = attention_head_dim
        else:
            self.attention_head_dim = self.hidden_size // num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.mlp_bias = mlp_bias
        self.attention_dropout = attention_dropout
        self.use_qk_norm = use_qk_norm
        self.use_rotary_pos_emb = use_rotary_pos_emb
        self.use_cla = use_cla
        self.cla_share_factor = cla_share_factor
        self.norm_type = norm_type

        # MLA args
        self.use_mla = use_mla
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.v_head_dim = v_head_dim

        # DeepSeek related args
        self.moe_layer_num_skipped = moe_layer_num_skipped
        self.norm_topk_prob = norm_topk_prob
        self.routed_scaling_factor = routed_scaling_factor
        self.group_limited_greedy = group_limited_greedy
        self.n_group = n_group
        self.topk_group = topk_group
        self.add_classification_head = add_classification_head
        self.class_num = class_num
        self.pool_type = pool_type
        self.pad_id = pad_id

        # NOTE: `dense_list` is only created when `class_num` is not None (the default 0 still creates it).
        if self.class_num is not None:
            self.dense_list = [self.hidden_size, self.class_num]

        # ViT args
        self.vit = vit
        self.vit_processor = vit_processor
        self.vit_aligner = vit_aligner

        # Image Gen args
        self.vae = vae
        self.vae_downsample_factor = vae_downsample_factor
        self.img_proj_type = img_proj_type
        self.patch_size = patch_size
        self.patch_embed_hidden_dim = patch_embed_hidden_dim
        self.image_base_size = image_base_size

        # token id
        self.eod_token_id = eod_token_id
        self.im_start_id = im_start_id
        self.im_end_id = im_end_id
        self.text_start_id = text_start_id
        self.text_end_id = text_end_id
        self.image_token_id = image_token_id
        self.video_start_id = video_start_id
        self.video_end_id = video_end_id
        self.im_newline_id = im_newline_id
        self.mask_init_id = mask_init_id

        # The base class handles the standard special-token ids, embedding tying and any extra kwargs.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

286