# configuration_deepseek_v2.py · DeepSeek-OCR-2

1

from transformers.configuration_utils import PretrainedConfig

2

from transformers.utils import logging

3

4

# Module-level logger from the transformers logging utilities, keyed by this module's name.
logger = logging.get_logger(__name__)

5

6

# Archive map for pretrained configurations; empty in this file.
# NOTE(review): presumably follows the transformers convention of mapping checkpoint names
# to config locations — confirm against whatever reads this constant.
DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}

7

class DeepseekV2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate a
    DeepSeek model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the DeepSeek-V2 with multi-latent
    attention.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 102400):
            Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`DeepseekV2Model`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1407):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 30):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 32):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
            constructed by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed explicitly, it falls back to
            `num_attention_heads`.
        n_shared_experts (`int`, *optional*, defaults to None):
            Number of shared experts, None means dense model.
        n_routed_experts (`int`, *optional*, defaults to None):
            Number of routed experts, None means dense model.
        ep_size (`int`, *optional*, defaults to 1):
            Stored as `config.ep_size`. Presumably the expert-parallel group size used by the MoE layers; it is not
            interpreted in this file, so confirm against the modeling code.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for routed experts.
        kv_lora_rank (`int`, *optional*, defaults to 512):
            Stored as `config.kv_lora_rank`. Presumably the rank of the low-rank compressed key/value latent used by
            multi-latent attention; confirm against the modeling code.
        q_lora_rank (`int`, *optional*, defaults to 1536):
            Stored as `config.q_lora_rank`. Presumably the rank of the low-rank query compression used by
            multi-latent attention; confirm against the modeling code.
        qk_rope_head_dim (`int`, *optional*, defaults to 64):
            Stored as `config.qk_rope_head_dim`. Presumably the per-head dimension of the query/key part that
            receives rotary position embeddings; confirm against the modeling code.
        v_head_dim (`int`, *optional*, defaults to 128):
            Stored as `config.v_head_dim`. Presumably the per-head dimension of the value projection; confirm
            against the modeling code.
        qk_nope_head_dim (`int`, *optional*, defaults to 128):
            Stored as `config.qk_nope_head_dim`. Presumably the per-head dimension of the query/key part that does
            not receive rotary position embeddings; confirm against the modeling code.
        topk_method (`str`, *optional*, defaults to `'gready'`):
            Topk method used in routed gate. The default is spelled `'gready'` in the signature and is kept
            verbatim for backward compatibility; check which values the gating code actually compares against.
        n_group (`int`, *optional*, defaults to None):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to None):
            Number of selected groups for each token (for each token, ensuring the selected experts are only within
            `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to None):
            Number of selected experts, None means dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 0):
            Number of dense layers in shallow layers (embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                             \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to False):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to 'softmax'):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux (`bool`, *optional*, defaults to True):
            Whether to compute the auxiliary loss for each individual sample.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers. The value is converted with `float()` before being
            stored.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 100000):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 100001):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value
            is necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        use_mla (`bool`, *optional*, defaults to `True`):
            Use multi-latent attention or multi-head attention. If True, the model will use multi-latent attention,
            otherwise, it will use multi-head attention.
        kwargs (*optional*):
            Additional keyword arguments, forwarded unchanged to [`PretrainedConfig`].

    ```python
    >>> from transformers import DeepseekV2Model, DeepseekV2Config

    >>> # Initializing a Deepseek-V2 style configuration
    >>> configuration = DeepseekV2Config()

    >>> # Initializing a model from the configuration
    >>> model = DeepseekV2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "deepseek_v2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size=1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts=None,
        n_routed_experts=None,
        ep_size=1,
        routed_scaling_factor=1.0,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method='gready',
        n_group=None,
        topk_group=None,
        num_experts_per_tok=None,
        moe_layer_freq=1,
        first_k_dense_replace=0,
        norm_topk_prob=False,
        scoring_func='softmax',
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        use_mla=True,
        **kwargs,
    ):
        """Store every hyper-parameter on the instance, then delegate to `PretrainedConfig`.

        See the class docstring for the meaning and default of each argument. The token ids,
        `tie_word_embeddings` and any extra `kwargs` are not stored here; they are passed on to
        `PretrainedConfig.__init__`.
        """
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux

        # For backward compatibility: an explicit `None` means "one key/value head per
        # attention head". Omitting the argument uses the signature default of 32 instead.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        # Coerce to float so the stored value is a float even if another numeric/string form was passed.
        self.rms_norm_eps = float(rms_norm_eps)
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.use_mla = use_mla

        # The base class is initialised last, after all model-specific attributes are set.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

211