configuration_qwen2.py · LocateAnything-3B

1

# coding=utf-8

2

3

#

4

# Licensed under the Apache License, Version 2.0 (the "License");

5

# you may not use this file except in compliance with the License.

6

# You may obtain a copy of the License at

7

#

8

# http://www.apache.org/licenses/LICENSE-2.0

9

#

10

# Unless required by applicable law or agreed to in writing, software

11

# distributed under the License is distributed on an "AS IS" BASIS,

12

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13

# See the License for the specific language governing permissions and

14

# limitations under the License.

15

""" Qwen2 model configuration"""

16

17

from transformers.configuration_utils import PretrainedConfig

18

from transformers.utils import logging

19

20

21

# Module-level logger, named after this module via the transformers logging helper.
logger = logging.get_logger(__name__)

# Maps a published checkpoint identifier to the URL of its hosted `config.json`.
QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json",
}

26

27

28

class Qwen2Config(PretrainedConfig):
    r"""
    Configuration container for a [`Qwen2Model`].

    The values held here define the model architecture and are used to instantiate a Qwen2 model. Instantiating
    the class with its defaults yields a configuration similar to that of
    Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by
            the `input_ids` passed when calling [`Qwen2Model`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22016):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer stack.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer.
        num_key_value_heads (`int`, *optional*, defaults to 32):
            Number of key/value heads used to implement Grouped Query Attention (GQA). If
            `num_key_value_heads=num_attention_heads`, the model uses Multi Head Attention (MHA); if
            `num_key_value_heads=1`, the model uses Multi Query Attention (MQA); otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
            constructed by mean-pooling all the original heads within that group. For more details see [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). Passing `None` falls back to `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the RMS normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top
            use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        kwargs:
            Extra keyword arguments forwarded to [`PretrainedConfig`]. If `attn_implementation` is absent or
            `None`, it is set to `"flash_attention_2"` before being forwarded; the resolved value is also stored
            on the instance as `attn_implementation`.

    ```python
    >>> from transformers import Qwen2Model, Qwen2Config

    >>> # Initializing a Qwen2 style configuration
    >>> configuration = Qwen2Config()

    >>> # Initializing a model from the Qwen2-7B style configuration
    >>> model = Qwen2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Backward compatibility: callers that pass no key/value head count
        # get one key/value head per attention head.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        # Resolve the attention backend once: an absent or `None` entry is
        # replaced by "flash_attention_2". The resolved value is written back
        # into `kwargs` so the base class receives the same choice.
        attn_implementation = kwargs.get("attn_implementation")
        if attn_implementation is None:
            attn_implementation = "flash_attention_2"
        kwargs["attn_implementation"] = attn_implementation

        # Vocabulary, context length and layer geometry.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Sliding-window attention settings.
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # Remaining attention, activation, initialisation and runtime options.
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.attn_implementation = attn_implementation

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

149

1	`# coding=utf-8`
2	`# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.`
3	`#`
4	`# Licensed under the Apache License, Version 2.0 (the "License");`
5	`# you may not use this file except in compliance with the License.`
6	`# You may obtain a copy of the License at`
7	`#`
8	`# http://www.apache.org/licenses/LICENSE-2.0`
9	`#`
10	`# Unless required by applicable law or agreed to in writing, software`
11	`# distributed under the License is distributed on an "AS IS" BASIS,`
12	`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
13	`# See the License for the specific language governing permissions and`
14	`# limitations under the License.`
15	`""" Qwen2 model configuration"""`
16
17	`from transformers.configuration_utils import PretrainedConfig`
18	`from transformers.utils import logging`
19
20
# Module-level logger, named after this module via the transformers logging helper.
logger = logging.get_logger(__name__)

# Maps a published checkpoint identifier to the URL of its hosted `config.json`.
QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json",
}
26
27
class Qwen2Config(PretrainedConfig):
    r"""
    Configuration container for a [`Qwen2Model`].

    The values held here define the model architecture and are used to instantiate a Qwen2 model. Instantiating
    the class with its defaults yields a configuration similar to that of
    Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by
            the `input_ids` passed when calling [`Qwen2Model`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22016):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer stack.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer.
        num_key_value_heads (`int`, *optional*, defaults to 32):
            Number of key/value heads used to implement Grouped Query Attention (GQA). If
            `num_key_value_heads=num_attention_heads`, the model uses Multi Head Attention (MHA); if
            `num_key_value_heads=1`, the model uses Multi Query Attention (MQA); otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
            constructed by mean-pooling all the original heads within that group. For more details see [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). Passing `None` falls back to `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the RMS normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top
            use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        kwargs:
            Extra keyword arguments forwarded to [`PretrainedConfig`]. If `attn_implementation` is absent or
            `None`, it is set to `"flash_attention_2"` before being forwarded; the resolved value is also stored
            on the instance as `attn_implementation`.

    ```python
    >>> from transformers import Qwen2Model, Qwen2Config

    >>> # Initializing a Qwen2 style configuration
    >>> configuration = Qwen2Config()

    >>> # Initializing a model from the Qwen2-7B style configuration
    >>> model = Qwen2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Backward compatibility: callers that pass no key/value head count
        # get one key/value head per attention head.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        # Resolve the attention backend once: an absent or `None` entry is
        # replaced by "flash_attention_2". The resolved value is written back
        # into `kwargs` so the base class receives the same choice.
        attn_implementation = kwargs.get("attn_implementation")
        if attn_implementation is None:
            attn_implementation = "flash_attention_2"
        kwargs["attn_implementation"] = attn_implementation

        # Vocabulary, context length and layer geometry.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Sliding-window attention settings.
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # Remaining attention, activation, initialisation and runtime options.
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.attn_implementation = attn_implementation

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
149