configuration_phi3_v.py · Phi-3.5-vision-instruct

1

# coding=utf-8

2

3

#

4

# Licensed under the Apache License, Version 2.0 (the "License");

5

# you may not use this file except in compliance with the License.

6

# You may obtain a copy of the License at

7

#

8

# http://www.apache.org/licenses/LICENSE-2.0

9

#

10

# Unless required by applicable law or agreed to in writing, software

11

# distributed under the License is distributed on an "AS IS" BASIS,

12

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13

# See the License for the specific language governing permissions and

14

# limitations under the License.

15

16

""" Phi-3-V model configuration"""

17

18

19

from transformers.configuration_utils import PretrainedConfig

20

from transformers.utils import logging

21

22

23

# Module-level logger, created through the `logging` helper imported from `transformers.utils`
# and keyed by this module's name.
logger = logging.get_logger(__name__)

24

25

# Maps each published Phi-3-V checkpoint id to the URL of its `config.json` on the Hugging Face Hub.
# Every URL follows the same `<hub>/<repo id>/resolve/main/config.json` layout, so it is derived
# from the repository id instead of being spelled out twice.
PHI3V_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    repo_id: f"https://huggingface.co/{repo_id}/resolve/main/config.json"
    for repo_id in (
        "microsoft/Phi-3-vision-128k-instruct",
        "microsoft/Phi-3.5-vision-instruct",
    )
}

29

30

31

class Phi3VConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Phi3VModel`]. It is used to instantiate a Phi-3
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the
    [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32064):
            Vocabulary size of the Phi-3-V model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`Phi3VModel`].
        hidden_size (`int`, *optional*, defaults to 3072):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            Dropout probability for mlp outputs.
        embd_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio after computing the attention scores.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        original_max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model was trained with. This is used to determine the size of the
            original RoPE embeddings when using long scaling.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon value used for the RMSNorm.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
            contain exactly the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either
            `su` or `yarn` and the `short_factor` and `long_factor` must be lists of numbers with the same length as
            the hidden size divided by the number of attention heads divided by 2.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 32000):
            The id of the "end-of-sequence" token.
        pad_token_id (`int`, *optional*, defaults to 32000):
            The id of the padding token.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If `None`, no sliding window is applied.
        embd_layer (`str`, *optional*, defaults to `"default"`):
            The embedding layer to use. Can be either `"default"` or `"image"`. "default" uses the standard embedding
            for text.

    Raises:
        ValueError: If `rope_scaling` is given but does not satisfy the constraints described above.

    Example:

    ```python
    >>> from transformers import Phi3VModel, Phi3VConfig

    >>> # Initializing a Phi-3-V style configuration
    >>> configuration = Phi3VConfig.from_pretrained("microsoft/Phi-3-vision-128k-instruct")

    >>> # Initializing a model from the configuration
    >>> model = Phi3VModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "phi3_v"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32064,
        hidden_size=3072,
        intermediate_size=8192,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        resid_pdrop=0.0,
        embd_pdrop=0.0,
        attention_dropout=0.0,
        hidden_act="silu",
        max_position_embeddings=4096,
        original_max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        bos_token_id=1,
        eos_token_id=32000,
        pad_token_id=32000,
        sliding_window=None,
        embd_layer: str = "default",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # An unspecified key/value head count falls back to one per attention head.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.original_max_position_embeddings = original_max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # The validation reads `hidden_size` and `num_attention_heads`, so it must run after they are set.
        self._rope_scaling_validation()
        self.sliding_window = sliding_window
        self.embd_layer = embd_layer

        # Token ids, weight tying and any remaining keyword arguments are handled by the base class.
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Nothing is checked when `rope_scaling` is `None`. Otherwise it must be a dictionary with exactly three
        entries, a `type` of `su` or `yarn`, and `short_factor` / `long_factor` lists of numbers whose length is
        `hidden_size // num_attention_heads // 2`.

        Raises:
            ValueError: If any of the constraints above is violated.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
            raise ValueError(
                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
                f"got {self.rope_scaling}"
            )

        # A missing `type` yields `None`, which is rejected by the same membership test.
        rope_scaling_type = self.rope_scaling.get("type", None)
        if rope_scaling_type not in ["su", "yarn"]:
            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")

        # Both factor lists obey identical rules; they are checked in order, short first.
        for field_name in ("short_factor", "long_factor"):
            factor = self.rope_scaling.get(field_name, None)
            if not (isinstance(factor, list) and all(isinstance(x, (int, float)) for x in factor)):
                raise ValueError(f"`rope_scaling`'s {field_name} field must be a list of numbers, got {factor}")

            expected_length = self.hidden_size // self.num_attention_heads // 2
            if len(factor) != expected_length:
                raise ValueError(
                    f"`rope_scaling`'s {field_name} field must have length {expected_length}, got {len(factor)}"
                )

1	`# coding=utf-8`
2	`# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.`
3	`#`
4	`# Licensed under the Apache License, Version 2.0 (the "License");`
5	`# you may not use this file except in compliance with the License.`
6	`# You may obtain a copy of the License at`
7	`#`
8	`# http://www.apache.org/licenses/LICENSE-2.0`
9	`#`
10	`# Unless required by applicable law or agreed to in writing, software`
11	`# distributed under the License is distributed on an "AS IS" BASIS,`
12	`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
13	`# See the License for the specific language governing permissions and`
14	`# limitations under the License.`
15
16	`""" Phi-3-V model configuration"""`
17
18
19	`from transformers.configuration_utils import PretrainedConfig`
20	`from transformers.utils import logging`
21
22
# Module-level logger, created through the `logging` helper imported from `transformers.utils`
# and keyed by this module's name.
logger = logging.get_logger(__name__)

# Maps each published Phi-3-V checkpoint id to the URL of its `config.json` on the Hugging Face Hub.
# Every URL follows the same `<hub>/<repo id>/resolve/main/config.json` layout, so it is derived
# from the repository id instead of being spelled out twice.
PHI3V_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    repo_id: f"https://huggingface.co/{repo_id}/resolve/main/config.json"
    for repo_id in (
        "microsoft/Phi-3-vision-128k-instruct",
        "microsoft/Phi-3.5-vision-instruct",
    )
}
29
30
class Phi3VConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Phi3VModel`]. It is used to instantiate a Phi-3
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the
    [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32064):
            Vocabulary size of the Phi-3-V model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`Phi3VModel`].
        hidden_size (`int`, *optional*, defaults to 3072):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            Dropout probability for mlp outputs.
        embd_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio after computing the attention scores.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        original_max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model was trained with. This is used to determine the size of the
            original RoPE embeddings when using long scaling.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon value used for the RMSNorm.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
            contain exactly the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either
            `su` or `yarn` and the `short_factor` and `long_factor` must be lists of numbers with the same length as
            the hidden size divided by the number of attention heads divided by 2.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 32000):
            The id of the "end-of-sequence" token.
        pad_token_id (`int`, *optional*, defaults to 32000):
            The id of the padding token.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If `None`, no sliding window is applied.
        embd_layer (`str`, *optional*, defaults to `"default"`):
            The embedding layer to use. Can be either `"default"` or `"image"`. "default" uses the standard embedding
            for text.

    Raises:
        ValueError: If `rope_scaling` is given but does not satisfy the constraints described above.

    Example:

    ```python
    >>> from transformers import Phi3VModel, Phi3VConfig

    >>> # Initializing a Phi-3-V style configuration
    >>> configuration = Phi3VConfig.from_pretrained("microsoft/Phi-3-vision-128k-instruct")

    >>> # Initializing a model from the configuration
    >>> model = Phi3VModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "phi3_v"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32064,
        hidden_size=3072,
        intermediate_size=8192,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        resid_pdrop=0.0,
        embd_pdrop=0.0,
        attention_dropout=0.0,
        hidden_act="silu",
        max_position_embeddings=4096,
        original_max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        bos_token_id=1,
        eos_token_id=32000,
        pad_token_id=32000,
        sliding_window=None,
        embd_layer: str = "default",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # An unspecified key/value head count falls back to one per attention head.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.original_max_position_embeddings = original_max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # The validation reads `hidden_size` and `num_attention_heads`, so it must run after they are set.
        self._rope_scaling_validation()
        self.sliding_window = sliding_window
        self.embd_layer = embd_layer

        # Token ids, weight tying and any remaining keyword arguments are handled by the base class.
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Nothing is checked when `rope_scaling` is `None`. Otherwise it must be a dictionary with exactly three
        entries, a `type` of `su` or `yarn`, and `short_factor` / `long_factor` lists of numbers whose length is
        `hidden_size // num_attention_heads // 2`.

        Raises:
            ValueError: If any of the constraints above is violated.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
            raise ValueError(
                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
                f"got {self.rope_scaling}"
            )

        # A missing `type` yields `None`, which is rejected by the same membership test.
        rope_scaling_type = self.rope_scaling.get("type", None)
        if rope_scaling_type not in ["su", "yarn"]:
            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")

        # Both factor lists obey identical rules; they are checked in order, short first.
        for field_name in ("short_factor", "long_factor"):
            factor = self.rope_scaling.get(field_name, None)
            if not (isinstance(factor, list) and all(isinstance(x, (int, float)) for x in factor)):
                raise ValueError(f"`rope_scaling`'s {field_name} field must be a list of numbers, got {factor}")

            expected_length = self.hidden_size // self.num_attention_heads // 2
            if len(factor) != expected_length:
                raise ValueError(
                    f"`rope_scaling`'s {field_name} field must have length {expected_length}, got {len(factor)}"
                )