# tokenization_qwen.py
from typing import List, Optional

from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as OriginalQwen2Tokenizer
from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast as OriginalQwen2TokenizerFast
from tokenizers import processors

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
    "tokenizer_file": "tokenizer.json",
}

class Qwen2Tokenizer(OriginalQwen2Tokenizer):
    """
    Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.

    As with GPT2Tokenizer, this tokenizer has been trained to treat spaces as parts of the tokens, so a word will
    be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:

    ```python
    >>> from transformers import Qwen2Tokenizer

    >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]

    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
    ```
    This is expected.

    You should not use GPT2Tokenizer instead, because its pretokenization rules are different.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not the model should clean up the spaces that were added when splitting the input text during
            the tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
        split_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the special tokens should be split during the tokenization process. The default behavior is
            to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then
            `tokenizer.tokenize("<|endoftext|>")` will be `['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`,
            then `tokenizer.tokenize("<|endoftext|>")` will give `['<', '|', 'endo', 'ft', 'ext', '|', '>']`. This
            argument is only supported for `slow` tokenizers for the moment.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
    """

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        clean_up_tokenization_spaces=False,
        split_special_tokens=False,
        add_eos_token=False,
        **kwargs,
    ):
        # The add_eos_token code was inspired by the LlamaTokenizer
        self.add_eos_token = add_eos_token

        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            split_special_tokens=split_special_tokens,
            add_eos_token=add_eos_token,
            **kwargs,
        )

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
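        """
        Build model inputs from a sequence (or a pair of sequences) by appending the `eos_token`
        to each segment when `add_eos_token=True`; otherwise the ids are returned unchanged.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: The model input IDs with the appropriate special tokens.
        """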
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + eos_token_id
        return (
            ([0] * len(token_ids_0))
            + eos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. The
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(token_ids_1 + eos_token_id)

        return output

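# A minimal usage sketch for the slow tokenizer above, with add_eos_token=True. The directory
# "path/to/qwen2-tokenizer" is a hypothetical local path holding vocab.json and merges.txt:
#
#   tok = Qwen2Tokenizer.from_pretrained("path/to/qwen2-tokenizer", add_eos_token=True)
#   tok("Hello world")["input_ids"]                              # token ids followed by tok.eos_token_id
#   tok.get_special_tokens_mask([9707, 1879])                    # [0, 0, 1] - trailing 1 marks the appended EOS
#   tok.create_token_type_ids_from_sequences([9707, 1879], [9707])
#                                                                # [0, 0, 0, 1, 1] - second segment marked with 1s
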
class Qwen2TokenizerFast(OriginalQwen2TokenizerFast):
    """
    Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    As with GPT2Tokenizer, this tokenizer has been trained to treat spaces as parts of the tokens, so a word will
    be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:

    ```python
    >>> from transformers import Qwen2TokenizerFast

    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]

    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
    ```
    This is expected.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to a [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension)
            that contains everything needed to load the tokenizer.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead. Not applicable to this tokenizer.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
    """

    slow_tokenizer_class = Qwen2Tokenizer
    padding_side = "left"

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        add_eos_token=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

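        # add_eos_token is not forwarded to the parent class; it is kept on a private
        # attribute and the backend post processor is rebuilt so that the fast tokenizer
        # appends the EOS token itself.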
        self._add_eos_token = add_eos_token
        self.update_post_processor()

    def update_post_processor(self):
        """
        Updates the underlying post processor with the current `eos_token`.
        """
        eos = self.eos_token
        eos_token_id = self.eos_token_id
        if eos is None and self.add_eos_token:
            raise ValueError("add_eos_token = True but eos_token = None")

        single = f"$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
        pair = f"{single} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
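        # With add_eos_token=True and eos="<|endoftext|>", the templates read
        #   single: "$A:0 <|endoftext|>:0"
        #   pair:   "$A:0 <|endoftext|>:0 $B:1 <|endoftext|>:1"
        # i.e. TemplateProcessing appends the EOS token to each segment.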

        special_tokens = []
        if self.add_eos_token:
            special_tokens.append((eos, eos_token_id))
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )

    @property
    def add_eos_token(self):
        return self._add_eos_token
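
# A minimal sketch of how the fast tokenizer above might be used, assuming a hypothetical local
# directory "path/to/qwen2-tokenizer" that contains tokenizer.json:
#
#   tok = Qwen2TokenizerFast.from_pretrained("path/to/qwen2-tokenizer", add_eos_token=True)
#   tok("Hello world")["input_ids"]   # token ids followed by tok.eos_token_id
#
# Note that only a getter is defined for `add_eos_token`; changing `self._add_eos_token` after
# construction requires calling update_post_processor() again so the backend template stays in sync.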