# tokenization_qwen.py
from typing import List, Optional

from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as OriginalQwen2Tokenizer
from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast as OriginalQwen2TokenizerFast
from tokenizers import processors

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
    "tokenizer_file": "tokenizer.json",
}

class Qwen2Tokenizer(OriginalQwen2Tokenizer):
    """
    Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.

    As with GPT2Tokenizer, this tokenizer has been trained to treat spaces as parts of the tokens, so a word will
    be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:

    ```python
    >>> from transformers import Qwen2Tokenizer

    >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]

    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
    ```
    This is expected.

    You should not use GPT2Tokenizer instead, because its pretokenization rules are different.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not the model should clean up the spaces that were added when splitting the input text during
            the tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
        split_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the special tokens should be split during the tokenization process. The default behavior is
            to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then
            `tokenizer.tokenize("<|endoftext|>")` will be `['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`,
            then `tokenizer.tokenize("<|endoftext|>")` will give `['<', '|', 'endo', 'ft', 'ext', '|', '>']`. This
            argument is only supported for `slow` tokenizers for the moment.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
    """

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        clean_up_tokenization_spaces=False,
        split_special_tokens=False,
        add_eos_token=False,
        **kwargs,
    ):
        # The add_eos_token code was inspired by the LlamaTokenizer
        self.add_eos_token = add_eos_token

        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            split_special_tokens=split_special_tokens,
            add_eos_token=add_eos_token,
            **kwargs,
        )

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
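        """
        Build model inputs from a sequence (or a pair of sequences) by appending the `eos_token`
        to each segment when `add_eos_token=True`; otherwise the ids are returned unchanged.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: The model input IDs with the appropriate special tokens.
        """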
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + eos_token_id
        return (
            ([0] * len(token_ids_0))
            + eos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. The
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(token_ids_1 + eos_token_id)

        return output

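# A minimal usage sketch for the slow tokenizer above, with add_eos_token=True. The directory
# "path/to/qwen2-tokenizer" is a hypothetical local path holding vocab.json and merges.txt:
#
#   tok = Qwen2Tokenizer.from_pretrained("path/to/qwen2-tokenizer", add_eos_token=True)
#   tok("Hello world")["input_ids"]                              # token ids followed by tok.eos_token_id
#   tok.get_special_tokens_mask([9707, 1879])                    # [0, 0, 1] - trailing 1 marks the appended EOS
#   tok.create_token_type_ids_from_sequences([9707, 1879], [9707])
#                                                                # [0, 0, 0, 1, 1] - second segment marked with 1s
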
class Qwen2TokenizerFast(OriginalQwen2TokenizerFast):
    """
    Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    As with GPT2Tokenizer, this tokenizer has been trained to treat spaces as parts of the tokens, so a word will
    be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:

    ```python
    >>> from transformers import Qwen2TokenizerFast

    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]

    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
    ```
    This is expected.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to a [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension)
            that contains everything needed to load the tokenizer.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead. Not applicable to this tokenizer.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
    """

    slow_tokenizer_class = Qwen2Tokenizer
    padding_side = "left"

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        add_eos_token=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

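        # add_eos_token is not forwarded to the parent class; it is kept on a private
        # attribute and the backend post processor is rebuilt so that the fast tokenizer
        # appends the EOS token itself.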
        self._add_eos_token = add_eos_token
        self.update_post_processor()

    def update_post_processor(self):
        """
        Updates the underlying post processor with the current `eos_token`.
        """
        eos = self.eos_token
        eos_token_id = self.eos_token_id
        if eos is None and self.add_eos_token:
            raise ValueError("add_eos_token = True but eos_token = None")

        single = f"$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
        pair = f"{single} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
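        # With add_eos_token=True and eos="<|endoftext|>", the templates read
        #   single: "$A:0 <|endoftext|>:0"
        #   pair:   "$A:0 <|endoftext|>:0 $B:1 <|endoftext|>:1"
        # i.e. TemplateProcessing appends the EOS token to each segment.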

        special_tokens = []
        if self.add_eos_token:
            special_tokens.append((eos, eos_token_id))
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )

    @property
    def add_eos_token(self):
        return self._add_eos_token
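
# A minimal sketch of how the fast tokenizer above might be used, assuming a hypothetical local
# directory "path/to/qwen2-tokenizer" that contains tokenizer.json:
#
#   tok = Qwen2TokenizerFast.from_pretrained("path/to/qwen2-tokenizer", add_eos_token=True)
#   tok("Hello world")["input_ids"]   # token ids followed by tok.eos_token_id
#
# Note that only a getter is defined for `add_eos_token`; changing `self._add_eos_token` after
# construction requires calling update_post_processor() again so the backend template stays in sync.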