# tokenization_qwen.py
| 1 | |
| 2 | from typing import List, Optional |
| 3 | from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as OriginalQwen2Tokenizer |
| 4 | from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast as OriginalQwen2TokenizerFast |
| 5 | from tokenizers import processors |
| 6 | |
# Canonical resource names mapped to the filenames they are stored under.
VOCAB_FILES_NAMES = dict(
    vocab_file="vocab.json",
    merges_file="merges.txt",
    tokenizer_file="tokenizer.json",
)
| 12 | |
class Qwen2Tokenizer(OriginalQwen2Tokenizer):
    """
    Slow Qwen2 tokenizer (byte-level BPE), extended with optional EOS appending.

    Like GPT2Tokenizer, it was trained treating leading spaces as part of the
    tokens, so a word encodes differently at the start of a sentence (no space)
    than elsewhere:

    ```python
    >>> from transformers import Qwen2Tokenizer

    >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]

    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
    ```
    This is expected. Do not substitute GPT2Tokenizer — its pretokenization
    rules differ.

    Inherits from [`PreTrainedTokenizer`]; see that superclass for the bulk of
    the available methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Error-handling scheme used when decoding bytes to UTF-8; see
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode).
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            Token substituted for out-of-vocabulary input.
        bos_token (`str`, *optional*):
            Beginning-of-sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            End-of-sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            Padding token used when batching sequences of different lengths.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether to clean up spaces added during tokenization. Not
            applicable here, since tokenization adds no spaces.
        split_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether special tokens may be split during tokenization. With the
            default, `tokenizer.tokenize("<|endoftext|>")` yields
            `['<|endoftext|>']`; with `True` it yields
            `['<', '|', 'endo', 'ft', 'ext', '|', '>']`. Slow tokenizers only.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether to append `eos_token` to encoded sequences.
    """

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        clean_up_tokenization_spaces=False,
        split_special_tokens=False,
        add_eos_token=False,
        **kwargs,
    ):
        # Pattern borrowed from LlamaTokenizer: record the flag before
        # delegating, since the base __init__ may invoke methods that read it.
        self.add_eos_token = add_eos_token

        # add_eos_token is also forwarded so the base class persists it in the
        # saved tokenizer configuration.
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            split_special_tokens=split_special_tokens,
            add_eos_token=add_eos_token,
            **kwargs,
        )

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Concatenate the sequence(s), appending the EOS id after each one when enabled."""
        suffix = [self.eos_token_id] if self.add_eos_token else []
        if token_ids_1 is None:
            return token_ids_0 + suffix
        return token_ids_0 + suffix + token_ids_1 + suffix

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            # Base class can infer the mask directly from the ids.
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Mirror build_inputs_with_special_tokens: one trailing 1 per appended EOS.
        suffix = [1] if self.add_eos_token else []
        mask = [0] * len(token_ids_0) + suffix
        if token_ids_1 is not None:
            mask += [0] * len(token_ids_1) + suffix
        return mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is None, only the first portion of the mask (0s) is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        # Each segment contributes one extra position when an EOS is appended.
        extra = 1 if self.add_eos_token else 0
        type_ids = [0] * (len(token_ids_0) + extra)
        if token_ids_1 is not None:
            type_ids += [1] * (len(token_ids_1) + extra)
        return type_ids
| 172 | |
| 173 | class Qwen2TokenizerFast(OriginalQwen2TokenizerFast): |
| 174 | """ |
| 175 | Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level |
| 176 | Byte-Pair-Encoding. |
| 177 | |
| 178 | Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will |
| 179 | be encoded differently whether it is at the beginning of the sentence (without space) or not: |
| 180 | |
| 181 | ```python |
| 182 | >>> from transformers import Qwen2TokenizerFast |
| 183 | |
| 184 | >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer") |
| 185 | >>> tokenizer("Hello world")["input_ids"] |
| 186 | [9707, 1879] |
| 187 | |
| 188 | >>> tokenizer(" Hello world")["input_ids"] |
| 189 | [21927, 1879] |
| 190 | ``` |
| 191 | This is expected. |
| 192 | |
| 193 | This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should |
| 194 | refer to this superclass for more information regarding those methods. |
| 195 | |
| 196 | Args: |
| 197 | vocab_file (`str`, *optional*): |
| 198 | Path to the vocabulary file. |
| 199 | merges_file (`str`, *optional*): |
| 200 | Path to the merges file. |
| 201 | tokenizer_file (`str`, *optional*): |
| 202 | Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that |
| 203 | contains everything needed to load the tokenizer. |
| 204 | unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): |
| 205 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this |
| 206 | token instead. Not applicable to this tokenizer. |
| 207 | bos_token (`str`, *optional*): |
| 208 | The beginning of sequence token. Not applicable for this tokenizer. |
| 209 | eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): |
| 210 | The end of sequence token. |
| 211 | pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): |
| 212 | The token used for padding, for example when batching sequences of different lengths. |
| 213 | add_eos_token (`bool`, *optional*, defaults to `False`): |
| 214 | Whether or not to add an `eos_token` at the end of sequences. |
| 215 | """ |
| 216 | |
| 217 | slow_tokenizer_class = Qwen2Tokenizer |
| 218 | padding_side = "left" |
| 219 | |
| 220 | def __init__( |
| 221 | self, |
| 222 | vocab_file=None, |
| 223 | merges_file=None, |
| 224 | tokenizer_file=None, |
| 225 | unk_token="<|endoftext|>", |
| 226 | bos_token=None, |
| 227 | eos_token="<|endoftext|>", |
| 228 | pad_token="<|endoftext|>", |
| 229 | add_eos_token=False, |
| 230 | **kwargs, |
| 231 | ): |
| 232 | super().__init__( |
| 233 | vocab_file=vocab_file, |
| 234 | merges_file=merges_file, |
| 235 | tokenizer_file=tokenizer_file, |
| 236 | unk_token=unk_token, |
| 237 | bos_token=bos_token, |
| 238 | eos_token=eos_token, |
| 239 | pad_token=pad_token, |
| 240 | **kwargs, |
| 241 | ) |
| 242 | |
| 243 | self._add_eos_token = add_eos_token |
| 244 | self.update_post_processor() |
| 245 | |
| 246 | def update_post_processor(self): |
| 247 | """ |
| 248 | Updates the underlying post processor with the current `eos_token`. |
| 249 | """ |
| 250 | eos = self.eos_token |
| 251 | eos_token_id = self.eos_token_id |
| 252 | if eos is None and self.add_eos_token: |
| 253 | raise ValueError("add_eos_token = True but eos_token = None") |
| 254 | |
| 255 | single = f"$A:0{(' '+eos+':0') if self.add_eos_token else ''}" |
| 256 | pair = f"{single} $B:1{(' '+eos+':1') if self.add_eos_token else ''}" |
| 257 | |
| 258 | special_tokens = [] |
| 259 | if self.add_eos_token: |
| 260 | special_tokens.append((eos, eos_token_id)) |
| 261 | self._tokenizer.post_processor = processors.TemplateProcessing( |
| 262 | single=single, pair=pair, special_tokens=special_tokens |
| 263 | ) |
| 264 | |
    @property
    def add_eos_token(self):
        # Read-only accessor for the flag stored by `__init__`; exposing it as
        # a property keeps the attribute surface aligned with the slow
        # `Qwen2Tokenizer`. (Any setter would be outside this view.)
        return self._add_eos_token