Commit: implement `_convert_id_to_token`
Files changed: tokenization_qwen.py (+28 −7)
tokenization_qwen.py
CHANGED
|
@@ -78,7 +78,7 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
| 78 |
|
| 79 |
self.errors = errors # how to handle errors in decoding
|
| 80 |
|
| 81 |
-
name = "
|
| 82 |
ENDOFTEXT = "<|endoftext|>"
|
| 83 |
IMSTART = "<|im_start|>"
|
| 84 |
IMEND = "<|im_end|>"
|
|
@@ -181,10 +181,6 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
| 181 |
Args:
|
| 182 |
text (`str`):
|
| 183 |
The sequence to be encoded.
|
| 184 |
-
pair (`str`, *optional*):
|
| 185 |
-
A second sequence to be encoded with the first.
|
| 186 |
-
add_special_tokens (`bool`, *optional*, defaults to `False`):
|
| 187 |
-
Whether or not to add the special tokens associated with the corresponding model.
|
| 188 |
kwargs (additional keyword arguments, *optional*):
|
| 189 |
Will be passed to the underlying model specific encode method. See details in
|
| 190 |
[`~PreTrainedTokenizerBase.__call__`]
|
|
@@ -214,7 +210,31 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
| 214 |
return self.tokenizer.n_vocab
|
| 215 |
|
| 216 |
def _convert_id_to_token(self, index: int) -> str:
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
def _tokenize(self, text, **kwargs):
|
| 220 |
"""
|
|
@@ -229,9 +249,10 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
| 229 |
self,
|
| 230 |
token_ids: Union[int, List[int]],
|
| 231 |
skip_special_tokens: bool = False,
|
| 232 |
-
clean_up_tokenization_spaces: bool = None,
|
| 233 |
**kwargs,
|
| 234 |
) -> str:
|
| 235 |
if isinstance(token_ids, int):
|
| 236 |
token_ids = [token_ids]
|
|
|
|
|
|
|
| 237 |
return self.tokenizer.decode(token_ids)
|
|
|
|
| 78 |
|
| 79 |
self.errors = errors # how to handle errors in decoding
|
| 80 |
|
| 81 |
+
name = "Qwen"
|
| 82 |
ENDOFTEXT = "<|endoftext|>"
|
| 83 |
IMSTART = "<|im_start|>"
|
| 84 |
IMEND = "<|im_end|>"
|
|
|
|
| 181 |
Args:
|
| 182 |
text (`str`):
|
| 183 |
The sequence to be encoded.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
kwargs (additional keyword arguments, *optional*):
|
| 185 |
Will be passed to the underlying model specific encode method. See details in
|
| 186 |
[`~PreTrainedTokenizerBase.__call__`]
|
|
|
|
| 210 |
return self.tokenizer.n_vocab
|
| 211 |
|
| 212 |
def _convert_id_to_token(self, index: int) -> str:
|
| 213 |
+
if index >= self.tokenizer.n_vocab:
|
| 214 |
+
return self.unk_token
|
| 215 |
+
return self.tokenizer.decode([index])
|
| 216 |
+
|
| 217 |
+
def _convert_token_to_id(self, token: str) -> int:
|
| 218 |
+
"""Converts a token to an id using the vocab."""
|
| 219 |
+
return self.encoder.get(token.encode('UTF-8'), self.tokenizer.encode(self.unk_token, allowed_special='all')[0])
|
| 220 |
+
|
| 221 |
+
@property
def all_special_tokens(self) -> List[str]:
    """
    `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.

    Convert tokens of `tokenizers.AddedToken` type to string.
    """
    # Iterating the dict yields its keys; coerce each to str so that
    # AddedToken entries come out as plain strings.
    return [str(token) for token in self.special_tokens]
|
| 230 |
+
|
| 231 |
+
@property
def all_special_ids(self) -> List[int]:
    """
    `List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
    """
    # The values of the special-token table are already the ids.
    return list(self.special_tokens.values())
|
| 238 |
|
| 239 |
def _tokenize(self, text, **kwargs):
|
| 240 |
"""
|
|
|
|
| 249 |
self,
|
| 250 |
token_ids: Union[int, List[int]],
|
| 251 |
skip_special_tokens: bool = False,
|
|
|
|
| 252 |
**kwargs,
|
| 253 |
) -> str:
|
| 254 |
if isinstance(token_ids, int):
|
| 255 |
token_ids = [token_ids]
|
| 256 |
+
if skip_special_tokens:
|
| 257 |
+
token_ids = [i for i in token_ids if i not in self.all_special_ids]
|
| 258 |
return self.tokenizer.decode(token_ids)
|