update tokenization.py
Browse files- tokenization_qwen.py +2 -1
tokenization_qwen.py
CHANGED
|
@@ -42,6 +42,7 @@ SPECIAL_TOKENS = tuple(
|
|
| 42 |
start=SPECIAL_START_ID,
|
| 43 |
)
|
| 44 |
)
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
|
|
@@ -160,7 +161,7 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
| 160 |
raise ValueError("Adding regular tokens is not supported")
|
| 161 |
for token in new_tokens:
|
| 162 |
surface_form = token.content if isinstance(token, AddedToken) else token
|
| 163 |
-
if surface_form not in SPECIAL_TOKENS:
|
| 164 |
raise ValueError("Adding unknown special tokens is not supported")
|
| 165 |
return 0
|
| 166 |
|
|
|
|
| 42 |
start=SPECIAL_START_ID,
|
| 43 |
)
|
| 44 |
)
|
| 45 |
+
SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
|
| 46 |
|
| 47 |
|
| 48 |
def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
|
|
|
|
| 161 |
raise ValueError("Adding regular tokens is not supported")
|
| 162 |
for token in new_tokens:
|
| 163 |
surface_form = token.content if isinstance(token, AddedToken) else token
|
| 164 |
+
if surface_form not in SPECIAL_TOKENS_SET:
|
| 165 |
raise ValueError("Adding unknown special tokens is not supported")
|
| 166 |
return 0
|
| 167 |
|