Fix incorrect unk_id assignment
Browse files
`self.unk_id` is currently assigned the ID of `pad_token`, which is incorrect. Judging from the original comment, the intent was to use `self.unk_token` (the second-to-last special token) to look up its own ID; this PR makes that correction.
However, I think this fix might affect the performance of existing checkpoints unless the model is retrained with the corrected `unk_id`.
- tokenization_kimia.py +1 -1
tokenization_kimia.py
CHANGED
@@ -127,7 +127,7 @@ class TikTokenTokenizer(PreTrainedTokenizer):
|
|
127 |
self.pad_id: int = self.special_tokens[self.pad_token]
|
128 |
|
129 |
self.unk_token: str = special_tokens[-2]
|
130 |
-
self.unk_id: int = self.special_tokens[self.pad_token]
|
131 |
|
132 |
self.stop_tokens = {
|
133 |
self.special_tokens["[EOS]"],
|
|
|
127 |
self.pad_id: int = self.special_tokens[self.pad_token]
|
128 |
|
129 |
self.unk_token: str = special_tokens[-2]
|
130 |
+
self.unk_id: int = self.special_tokens[self.unk_token]
|
131 |
|
132 |
self.stop_tokens = {
|
133 |
self.special_tokens["[EOS]"],
|