import json
import os
from typing import List, Optional, Tuple

from transformers import PreTrainedTokenizerFast

class TessarTokenizer(PreTrainedTokenizerFast):
    """
    Tessar Tokenizer implementation for Hugging Face Transformers.
    """

    model_input_names = ['input_ids', 'attention_mask']
    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="<unk>",
        sep_token="</s>",
        pad_token="<pad>",
        cls_token="<s>",
        mask_token="<mask>",
        bos_token="<s>",
        eos_token="</s>",
        max_cell_length=15,
        **kwargs
    ):
        """
        Initialize the Tessar Tokenizer with specific token configurations.

        Args:
            vocab_file (str, optional): Path to the vocabulary file
            tokenizer_file (str, optional): Path to the pre-trained tokenizer file
            do_lower_case (bool, optional): Whether to lowercase the input. Defaults to True.
            max_cell_length (int, optional): Maximum length for cell tokenization. Defaults to 15.
        """
        # Prepare special tokens
        special_tokens = {
            "unk_token": unk_token,
            "sep_token": sep_token,
            "pad_token": pad_token,
            "cls_token": cls_token,
            "mask_token": mask_token,
            "bos_token": bos_token,
            "eos_token": eos_token,
        }
        # Remove None values
        special_tokens = {k: v for k, v in special_tokens.items() if v is not None}

        # Call parent constructor
        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            **special_tokens,
            **kwargs
        )

        # Custom Tessar-specific attributes
        self.do_lower_case = do_lower_case
        self.max_cell_length = max_cell_length
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, str]:
        """
        Save the tokenizer vocabulary and special tokens file.

        Args:
            save_directory (str): Directory to save the vocabulary
            filename_prefix (str, optional): Prefix for the saved files

        Returns:
            tuple: Paths to the saved files
        """
        # Make sure the output directory exists
        os.makedirs(save_directory, exist_ok=True)

        # Prepare file paths
        vocab_file = os.path.join(
            save_directory,
            f"{filename_prefix + '-' if filename_prefix else ''}vocab.json"
        )
        special_tokens_file = os.path.join(
            save_directory,
            f"{filename_prefix + '-' if filename_prefix else ''}special_tokens.json"
        )

        # Save vocabulary
        with open(vocab_file, 'w', encoding='utf-8') as f:
            json.dump(self.vocab, f, ensure_ascii=False, indent=2)

        # Save special tokens and Tessar-specific configuration
        special_tokens_config = {
            "unk_token": self.unk_token,
            "sep_token": self.sep_token,
            "pad_token": self.pad_token,
            "cls_token": self.cls_token,
            "mask_token": self.mask_token,
            "bos_token": self.bos_token,
            "eos_token": self.eos_token,
            "do_lower_case": self.do_lower_case,
            "max_cell_length": self.max_cell_length,
        }
        with open(special_tokens_file, 'w', encoding='utf-8') as f:
            json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)

        return (vocab_file, special_tokens_file)
    def _tokenize(self, text: str) -> List[str]:
        """
        Custom tokenization method.

        Args:
            text (str): Input text to tokenize

        Returns:
            List[str]: List of tokens
        """
        # Apply lowercase if required
        if self.do_lower_case:
            text = text.lower()

        # Fast tokenizers expose tokenize() rather than _tokenize(), so delegate to it
        tokens = super().tokenize(text)

        # Apply cell-length truncation
        tokens = tokens[:self.max_cell_length]
        return tokens
    def prepare_for_model(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        **kwargs
    ) -> dict:
        """
        Prepare tokenized inputs for the model.

        Args:
            ids (List[int]): List of input token ids
            pair_ids (Optional[List[int]], optional): List of pair token ids

        Returns:
            dict: Prepared model inputs
        """
        # This method can be extended to add Tessar-specific preprocessing
        return super().prepare_for_model(ids, pair_ids=pair_ids, **kwargs)

# Example usage and initialization
def load_tessar_tokenizer(pretrained_model_name_or_path: str):
    """
    Load a pretrained Tessar tokenizer.

    Args:
        pretrained_model_name_or_path (str): Path to the pretrained model

    Returns:
        TessarTokenizer: Initialized tokenizer
    """
    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
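

# A minimal usage sketch. The checkpoint path "path/to/tessar-checkpoint" and the output
# directory "./tessar_tokenizer_out" are hypothetical placeholders, not published assets;
# this assumes the directory contains a tokenizer.json (or equivalent vocab files)
# compatible with PreTrainedTokenizerFast.
if __name__ == "__main__":
    tokenizer = load_tessar_tokenizer("path/to/tessar-checkpoint")

    # Call the internal override directly (for illustration only) to show that
    # lowercasing and max_cell_length truncation are applied
    cell_tokens = tokenizer._tokenize("Average Revenue Per Cell")
    print(cell_tokens)

    # Standard fast-tokenizer encoding still works as usual
    encoded = tokenizer("Average Revenue Per Cell")
    print(encoded["input_ids"])

    # Persist the vocabulary and the Tessar-specific settings
    vocab_path, special_tokens_path = tokenizer.save_vocabulary("./tessar_tokenizer_out")
    print(vocab_path, special_tokens_path)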