import json
import os
from typing import List, Optional, Tuple

from transformers import PreTrainedTokenizerFast


class TessarTokenizer(PreTrainedTokenizerFast):
    """
    Tessar Tokenizer implementation for Hugging Face Transformers
    """
    
    model_input_names = ['input_ids', 'attention_mask']
    
    def __init__(
        self, 
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="<unk>",
        sep_token="</s>",
        pad_token="<pad>",
        cls_token="<s>",
        mask_token="<mask>",
        bos_token="<s>",
        eos_token="</s>",
        max_cell_length=15,
        **kwargs
    ):
        """
        Initialize the Tessar Tokenizer with specific token configurations
        
        Args:
            vocab_file (str, optional): Path to the vocabulary file
            tokenizer_file (str, optional): Path to the pre-trained tokenizer file
            do_lower_case (bool, optional): Whether to lowercase the input. Defaults to True.
            max_cell_length (int, optional): Maximum length for cell tokenization. Defaults to 15.
        """
        # Prepare special tokens
        special_tokens = {
            "unk_token": unk_token,
            "sep_token": sep_token,
            "pad_token": pad_token,
            "cls_token": cls_token,
            "mask_token": mask_token,
            "bos_token": bos_token,
            "eos_token": eos_token,
        }
        
        # Remove None values
        special_tokens = {k: v for k, v in special_tokens.items() if v is not None}
        
        # Call parent constructor
        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            **special_tokens,
            **kwargs
        )
        
        # Custom Tessar-specific attributes
        self.do_lower_case = do_lower_case
        self.max_cell_length = max_cell_length
    
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, str]:
        """
        Save the tokenizer vocabulary and special tokens file
        
        Args:
            save_directory (str): Directory to save the vocabulary
            filename_prefix (str, optional): Prefix for the saved files
        
        Returns:
            tuple: Paths to the saved files
        """
        # Ensure the target directory exists before writing
        os.makedirs(save_directory, exist_ok=True)

        # Prepare file paths
        vocab_file = os.path.join(
            save_directory,
            f"{filename_prefix + '-' if filename_prefix else ''}vocab.json"
        )
        
        # Save special tokens configuration
        special_tokens_file = os.path.join(
            save_directory, 
            f"{filename_prefix + '-' if filename_prefix else ''}special_tokens.json"
        )
        
        # Save vocabulary
        with open(vocab_file, 'w', encoding='utf-8') as f:
            json.dump(self.vocab, f, ensure_ascii=False, indent=2)
        
        # Save special tokens configuration
        special_tokens_config = {
            "unk_token": self.unk_token,
            "sep_token": self.sep_token,
            "pad_token": self.pad_token,
            "cls_token": self.cls_token,
            "mask_token": self.mask_token,
            "bos_token": self.bos_token,
            "eos_token": self.eos_token,
            "do_lower_case": self.do_lower_case,
            "max_cell_length": self.max_cell_length
        }
        
        with open(special_tokens_file, 'w', encoding='utf-8') as f:
            json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)
        
        return (vocab_file, special_tokens_file)
    
    def tokenize(self, text: str, **kwargs) -> List[str]:
        """
        Tokenize text with Tessar-specific preprocessing.

        Note: fast tokenizers do not implement ``_tokenize``, so the public
        ``tokenize`` method is overridden here instead.

        Args:
            text (str): Input text to tokenize

        Returns:
            List[str]: List of tokens, truncated to ``max_cell_length``
        """
        # Apply lowercasing if required
        if self.do_lower_case:
            text = text.lower()

        # Delegate to the fast backend tokenizer
        tokens = super().tokenize(text, **kwargs)

        # Truncate to the configured maximum cell length
        return tokens[:self.max_cell_length]
    
    def prepare_for_model(
        self, 
        ids: List[int], 
        pair_ids: Optional[List[int]] = None, 
        **kwargs
    ) -> dict:
        """
        Prepare tokenized inputs for the model
        
        Args:
            ids (List[int]): List of input token ids
            pair_ids (Optional[List[int]], optional): List of pair token ids
        
        Returns:
            dict: Prepared model inputs
        """
        # Implement any Tessar-specific model preparation logic
        # This method can be extended to add Tessar-specific preprocessing
        return super().prepare_for_model(ids, pair_ids, **kwargs)


# Example usage and initialization
def load_tessar_tokenizer(pretrained_model_name_or_path: str):
    """
    Load a pretrained Tessar tokenizer
    
    Args:
        pretrained_model_name_or_path (str): Path to the pretrained model
    
    Returns:
        TessarTokenizer: Initialized tokenizer
    """
    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
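

if __name__ == "__main__":
    # Minimal usage sketch. "path/to/tessar-checkpoint" is a hypothetical
    # directory containing a ``tokenizer.json``; substitute a real local
    # path or Hub model id before running.
    tokenizer = load_tessar_tokenizer("path/to/tessar-checkpoint")

    # The overridden ``tokenize`` lowercases the input and truncates each
    # cell to at most ``max_cell_length`` tokens.
    print(tokenizer.tokenize("Example Table Cell Text"))

    # Full encoding still goes through the fast backend as usual.
    encoded = tokenizer("Example table cell text")
    print(encoded["input_ids"], encoded["attention_mask"])

    # Persist the vocabulary and special-token configuration.
    vocab_path, special_tokens_path = tokenizer.save_vocabulary(
        ".", filename_prefix="tessar"
    )
    print(f"Saved: {vocab_path}, {special_tokens_path}")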