Ericu950 committed on
Commit 1989555 · verified · 1 Parent(s): ccecfdf

Upload tokenizer

Files changed (4)
  1. special_tokens_map.json +51 -0
  2. tokenizer.py +271 -0
  3. tokenizer_config.json +61 -0
  4. vocab.json +0 -0
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
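Each entry in this file corresponds to an AddedToken in transformers, with field names matching the JSON keys. A minimal illustration (not part of the commit) of how one of these entries maps onto code:

    from transformers import AddedToken

    # Mirrors the "bos_token"/"cls_token" entries above; the flags match the JSON fields.
    cls = AddedToken("[CLS]", lstrip=False, rstrip=False, normalized=False, single_word=False)
    # Passing such tokens (or plain strings) as cls_token=..., sep_token=..., etc. when
    # constructing a tokenizer and calling save_pretrained() yields a file of this shape.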
tokenizer.py ADDED
@@ -0,0 +1,271 @@
+ from pathlib import Path
+ import json
+ from transformers import PreTrainedTokenizer
+
+ # syllabify.py
+
+ # Define the set of Greek consonants
+ CONSONANTS = set('βγδθκπτφχλρσμν')
+
+ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+     """
+     Generate token_type_ids to distinguish sequences if token_ids_1 is given.
+     RoBERTa doesn't use token_type_ids, so we set them all to 0.
+     """
+     if token_ids_1 is None:
+         return [0] * (len(token_ids_0) + 2)  # +2 for CLS and SEP
+     return [0] * (len(token_ids_0) + 2) + [0] * (len(token_ids_1) + 1)
+
+ def syllabify(tokens):
+     """
+     Given a list of Greek tokens (letters or diphthongs), returns a list of syllables.
+     Each syllable is a list of tokens.
+
+     The syllabification follows these rules:
+     - A syllable must have a vowel (or diphthong) as its nucleus.
+     - A single consonant preceding a vowel is considered onset of that syllable.
+     - If there are multiple consonants between vowels, the first consonant is attached as coda
+       to the preceding syllable, and the remaining form the onset of the following syllable.
+     - Any trailing consonants are attached to the last syllable.
+     """
+     syllables = []
+     i = 0
+     n = len(tokens)
+
+     while i < n:
+         current = []
+
+         # Collect initial consonants for the syllable onset.
+         while i < n and tokens[i] in CONSONANTS:
+             current.append(tokens[i])
+             i += 1
+
+         # If no vowel is encountered, attach remaining consonants to previous syllable if available.
+         if i >= n:
+             if syllables:
+                 syllables[-1].extend(current)
+             else:
+                 syllables.append(current)
+             break
+
+         # Add the vowel (nucleus) to the current syllable.
+         current.append(tokens[i])
+         i += 1
+
+         # Look ahead to count following consonants until the next vowel.
+         start = i
+         count = 0
+         while i < n and tokens[i] in CONSONANTS:
+             count += 1
+             i += 1
+
+         if count == 0:
+             # No following consonants: the current syllable is complete.
+             syllables.append(current)
+         elif count == 1:
+             # A single consonant between vowels goes with the following syllable.
+             syllables.append(current)
+             # "Un-read" the single consonant so it will start the next syllable.
+             i = start
+         else:
+             # For two or more consonants, attach the first to the current syllable as coda,
+             # and let the remaining consonant(s) start the next syllable.
+             current.append(tokens[start])
+             syllables.append(current)
+             i = start + 1  # Process remaining consonants in the next iteration.
+
+     return syllables
+
+ def syllabify_joined(tokens):
+     """
+     Convenience function that returns syllables as joined strings instead of lists.
+     """
+     syllable_lists = syllabify(tokens)
+     return [''.join(syl) for syl in syllable_lists]
+
+ if __name__ == '__main__':
+     # Test the syllabification with sample input.
+     test_tokens = ['σ', 'τ', 'έ', 'ρ', 'κ', 'σ', 'α', 'σ', 'ἀ', 'ν', 'έ', 'χ', 'ει', 'θ', 'ού', 'ρ', 'ι', 'ο', 'σ', 'αἴ', 'α', 'σ']
+
+     print("Syllabified (as lists):")
+     syllable_lists = syllabify(test_tokens)
+     for syl in syllable_lists:
+         print(syl)
+
+     print("\nSyllabified (joined strings):")
+     print(syllabify_joined(test_tokens))
+
+
+ import re
+ import unicodedata
+
+ # === 1. Oxia → Tonos replacements ===
+ OXIA_TO_TONOS = {
+     "ά": "ά",  # U+1F71 → U+03AC (alpha)
+     "έ": "έ",  # U+1F73 → U+03AD (epsilon)
+     "ή": "ή",  # U+1F75 → U+03AE (eta)
+     "ί": "ί",  # U+1F77 → U+03AF (iota)
+     "ύ": "ύ",  # U+1F7B → U+03CD (upsilon)
+     "ό": "ό",  # U+1F79 → U+03CC (omicron)
+     "ώ": "ώ",  # U+1F7D → U+03CE (omega)
+ }
+
+ # === 2. Define diphthong components ===
+ diphth_y = {'α', 'ε', 'η', 'ο'}
+ upsilon_forms = {'ὐ','ὔ','υ','ὑ','ύ','ὖ','ῦ','ὕ','ὗ','ὺ','ὒ','ὓ'}
+
+ diphth_i = {'α', 'ε', 'ο', 'υ'}
+ iota_forms = {'ἰ','ί','ι','ῖ','ἴ','ἶ','ἵ','ἱ','ἷ','ὶ','ἲ','ἳ'}
+
+ adscr_i_first = {'α','η','ω','ἀ','ἠ','ὠ','ἁ','ἡ','ὡ','ά','ή','ώ','ὰ','ὴ','ὼ','ᾶ','ῆ','ῶ',
+                  'ὤ','ὥ','ὢ','ὣ','ἄ','ἅ','ἂ','ἃ','ἤ','ἥ','ἣ','ἢ','ἦ','ἧ','ἆ','ἇ','ὧ','ὦ'}
+ adscr_i_second = {'ι'}
+
+ # === 3. Character expansion and diphthong handling ===
+ def process_word(word):
+     expanded = []
+     for char in word:
+         if char == 'ζ':
+             expanded.extend(['δ', 'σ'])
+         elif char == 'ς':
+             expanded.append('σ')
+         elif char == 'ῥ':
+             expanded.append('ρ')
+         elif char == 'ξ':
+             expanded.extend(['κ', 'σ'])
+         elif char == 'ψ':
+             expanded.extend(['π', 'σ'])
+         else:
+             expanded.append(char)
+
+     combined = []
+     i = 0
+     while i < len(expanded):
+         a = expanded[i]
+         b = expanded[i+1] if i + 1 < len(expanded) else ''
+
+         if a in diphth_y and b in upsilon_forms:
+             combined.append(a + b)
+             i += 2
+         elif a in diphth_i and b in iota_forms:
+             combined.append(a + b)
+             i += 2
+         elif a in adscr_i_first and b in adscr_i_second:
+             combined.append(a + b)
+             i += 2
+         else:
+             combined.append(a)
+             i += 1
+
+     return combined
+
+ def replace_oxia_with_tonos(text):
+     return ''.join(OXIA_TO_TONOS.get(ch, ch) for ch in text)
+
+ def preprocess_greek_line(line):
+     # Step 1: Normalize oxia → tonos
+     line = replace_oxia_with_tonos(line)
+
+     # Step 2: Extract Greek words
+     words = re.findall(
+         r"[ΆΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"
+         r"ἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎ"
+         r"ἐἑἒἓἔἕἘἙἜἝ"
+         r"ἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮ"
+         r"ἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾ"
+         r"ὀὁὂὃὄὅὈὉὊὋὌὍ"
+         r"ὐὑὒὓὔὕὖὗὙὛὝ"
+         r"ὠὡὢὣὤὥὦὧὨὩὪὫὬὭὮὯ"
+         r"ὰὲὴὶὸὺὼᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍ"
+         r"ᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝ"
+         r"ᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯ"
+         r"ᾲᾳᾴᾶᾷῂῃῄῆῇῒῖῗῢῤῥῦῧῬῲῳῴῶῷ]+",
+         line.lower()
+     )
+
+     # Step 3: Tokenize & flatten
+     token_lists = [process_word(word) for word in words]
+     return [token for tokens in token_lists for token in tokens]
+
+
+ class GreekSyllableTokenizer(PreTrainedTokenizer):
+     vocab_files_names = {"vocab_file": "vocab.json"}
+
+     def __init__(self, vocab_file: str, **kwargs):
+         # --- 1. load the vocabulary -----------------------------------
+         with Path(vocab_file).open(encoding="utf-8") as f:
+             self.vocab = json.load(f)
+         self.ids_to_tokens = {idx: tok for tok, idx in self.vocab.items()}
+
+         # --- 2. set default specials if they were not passed in kwargs --
+         kwargs.setdefault("pad_token", "[PAD]")
+         kwargs.setdefault("unk_token", "[UNK]")
+         kwargs.setdefault("bos_token", "[CLS]")
+         kwargs.setdefault("eos_token", "[SEP]")
+         kwargs.setdefault("cls_token", "[CLS]")
+         kwargs.setdefault("sep_token", "[SEP]")
+         kwargs.setdefault("mask_token", "[MASK]")
+
+         # make sure the special tokens exist in the vocab with the right id order
+         for sp in [kwargs["bos_token"], kwargs["eos_token"],
+                    kwargs["unk_token"], kwargs["pad_token"], kwargs["mask_token"]]:
+             if sp not in self.vocab:
+                 self.vocab[sp] = len(self.vocab)
+                 self.ids_to_tokens[self.vocab[sp]] = sp
+
+         # --- 3. initialize the base class once, without duplicates ---------
+         super().__init__(**kwargs)
+
+     # ---------- required hooks -------------------------------
+     def _tokenize(self, text):
+         return syllabify_joined(preprocess_greek_line(text))
+
+     def _convert_token_to_id(self, token):
+         return self.vocab.get(token, self.vocab[self.unk_token])
+
+     def _convert_id_to_token(self, idx):
+         return self.ids_to_tokens.get(idx, self.unk_token)
+
+     # ---------- ADD CLS/SEP AUTOMATICALLY -----------------------
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         """
+         [CLS] tokens_0 [SEP]                  (single sequence)
+         [CLS] tokens_0 [SEP] tokens_1 [SEP]   (sequence pair)
+         """
+         if token_ids_1 is None:
+             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+         return ([self.cls_token_id] +
+                 token_ids_0 +
+                 [self.sep_token_id] +
+                 token_ids_1 +
+                 [self.sep_token_id])
+
+     def get_special_tokens_mask(self,
+                                 token_ids_0,
+                                 token_ids_1=None,
+                                 already_has_special_tokens=False):
+         if already_has_special_tokens:
+             return [
+                 1 if tid in (self.cls_token_id, self.sep_token_id) else 0
+                 for tid in (token_ids_0 + (token_ids_1 or []))
+             ]
+         if token_ids_1 is None:
+             return [1] + [0]*len(token_ids_0) + [1]
+         return [1] + [0]*len(token_ids_0) + [1] + [0]*len(token_ids_1) + [1]
+
+     def save_vocabulary(self, save_directory, filename_prefix=None):
+         path = Path(save_directory) / (("" if filename_prefix is None else filename_prefix) + "vocab.json")
+         path.parent.mkdir(parents=True, exist_ok=True)
+         with path.open("w", encoding="utf-8") as f:
+             json.dump(
+                 {str(k): v for k, v in self.vocab.items()},  # <- fix here
+                 f,
+                 ensure_ascii=False,
+                 indent=2
+             )
+         return (str(path),)
+
+     def get_vocab(self):
+         return self.vocab
+
+     @property
+     def vocab_size(self):
+         return len(self.vocab)
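A minimal usage sketch (not part of the commit), assuming tokenizer.py and vocab.json sit in the working directory; the sample line is illustrative, and syllables missing from vocab.json simply map to [UNK]:

    from tokenizer import GreekSyllableTokenizer, preprocess_greek_line, syllabify_joined

    tok = GreekSyllableTokenizer(vocab_file="vocab.json")

    # Preprocessing + syllabification on their own:
    print(syllabify_joined(preprocess_greek_line("μῆνιν ἄειδε θεά")))

    # Full tokenizer call: [CLS] and [SEP] are added by build_inputs_with_special_tokens.
    enc = tok("μῆνιν ἄειδε θεά")
    print(tok.convert_ids_to_tokens(enc["input_ids"]))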
tokenizer_config.json ADDED
@@ -0,0 +1,61 @@
+ {
+   "added_tokens_decoder": {
+     "42037": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "42038": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "42039": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "42040": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "42041": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenizer.GreekSyllableTokenizer",
+       null
+     ]
+   },
+   "bos_token": "[CLS]",
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 514,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "tokenizer_class": "GreekSyllableTokenizer",
+   "unk_token": "[UNK]"
+ }
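The auto_map entry is what lets AutoTokenizer resolve the custom class defined in tokenizer.py. A loading sketch (the repository id below is a placeholder, not the actual repo name):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(
        "Ericu950/greek-syllable-tokenizer",  # placeholder id; any repo containing these four files
        trust_remote_code=True,               # required so tokenizer.GreekSyllableTokenizer is imported
    )
    print(type(tok).__name__)       # GreekSyllableTokenizer
    print(tok.model_max_length)     # 514, taken from this config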
vocab.json ADDED
The diff for this file is too large to render. See raw diff