# syllabify.py

from pathlib import Path
import json
import re
import unicodedata

from transformers import PreTrainedTokenizer

# Set of Greek consonants as they appear after process_word() below has run:
# ζ, ξ, ψ are expanded into two letters and ς is normalized to σ, so those
# characters do not need to be listed here.
CONSONANTS = set('βγδθκπτφχλρσμν')


def syllabify(tokens):
    """
    Given a list of Greek tokens (letters or diphthongs), return a list of
    syllables. Each syllable is a list of tokens.

    The syllabification follows these rules:
    - A syllable must have a vowel (or diphthong) as its nucleus.
    - A single consonant preceding a vowel is the onset of that syllable.
    - If there are multiple consonants between vowels, the first consonant is
      attached as coda to the preceding syllable, and the remaining ones form
      the onset of the following syllable.
    - Any trailing consonants are attached to the last syllable.
    """
    syllables = []
    i = 0
    n = len(tokens)
    while i < n:
        current = []
        # Collect initial consonants for the syllable onset.
        while i < n and tokens[i] in CONSONANTS:
            current.append(tokens[i])
            i += 1
        # If no vowel is encountered, attach the remaining consonants to the
        # previous syllable if there is one.
        if i >= n:
            if syllables:
                syllables[-1].extend(current)
            else:
                syllables.append(current)
            break
        # Add the vowel (nucleus) to the current syllable.
        current.append(tokens[i])
        i += 1
        # Look ahead and count the consonants before the next vowel.
        start = i
        count = 0
        while i < n and tokens[i] in CONSONANTS:
            count += 1
            i += 1
        if count == 0:
            # No following consonants: the current syllable is complete.
            syllables.append(current)
        elif count == 1:
            # A single consonant between vowels goes with the following syllable.
            syllables.append(current)
            # "Un-read" the single consonant so it starts the next syllable.
            i = start
        else:
            # For two or more consonants, attach the first to the current
            # syllable as coda and let the remaining consonant(s) start the
            # next syllable.
            current.append(tokens[start])
            syllables.append(current)
            i = start + 1  # Process the remaining consonants in the next iteration.
    return syllables


def syllabify_joined(tokens):
    """
    Convenience function that returns syllables as joined strings instead of lists.
    """
    syllable_lists = syllabify(tokens)
    return [''.join(syl) for syl in syllable_lists]


if __name__ == '__main__':
    # Test the syllabification with sample input.
    test_tokens = ['σ', 'τ', 'έ', 'ρ', 'κ', 'σ', 'α', 'σ', 'ἀ', 'ν', 'έ', 'χ', 'ει',
                   'θ', 'ού', 'ρ', 'ι', 'ο', 'σ', 'αἴ', 'α', 'σ']
    print("Syllabified (as lists):")
    syllable_lists = syllabify(test_tokens)
    for syl in syllable_lists:
        print(syl)
    print("\nSyllabified (joined strings):")
    print(syllabify_joined(test_tokens))


# === 1. Oxia → Tonos replacements ===
# Written as explicit escapes because the oxia and tonos forms are visually
# identical; the code points match the pairs given in the original comments.
OXIA_TO_TONOS = {
    "\u1F71": "\u03AC",  # U+1F71 → U+03AC (alpha)
    "\u1F73": "\u03AD",  # U+1F73 → U+03AD (epsilon)
    "\u1F75": "\u03AE",  # U+1F75 → U+03AE (eta)
    "\u1F77": "\u03AF",  # U+1F77 → U+03AF (iota)
    "\u1F7B": "\u03CD",  # U+1F7B → U+03CD (upsilon)
    "\u1F79": "\u03CC",  # U+1F79 → U+03CC (omicron)
    "\u1F7D": "\u03CE",  # U+1F7D → U+03CE (omega)
}
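
# ---------------------------------------------------------------------------
# Aside (illustrative addition, not part of the original pipeline): the seven
# oxia code points above are canonical singletons in Unicode, so NFC
# normalization also maps them to the tonos forms. The explicit table is kept
# because it is self-documenting; note that NFC normalizes the whole string,
# not only these seven characters.
def _oxia_to_tonos_nfc(text: str) -> str:
    """Map oxia-accented vowels to their tonos forms via Unicode NFC."""
    return unicodedata.normalize("NFC", text)
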
# === 2. Define diphthong components ===
diphth_y = {'α', 'ε', 'η', 'ο'}
upsilon_forms = {'ὐ', 'ὔ', 'υ', 'ὑ', 'ύ', 'ὖ', 'ῦ', 'ὕ', 'ὗ', 'ὺ', 'ὒ', 'ὓ'}
diphth_i = {'α', 'ε', 'ο', 'υ'}
iota_forms = {'ἰ', 'ί', 'ι', 'ῖ', 'ἴ', 'ἶ', 'ἵ', 'ἱ', 'ἷ', 'ὶ', 'ἲ', 'ἳ'}
adscr_i_first = {'α', 'η', 'ω', 'ἀ', 'ἠ', 'ὠ', 'ἁ', 'ἡ', 'ὡ', 'ά', 'ή', 'ώ',
                 'ὰ', 'ὴ', 'ὼ', 'ᾶ', 'ῆ', 'ῶ', 'ὤ', 'ὥ', 'ὢ', 'ὣ', 'ἄ', 'ἅ',
                 'ἂ', 'ἃ', 'ἤ', 'ἥ', 'ἣ', 'ἢ', 'ἦ', 'ἧ', 'ἆ', 'ἇ', 'ὧ', 'ὦ'}
adscr_i_second = {'ι'}


# === 3. Character expansion and diphthong handling ===
def process_word(word):
    # Expand double consonants and normalize variant letter forms.
    expanded = []
    for char in word:
        if char == 'ζ':
            expanded.extend(['δ', 'σ'])
        elif char == 'ς':
            expanded.append('σ')
        elif char == 'ῥ':
            expanded.append('ρ')
        elif char == 'ξ':
            expanded.extend(['κ', 'σ'])
        elif char == 'ψ':
            expanded.extend(['π', 'σ'])
        else:
            expanded.append(char)

    # Combine adjacent vowels into diphthongs (including iota adscript).
    combined = []
    i = 0
    while i < len(expanded):
        a = expanded[i]
        b = expanded[i + 1] if i + 1 < len(expanded) else ''
        if a in diphth_y and b in upsilon_forms:
            combined.append(a + b)
            i += 2
        elif a in diphth_i and b in iota_forms:
            combined.append(a + b)
            i += 2
        elif a in adscr_i_first and b in adscr_i_second:
            combined.append(a + b)
            i += 2
        else:
            combined.append(a)
            i += 1
    return combined


def replace_oxia_with_tonos(text):
    return ''.join(OXIA_TO_TONOS.get(ch, ch) for ch in text)


def preprocess_greek_line(line):
    # Step 1: Normalize oxia → tonos.
    line = replace_oxia_with_tonos(line)

    # Step 2: Extract Greek words.
    words = re.findall(
        r"[ΆΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"
        r"ἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎ"
        r"ἐἑἒἓἔἕἘἙἜἝ"
        r"ἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮ"
        r"ἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾ"
        r"ὀὁὂὃὄὅὈὉὊὋὌὍ"
        r"ὐὑὒὓὔὕὖὗὙὛὝ"
        r"ὠὡὢὣὤὥὦὧὨὩὪὫὬὭὮὯ"
        r"ὰὲὴὶὸὺὼᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍ"
        r"ᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝ"
        r"ᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯ"
        r"ᾲᾳᾴᾶᾷῂῃῄῆῇῒῖῗῢῤῥῦῧῬῲῳῴῶῷ]+",
        line.lower()
    )

    # Step 3: Tokenize each word and flatten the result.
    token_lists = [process_word(word) for word in words]
    return [token for tokens in token_lists for token in tokens]


class GreekSyllableTokenizer(PreTrainedTokenizer):
    vocab_files_names = {"vocab_file": "vocab.json"}

    def __init__(self, vocab_file: str, **kwargs):
        # --- 1. Load the vocabulary ----------------------------------
        with Path(vocab_file).open(encoding="utf-8") as f:
            self.vocab = json.load(f)
        self.ids_to_tokens = {idx: tok for tok, idx in self.vocab.items()}

        # --- 2. Set default special tokens unless passed via kwargs --
        kwargs.setdefault("pad_token", "[PAD]")
        kwargs.setdefault("unk_token", "[UNK]")
        kwargs.setdefault("bos_token", "[CLS]")
        kwargs.setdefault("eos_token", "[SEP]")
        kwargs.setdefault("cls_token", "[CLS]")
        kwargs.setdefault("sep_token", "[SEP]")
        kwargs.setdefault("mask_token", "[MASK]")

        # Make sure the special tokens exist in the vocab with consistent ids.
        for sp in [kwargs["bos_token"], kwargs["eos_token"], kwargs["unk_token"],
                   kwargs["pad_token"], kwargs["mask_token"]]:
            if sp not in self.vocab:
                self.vocab[sp] = len(self.vocab)
                self.ids_to_tokens[self.vocab[sp]] = sp
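        # Note (added explanation): recent transformers releases resolve the
        # special tokens during PreTrainedTokenizer.__init__ through
        # get_vocab()/convert_tokens_to_ids, so the vocabulary must already be
        # loaded (step 1) and contain the specials (step 2) before the base
        # class is initialized below.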
        # --- 3. Initialize the base class once, without duplicates ---
        super().__init__(**kwargs)

    # ---------- required hooks --------------------------------------
    def _tokenize(self, text):
        return syllabify_joined(preprocess_greek_line(text))

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, idx):
        return self.ids_to_tokens.get(idx, self.unk_token)

    # ---------- ADD CLS/SEP AUTOMATICALLY ---------------------------
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        [CLS] tokens_0 [SEP]                    (single sequence)
        [CLS] tokens_0 [SEP] tokens_1 [SEP]     (sequence pair)
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        return ([self.cls_token_id] + token_ids_0 + [self.sep_token_id]
                + token_ids_1 + [self.sep_token_id])

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Generate token_type_ids to distinguish the sequences when token_ids_1
        is given. RoBERTa-style models do not use token_type_ids, so every
        position gets 0.
        """
        if token_ids_1 is None:
            return [0] * (len(token_ids_0) + 2)  # +2 for CLS and SEP
        return [0] * (len(token_ids_0) + 2) + [0] * (len(token_ids_1) + 1)

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None,
                                already_has_special_tokens=False):
        if already_has_special_tokens:
            return [
                1 if tid in (self.cls_token_id, self.sep_token_id) else 0
                for tid in (token_ids_0 + (token_ids_1 or []))
            ]
        if token_ids_1 is None:
            return [1] + [0] * len(token_ids_0) + [1]
        return [1] + [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]

    def save_vocabulary(self, save_directory, filename_prefix=None):
        path = Path(save_directory) / (("" if filename_prefix is None else filename_prefix)
                                       + "vocab.json")
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("w", encoding="utf-8") as f:
            json.dump(
                {str(k): v for k, v in self.vocab.items()},  # ensure plain string keys
                f, ensure_ascii=False, indent=2
            )
        return (str(path),)

    def get_vocab(self):
        return self.vocab

    @property
    def vocab_size(self):
        return len(self.vocab)
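

# ---------------------------------------------------------------------------
# Usage sketch (an illustrative addition, not part of the original module).
# It assumes a local "vocab.json" mapping syllable strings to integer ids;
# the file name, the sample line, and this helper are hypothetical, and the
# function is not called automatically.
def demo_tokenizer(vocab_path: str = "vocab.json") -> None:
    """Load the tokenizer from a vocab file and round-trip one sample line."""
    tokenizer = GreekSyllableTokenizer(vocab_file=vocab_path)
    line = "μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος"
    encoding = tokenizer(line)
    print(encoding["input_ids"])
    print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))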