# File size: 10,195 Bytes
from pathlib import Path
import json
from transformers import PreTrainedTokenizer

# syllabify.py

# Greek letters that syllabify() treats as consonants; any token that is not
# in this set is taken to be a syllable nucleus. ζ, ξ, ψ and final ς are
# absent on purpose: process_word() rewrites them into members of this set.
CONSONANTS = {
    'β', 'γ', 'δ', 'θ', 'κ', 'π', 'τ',
    'φ', 'χ', 'λ', 'ρ', 'σ', 'μ', 'ν',
}

def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
    """
    Return an all-zero token_type_ids list sized for the sequence(s) plus
    their special tokens.

    A single sequence is laid out as CLS + ids + SEP (two extra slots); a
    pair adds the second sequence and one more SEP. Every position is 0
    because RoBERTa does not use token_type_ids.

    NOTE: this is a module-level function; ``self`` is accepted but never
    read.
    """
    total = len(token_ids_0) + 2  # CLS ... SEP
    if token_ids_1 is not None:
        total += len(token_ids_1) + 1  # second segment + trailing SEP
    return [0] * total
    
def syllabify(tokens):
    """
    Split a sequence of Greek tokens (single letters or diphthongs) into
    syllables, returning a list of token lists.

    Rules applied:
    - Every syllable is built around one nucleus: the first token after the
      onset that is not in CONSONANTS.
    - Consonants directly before a nucleus form that syllable's onset.
    - Zero or one consonant after the nucleus: the syllable ends at the
      nucleus and the consonant (if any) opens the next syllable.
    - Two or more consonants after the nucleus: the first one closes the
      current syllable, the rest open the next one.
    - Consonants left over at the end with no nucleus are appended to the
      last syllable (or form a lone syllable if there is none).
    """
    result = []
    total = len(tokens)
    pos = 0

    while pos < total:
        # Onset: every consonant up to the next nucleus.
        nucleus_at = pos
        while nucleus_at < total and tokens[nucleus_at] in CONSONANTS:
            nucleus_at += 1
        syllable = list(tokens[pos:nucleus_at])

        # Ran out of tokens without finding a nucleus.
        if nucleus_at >= total:
            if result:
                result[-1].extend(syllable)
            else:
                result.append(syllable)
            break

        syllable.append(tokens[nucleus_at])

        # Measure the consonant cluster that follows the nucleus.
        cluster_from = nucleus_at + 1
        cluster_to = cluster_from
        while cluster_to < total and tokens[cluster_to] in CONSONANTS:
            cluster_to += 1

        if cluster_to - cluster_from >= 2:
            # Cluster: first consonant is the coda, the rest are re-read as
            # the onset of the next syllable.
            syllable.append(tokens[cluster_from])
            pos = cluster_from + 1
        else:
            # Nothing, or a single consonant that belongs to the next syllable.
            pos = cluster_from

        result.append(syllable)

    return result

def syllabify_joined(tokens):
    """
    Syllabify ``tokens`` and return each syllable as one concatenated string
    rather than as a list of its tokens.
    """
    return list(map(''.join, syllabify(tokens)))

if __name__ == '__main__':
    # Small manual demo: print the syllables of a fixed token sequence, first
    # as token lists and then as joined strings.
    test_tokens = ['σ', 'τ', 'έ', 'ρ', 'κ', 'σ', 'α', 'σ', 'ἀ', 'ν', 'έ', 'χ', 'ει', 'θ', 'ού', 'ρ', 'ι', 'ο', 'σ', 'αἴ', 'α', 'σ']

    print("Syllabified (as lists):")
    for syllable in syllabify(test_tokens):
        print(syllable)

    print("\nSyllabified (joined strings):")
    print(syllabify_joined(test_tokens))


import re
import unicodedata

# === 1. Oxia → Tonos replacements ===
# Maps each Greek Extended "oxia" letter to its canonically equivalent
# monotonic "tonos" letter. The two render identically, so the code points
# are written as escapes: literal characters would be silently collapsed into
# an identity mapping if this file were ever saved with NFC normalisation.
OXIA_TO_TONOS = {
    # lowercase
    "\u1f71": "\u03ac",  # alpha
    "\u1f73": "\u03ad",  # epsilon
    "\u1f75": "\u03ae",  # eta
    "\u1f77": "\u03af",  # iota
    "\u1f7b": "\u03cd",  # upsilon
    "\u1f79": "\u03cc",  # omicron
    "\u1f7d": "\u03ce",  # omega
    # lowercase with dialytika
    "\u1fd3": "\u0390",  # iota with dialytika
    "\u1fe3": "\u03b0",  # upsilon with dialytika
    # uppercase (so that lowercasing afterwards yields the tonos letter)
    "\u1fbb": "\u0386",  # Alpha
    "\u1fc9": "\u0388",  # Epsilon
    "\u1fcb": "\u0389",  # Eta
    "\u1fdb": "\u038a",  # Iota
    "\u1feb": "\u038e",  # Upsilon
    "\u1ff9": "\u038c",  # Omicron
    "\u1ffb": "\u038f",  # Omega
}

# === 2. Define diphthong components ===
# process_word() merges two adjacent characters into one token when the first
# is in a "first element" set and the second is in the matching "second
# element" set. The pairs are tried in the order listed here.

# Upsilon diphthongs: an unaccented first vowel followed by any listed form
# of upsilon.
diphth_y = {'α', 'ε', 'η', 'ο'}
upsilon_forms = {'ὐ','ὔ','υ','ὑ','ύ','ὖ','ῦ','ὕ','ὗ','ὺ','ὒ','ὓ'}

# Iota diphthongs: an unaccented first vowel followed by any listed form of
# iota.
diphth_i = {'α', 'ε', 'ο', 'υ'}
iota_forms = {'ἰ','ί','ι','ῖ','ἴ','ἶ','ἵ','ἱ','ἷ','ὶ','ἲ','ἳ'}

# Adscript iota: any listed form of alpha/eta/omega followed by a plain iota.
adscr_i_first = {'α','η','ω','ἀ','ἠ','ὠ','ἁ','ἡ','ὡ','ά','ή','ώ','ὰ','ὴ','ὼ','ᾶ','ῆ','ῶ',
                 'ὤ','ὥ','ὢ','ὣ','ἄ','ἅ','ἂ','ἃ','ἤ','ἥ','ἣ','ἢ','ἦ','ἧ','ἆ','ἇ','ὧ','ὦ'}
adscr_i_second = {'ι'}

# === 3. Character expansion and diphthong handling ===
def process_word(word):
    """
    Turn one Greek word into a list of letter/diphthong tokens.

    Two passes are made:
    1. Each character is expanded or folded: the double consonants ζ, ξ, ψ
       become two tokens (δσ, κσ, πσ), final sigma becomes σ, and rho with a
       breathing mark (either breathing) becomes plain ρ.
    2. Adjacent tokens that form an upsilon diphthong, an iota diphthong or
       a vowel + adscript iota are merged into a single two-character token.

    Args:
        word: a string of (lowercase) Greek characters.

    Returns:
        A list of strings, each one letter or one merged pair.
    """
    # Replacement tokens per character. Rho with psili (U+1FE4) is folded as
    # well as rho with dasia (U+1FE5); otherwise it would not be recognised
    # as a consonant later on.
    expansions = {
        'ζ': ('δ', 'σ'),
        'ξ': ('κ', 'σ'),
        'ψ': ('π', 'σ'),
        'ς': ('σ',),
        '\u1fe5': ('ρ',),  # rho with dasia
        '\u1fe4': ('ρ',),  # rho with psili
    }

    expanded = []
    for char in word:
        expanded.extend(expansions.get(char, (char,)))

    combined = []
    i = 0
    n = len(expanded)
    while i < n:
        a = expanded[i]
        # '' is in none of the sets, so the last token is never merged.
        b = expanded[i + 1] if i + 1 < n else ''

        forms_pair = (
            (a in diphth_y and b in upsilon_forms)
            or (a in diphth_i and b in iota_forms)
            or (a in adscr_i_first and b in adscr_i_second)
        )
        if forms_pair:
            combined.append(a + b)
            i += 2
        else:
            combined.append(a)
            i += 1

    return combined
def replace_oxia_with_tonos(text):
    """Return ``text`` with every character listed in OXIA_TO_TONOS swapped
    for its mapped replacement; all other characters pass through unchanged."""
    lookup = OXIA_TO_TONOS.get
    pieces = [lookup(ch, ch) for ch in text]
    return ''.join(pieces)
    
def preprocess_greek_line(line):
    """
    Convert a line of text into a flat list of Greek letter/diphthong tokens.

    The line is lowercased, oxia-accented letters are replaced by their
    tonos equivalents, maximal runs of Greek letters are extracted as words
    (everything else acts as a separator and is dropped), and each word is
    tokenised with process_word().

    Args:
        line: the input text.

    Returns:
        A single list with the tokens of all words, in order. Word
        boundaries are not preserved.
    """
    # Step 1: Lowercase, then normalize oxia → tonos.
    # Lowercasing must come first: a capital letter with oxia lowercases to a
    # lowercase letter with oxia, which the pattern below does not accept.
    line = replace_oxia_with_tonos(line.lower())

    # Step 2: Extract Greek words
    words = re.findall(
        r"[ΆΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"
        r"ἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎ"
        r"ἐἑἒἓἔἕἘἙἜἝ"
        r"ἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮ"
        r"ἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾ"
        r"ὀὁὂὃὄὅὈὉὊὋὌὍ"
        r"ὐὑὒὓὔὕὖὗὙὛὝ"
        r"ὠὡὢὣὤὥὦὧὨὩὪὫὬὭὮὯ"
        r"ὰὲὴὶὸὺὼᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍ"
        r"ᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝ"
        r"ᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯ"
        r"ᾲᾳᾴᾶᾷῂῃῄῆῇῒῖῗῢῤῥῦῧῬῲῳῴῶῷ]+",
        line
    )

    # Step 3: Tokenize & flatten
    return [token for word in words for token in process_word(word)]


class GreekSyllableTokenizer(PreTrainedTokenizer):
    """
    Slow tokenizer whose tokens are Greek syllables.

    Input text is reduced to Greek letter/diphthong tokens by
    ``preprocess_greek_line`` and grouped into syllable strings by
    ``syllabify_joined``. Syllables are mapped to ids through a JSON
    vocabulary file (token -> id); unknown syllables map to the id of the
    unknown token.
    """

    vocab_files_names = {"vocab_file": "vocab.json"}

    def __init__(self, vocab_file: str, **kwargs):
        """
        Load the vocabulary and initialise the base tokenizer.

        Args:
            vocab_file: path to a JSON file holding a token -> id mapping.
            **kwargs: forwarded to ``PreTrainedTokenizer``; missing special
                tokens default to [PAD], [UNK], [CLS], [SEP] and [MASK].
        """
        # --- 1. load the vocabulary ----------------------------------
        with Path(vocab_file).open(encoding="utf-8") as f:
            self.vocab = json.load(f)
        self.ids_to_tokens = {idx: tok for tok, idx in self.vocab.items()}

        # --- 2. set default specials unless supplied in kwargs --------
        kwargs.setdefault("pad_token",  "[PAD]")
        kwargs.setdefault("unk_token",  "[UNK]")
        kwargs.setdefault("bos_token",  "[CLS]")
        kwargs.setdefault("eos_token",  "[SEP]")
        kwargs.setdefault("cls_token",  "[CLS]")
        kwargs.setdefault("sep_token",  "[SEP]")
        kwargs.setdefault("mask_token", "[MASK]")

        # Make sure every special token has an id. Tokens are compared and
        # stored as plain strings: a special token may arrive as a non-str
        # token object, which would not match the string keys loaded from
        # JSON and would otherwise be inserted again under a fresh id.
        # cls/sep come last so that ids handed out to the first five names
        # stay the same as before when cls/sep equal bos/eos.
        for name in ("bos_token", "eos_token", "unk_token", "pad_token",
                     "mask_token", "cls_token", "sep_token"):
            special = kwargs[name]
            if special is None:
                continue
            special = str(special)
            if special not in self.vocab:
                new_id = len(self.vocab)
                self.vocab[special] = new_id
                self.ids_to_tokens[new_id] = special

        # --- 3. initialise the base class once, without duplicates ----
        super().__init__(**kwargs)

    # ---------- mandatory hooks -------------------------------------
    def _tokenize(self, text):
        """Split ``text`` into a list of syllable strings."""
        return syllabify_joined(preprocess_greek_line(text))

    def _convert_token_to_id(self, token):
        """Return the id of ``token``, or the unknown-token id if absent."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, idx):
        """Return the token for ``idx``, or the unknown token if absent."""
        return self.ids_to_tokens.get(idx, self.unk_token)

    # ---------- add CLS/SEP automatically ----------------------------
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        [CLS]  tokens_0  [SEP]                      (single sequence)
        [CLS]  tokens_0  [SEP] tokens_1 [SEP]       (sequence pair)
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        return ([self.cls_token_id] +
                token_ids_0 +
                [self.sep_token_id] +
                token_ids_1 +
                [self.sep_token_id])

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Return all-zero token type ids with the same length as the output of
        ``build_inputs_with_special_tokens`` for the same arguments
        (two extra positions for a single sequence, three for a pair).
        """
        if token_ids_1 is None:
            return [0] * (len(token_ids_0) + 2)
        return [0] * (len(token_ids_0) + len(token_ids_1) + 3)

    def get_special_tokens_mask(self,
                                token_ids_0,
                                token_ids_1=None,
                                already_has_special_tokens=False):
        """
        Return a list with 1 at special-token positions and 0 elsewhere.

        With ``already_has_special_tokens`` the given ids are inspected and
        every id in ``self.all_special_ids`` is flagged (not only CLS/SEP,
        so padding and mask tokens are marked too). Otherwise the mask
        mirrors the layout of ``build_inputs_with_special_tokens``.
        """
        if already_has_special_tokens:
            special_ids = set(self.all_special_ids)
            return [
                1 if tid in special_ids else 0
                for tid in (token_ids_0 + (token_ids_1 or []))
            ]
        if token_ids_1 is None:
            return [1] + [0]*len(token_ids_0) + [1]
        return [1] + [0]*len(token_ids_0) + [1] + [0]*len(token_ids_1) + [1]

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Write the vocabulary as JSON to ``save_directory``.

        The file is named ``vocab.json``, with ``filename_prefix`` prepended
        directly when given. Returns a one-element tuple with the path.
        """
        path = Path(save_directory) / (("" if filename_prefix is None else filename_prefix) + "vocab.json")
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("w", encoding="utf-8") as f:
            json.dump(
                # Keys are coerced to str so the mapping is always JSON-serialisable.
                {str(k): v for k, v in self.vocab.items()},
                f,
                ensure_ascii=False,
                indent=2
            )
        return (str(path),)

    def get_vocab(self):
        """Return the token -> id mapping (the internal dict, not a copy)."""
        return self.vocab

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)