Ericu950 committed on
Commit 1989555 · verified · 1 Parent(s): ccecfdf

Upload tokenizer

Files changed (4)
  1. special_tokens_map.json +51 -0
  2. tokenizer.py +271 -0
  3. tokenizer_config.json +61 -0
  4. vocab.json +0 -0
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
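Each entry in this file corresponds to an AddedToken in transformers, with field names matching the JSON keys. A minimal illustration (not part of the commit) of how one of these entries maps onto code:

    from transformers import AddedToken

    # Mirrors the "bos_token"/"cls_token" entries above; the flags match the JSON fields.
    cls = AddedToken("[CLS]", lstrip=False, rstrip=False, normalized=False, single_word=False)
    # Passing such tokens (or plain strings) as cls_token=..., sep_token=..., etc. when
    # constructing a tokenizer and calling save_pretrained() yields a file of this shape.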
tokenizer.py ADDED
@@ -0,0 +1,271 @@
+ from pathlib import Path
+ import json
+ from transformers import PreTrainedTokenizer
+
+ # syllabify.py
+
+ # Define the set of Greek consonants
+ CONSONANTS = set('βγδθκπτφχλρσμν')
+
+ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+     """
+     Generate token_type_ids to distinguish sequences if token_ids_1 is given.
+     RoBERTa doesn't use token_type_ids, so we set them all to 0.
+     """
+     if token_ids_1 is None:
+         return [0] * (len(token_ids_0) + 2)  # +2 for CLS and SEP
+     return [0] * (len(token_ids_0) + 2) + [0] * (len(token_ids_1) + 1)
+
+ def syllabify(tokens):
+     """
+     Given a list of Greek tokens (letters or diphthongs), returns a list of syllables.
+     Each syllable is a list of tokens.
+
+     The syllabification follows these rules:
+     - A syllable must have a vowel (or diphthong) as its nucleus.
+     - A single consonant preceding a vowel is considered onset of that syllable.
+     - If there are multiple consonants between vowels, the first consonant is attached as coda
+       to the preceding syllable, and the remaining form the onset of the following syllable.
+     - Any trailing consonants are attached to the last syllable.
+     """
+     syllables = []
+     i = 0
+     n = len(tokens)
+
+     while i < n:
+         current = []
+
+         # Collect initial consonants for the syllable onset.
+         while i < n and tokens[i] in CONSONANTS:
+             current.append(tokens[i])
+             i += 1
+
+         # If no vowel is encountered, attach remaining consonants to previous syllable if available.
+         if i >= n:
+             if syllables:
+                 syllables[-1].extend(current)
+             else:
+                 syllables.append(current)
+             break
+
+         # Add the vowel (nucleus) to the current syllable.
+         current.append(tokens[i])
+         i += 1
+
+         # Look ahead to count following consonants until the next vowel.
+         start = i
+         count = 0
+         while i < n and tokens[i] in CONSONANTS:
+             count += 1
+             i += 1
+
+         if count == 0:
+             # No following consonants: the current syllable is complete.
+             syllables.append(current)
+         elif count == 1:
+             # A single consonant between vowels goes with the following syllable.
+             syllables.append(current)
+             # "Un-read" the single consonant so it will start the next syllable.
+             i = start
+         else:
+             # For two or more consonants, attach the first to the current syllable as coda,
+             # and let the remaining consonant(s) start the next syllable.
+             current.append(tokens[start])
+             syllables.append(current)
+             i = start + 1  # Process remaining consonants in the next iteration.
+
+     return syllables
+
+ def syllabify_joined(tokens):
+     """
+     Convenience function that returns syllables as joined strings instead of lists.
+     """
+     syllable_lists = syllabify(tokens)
+     return [''.join(syl) for syl in syllable_lists]
+
+ if __name__ == '__main__':
+     # Test the syllabification with sample input.
+     test_tokens = ['σ', 'τ', 'έ', 'ρ', 'κ', 'σ', 'α', 'σ', 'ἀ', 'ν', 'έ', 'χ', 'ει', 'θ', 'ού', 'ρ', 'ι', 'ο', 'σ', 'αἴ', 'α', 'σ']
+
+     print("Syllabified (as lists):")
+     syllable_lists = syllabify(test_tokens)
+     for syl in syllable_lists:
+         print(syl)
+
+     print("\nSyllabified (joined strings):")
+     print(syllabify_joined(test_tokens))
+
+
+ import re
+ import unicodedata
+
+ # === 1. Oxia → Tonos replacements ===
+ OXIA_TO_TONOS = {
+     "ά": "ά",  # U+1F71 → U+03AC (alpha)
+     "έ": "έ",  # U+1F73 → U+03AD (epsilon)
+     "ή": "ή",  # U+1F75 → U+03AE (eta)
+     "ί": "ί",  # U+1F77 → U+03AF (iota)
+     "ύ": "ύ",  # U+1F7B → U+03CD (upsilon)
+     "ό": "ό",  # U+1F79 → U+03CC (omicron)
+     "ώ": "ώ",  # U+1F7D → U+03CE (omega)
+ }
+
+ # === 2. Define diphthong components ===
+ diphth_y = {'α', 'ε', 'η', 'ο'}
+ upsilon_forms = {'ὐ','ὔ','υ','ὑ','ύ','ὖ','ῦ','ὕ','ὗ','ὺ','ὒ','ὓ'}
+
+ diphth_i = {'α', 'ε', 'ο', 'υ'}
+ iota_forms = {'ἰ','ί','ι','ῖ','ἴ','ἶ','ἵ','ἱ','ἷ','ὶ','ἲ','ἳ'}
+
+ adscr_i_first = {'α','η','ω','ἀ','ἠ','ὠ','ἁ','ἡ','ὡ','ά','ή','ώ','ὰ','ὴ','ὼ','ᾶ','ῆ','ῶ',
+                  'ὤ','ὥ','ὢ','ὣ','ἄ','ἅ','ἂ','ἃ','ἤ','ἥ','ἣ','ἢ','ἦ','ἧ','ἆ','ἇ','ὧ','ὦ'}
+ adscr_i_second = {'ι'}
+
+ # === 3. Character expansion and diphthong handling ===
+ def process_word(word):
+     expanded = []
+     for char in word:
+         if char == 'ζ':
+             expanded.extend(['δ', 'σ'])
+         elif char == 'ς':
+             expanded.append('σ')
+         elif char == 'ῥ':
+             expanded.append('ρ')
+         elif char == 'ξ':
+             expanded.extend(['κ', 'σ'])
+         elif char == 'ψ':
+             expanded.extend(['π', 'σ'])
+         else:
+             expanded.append(char)
+
+     combined = []
+     i = 0
+     while i < len(expanded):
+         a = expanded[i]
+         b = expanded[i+1] if i + 1 < len(expanded) else ''
+
+         if a in diphth_y and b in upsilon_forms:
+             combined.append(a + b)
+             i += 2
+         elif a in diphth_i and b in iota_forms:
+             combined.append(a + b)
+             i += 2
+         elif a in adscr_i_first and b in adscr_i_second:
+             combined.append(a + b)
+             i += 2
+         else:
+             combined.append(a)
+             i += 1
+
+     return combined
+
+ def replace_oxia_with_tonos(text):
+     return ''.join(OXIA_TO_TONOS.get(ch, ch) for ch in text)
+
+ def preprocess_greek_line(line):
+     # Step 1: Normalize oxia → tonos
+     line = replace_oxia_with_tonos(line)
+
+     # Step 2: Extract Greek words
+     words = re.findall(
+         r"[ΆΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"
+         r"ἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎ"
+         r"ἐἑἒἓἔἕἘἙἜἝ"
+         r"ἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮ"
+         r"ἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾ"
+         r"ὀὁὂὃὄὅὈὉὊὋὌὍ"
+         r"ὐὑὒὓὔὕὖὗὙὛὝ"
+         r"ὠὡὢὣὤὥὦὧὨὩὪὫὬὭὮὯ"
+         r"ὰὲὴὶὸὺὼᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍ"
+         r"ᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝ"
+         r"ᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯ"
+         r"ᾲᾳᾴᾶᾷῂῃῄῆῇῒῖῗῢῤῥῦῧῬῲῳῴῶῷ]+",
+         line.lower()
+     )
+
+     # Step 3: Tokenize & flatten
+     token_lists = [process_word(word) for word in words]
+     return [token for tokens in token_lists for token in tokens]
+
+
+ class GreekSyllableTokenizer(PreTrainedTokenizer):
+     vocab_files_names = {"vocab_file": "vocab.json"}
+
+     def __init__(self, vocab_file: str, **kwargs):
+         # --- 1. load the vocabulary -----------------------------------
+         with Path(vocab_file).open(encoding="utf-8") as f:
+             self.vocab = json.load(f)
+         self.ids_to_tokens = {idx: tok for tok, idx in self.vocab.items()}
+
+         # --- 2. set default specials if they were not passed in kwargs --
+         kwargs.setdefault("pad_token", "[PAD]")
+         kwargs.setdefault("unk_token", "[UNK]")
+         kwargs.setdefault("bos_token", "[CLS]")
+         kwargs.setdefault("eos_token", "[SEP]")
+         kwargs.setdefault("cls_token", "[CLS]")
+         kwargs.setdefault("sep_token", "[SEP]")
+         kwargs.setdefault("mask_token", "[MASK]")
+
+         # make sure the special tokens exist in the vocab with the right id order
+         for sp in [kwargs["bos_token"], kwargs["eos_token"],
+                    kwargs["unk_token"], kwargs["pad_token"], kwargs["mask_token"]]:
+             if sp not in self.vocab:
+                 self.vocab[sp] = len(self.vocab)
+                 self.ids_to_tokens[self.vocab[sp]] = sp
+
+         # --- 3. initialize the base class once, without duplicates ---------
+         super().__init__(**kwargs)
+
+     # ---------- required hooks -------------------------------
+     def _tokenize(self, text):
+         return syllabify_joined(preprocess_greek_line(text))
+
+     def _convert_token_to_id(self, token):
+         return self.vocab.get(token, self.vocab[self.unk_token])
+
+     def _convert_id_to_token(self, idx):
+         return self.ids_to_tokens.get(idx, self.unk_token)
+
+     # ---------- ADD CLS/SEP AUTOMATICALLY -----------------------
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         """
+         [CLS] tokens_0 [SEP]                  (single sequence)
+         [CLS] tokens_0 [SEP] tokens_1 [SEP]   (sequence pair)
+         """
+         if token_ids_1 is None:
+             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+         return ([self.cls_token_id] +
+                 token_ids_0 +
+                 [self.sep_token_id] +
+                 token_ids_1 +
+                 [self.sep_token_id])
+
+     def get_special_tokens_mask(self,
+                                 token_ids_0,
+                                 token_ids_1=None,
+                                 already_has_special_tokens=False):
+         if already_has_special_tokens:
+             return [
+                 1 if tid in (self.cls_token_id, self.sep_token_id) else 0
+                 for tid in (token_ids_0 + (token_ids_1 or []))
+             ]
+         if token_ids_1 is None:
+             return [1] + [0]*len(token_ids_0) + [1]
+         return [1] + [0]*len(token_ids_0) + [1] + [0]*len(token_ids_1) + [1]
+
+     def save_vocabulary(self, save_directory, filename_prefix=None):
+         path = Path(save_directory) / (("" if filename_prefix is None else filename_prefix) + "vocab.json")
+         path.parent.mkdir(parents=True, exist_ok=True)
+         with path.open("w", encoding="utf-8") as f:
+             json.dump(
+                 {str(k): v for k, v in self.vocab.items()},  # <- fix here
+                 f,
+                 ensure_ascii=False,
+                 indent=2
+             )
+         return (str(path),)
+
+     def get_vocab(self):
+         return self.vocab
+
+     @property
+     def vocab_size(self):
+         return len(self.vocab)
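A minimal usage sketch (not part of the commit), assuming tokenizer.py and vocab.json sit in the working directory; the sample line is illustrative, and syllables missing from vocab.json simply map to [UNK]:

    from tokenizer import GreekSyllableTokenizer, preprocess_greek_line, syllabify_joined

    tok = GreekSyllableTokenizer(vocab_file="vocab.json")

    # Preprocessing + syllabification on their own:
    print(syllabify_joined(preprocess_greek_line("μῆνιν ἄειδε θεά")))

    # Full tokenizer call: [CLS] and [SEP] are added by build_inputs_with_special_tokens.
    enc = tok("μῆνιν ἄειδε θεά")
    print(tok.convert_ids_to_tokens(enc["input_ids"]))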
tokenizer_config.json ADDED
@@ -0,0 +1,61 @@
+ {
+   "added_tokens_decoder": {
+     "42037": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "42038": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "42039": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "42040": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "42041": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenizer.GreekSyllableTokenizer",
+       null
+     ]
+   },
+   "bos_token": "[CLS]",
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 514,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "tokenizer_class": "GreekSyllableTokenizer",
+   "unk_token": "[UNK]"
+ }
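The auto_map entry is what lets AutoTokenizer resolve the custom class defined in tokenizer.py. A loading sketch (the repository id below is a placeholder, not the actual repo name):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(
        "Ericu950/greek-syllable-tokenizer",  # placeholder id; any repo containing these four files
        trust_remote_code=True,               # required so tokenizer.GreekSyllableTokenizer is imported
    )
    print(type(tok).__name__)       # GreekSyllableTokenizer
    print(tok.model_max_length)     # 514, taken from this config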
vocab.json ADDED
The diff for this file is too large to render. See raw diff