|
from pathlib import Path
import json
import re
import unicodedata

from transformers import PreTrainedTokenizer


# Consonant letters recognised by the syllabifier; any other token is treated
# as a vowel or diphthong nucleus.
CONSONANTS = set('βγδθκπτφχλρσμν')
|
|
|
def syllabify(tokens):
    """
    Given a list of Greek tokens (letters or diphthongs), return a list of
    syllables, where each syllable is a list of tokens.

    The syllabification follows these rules:
    - A syllable must have a vowel (or diphthong) as its nucleus.
    - A single consonant preceding a vowel becomes the onset of that syllable.
    - If several consonants occur between vowels, the first is attached as the
      coda of the preceding syllable and the rest form the onset of the next.
    - Any trailing consonants are attached to the last syllable.
    """
    syllables = []
    i = 0
    n = len(tokens)

    while i < n:
        current = []

        # Collect any onset consonants before the next vowel.
        while i < n and tokens[i] in CONSONANTS:
            current.append(tokens[i])
            i += 1

        # Only consonants were left: attach them to the last syllable.
        if i >= n:
            if syllables:
                syllables[-1].extend(current)
            else:
                syllables.append(current)
            break

        # The vowel (or diphthong) nucleus.
        current.append(tokens[i])
        i += 1

        # Count the consonants that follow the nucleus.
        start = i
        count = 0
        while i < n and tokens[i] in CONSONANTS:
            count += 1
            i += 1

        if count == 0:
            # No consonant follows (next token is a vowel, or end of word): open syllable.
            syllables.append(current)
        elif count == 1:
            # A single consonant belongs to the following syllable; rewind so it
            # is re-read as that syllable's onset.
            syllables.append(current)
            i = start
        else:
            # Two or more consonants: the first closes this syllable as its coda,
            # the rest start the next one.
            current.append(tokens[start])
            syllables.append(current)
            i = start + 1

    return syllables
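
# Worked example (illustrative): the word λόγος, once pre-processed into the
# letter tokens ['λ', 'ό', 'γ', 'ο', 'σ'], syllabifies as λό-γοσ:
#
#     syllabify(['λ', 'ό', 'γ', 'ο', 'σ'])
#     -> [['λ', 'ό'], ['γ', 'ο', 'σ']]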
|
|
|
def syllabify_joined(tokens):
    """
    Convenience wrapper that returns syllables as joined strings instead of lists.
    """
    syllable_lists = syllabify(tokens)
    return [''.join(syl) for syl in syllable_lists]
|
|
|
if __name__ == '__main__':
    # Example letter/diphthong tokens in the format produced by preprocess_greek_line.
    test_tokens = ['σ', 'τ', 'έ', 'ρ', 'κ', 'σ', 'α', 'σ', 'ἀ', 'ν', 'έ', 'χ', 'ει',
                   'θ', 'ού', 'ρ', 'ι', 'ο', 'σ', 'αἴ', 'α', 'σ']

    print("Syllabified (as lists):")
    syllable_lists = syllabify(test_tokens)
    for syl in syllable_lists:
        print(syl)

    print("\nSyllabified (joined strings):")
    print(syllabify_joined(test_tokens))
|
|
|
|
|
|
|
|
|
|
# Unicode encodes accented Greek vowels twice: the polytonic "oxia" forms and
# the monotonic "tonos" forms look identical.  Map oxia onto tonos so that each
# accented vowel is represented by a single code point.
OXIA_TO_TONOS = {
    "ά": "ά",
    "έ": "έ",
    "ή": "ή",
    "ί": "ί",
    "ύ": "ύ",
    "ό": "ό",
    "ώ": "ώ",
}
|
|
|
|
|
# First and second members of diphthongs ending in upsilon (αυ, ευ, ηυ, ου).
diphth_y = {'α', 'ε', 'η', 'ο'}
upsilon_forms = {'ὐ','ὔ','υ','ὑ','ύ','ὖ','ῦ','ὕ','ὗ','ὺ','ὒ','ὓ'}

# First and second members of diphthongs ending in iota (αι, ει, οι, υι).
diphth_i = {'α', 'ε', 'ο', 'υ'}
iota_forms = {'ἰ','ί','ι','ῖ','ἴ','ἶ','ἵ','ἱ','ἷ','ὶ','ἲ','ἳ'}

# Vowels that can be followed by an adscript iota, and the iota itself.
adscr_i_first = {'α','η','ω','ἀ','ἠ','ὠ','ἁ','ἡ','ὡ','ά','ή','ώ','ὰ','ὴ','ὼ','ᾶ','ῆ','ῶ',
                 'ὤ','ὥ','ὢ','ὣ','ἄ','ἅ','ἂ','ἃ','ἤ','ἥ','ἣ','ἢ','ἦ','ἧ','ἆ','ἇ','ὧ','ὦ'}
adscr_i_second = {'ι'}
|
|
|
|
|
def process_word(word):
    """
    Expand a single (lower-cased) Greek word into a list of letter tokens,
    then merge adjacent letters that form diphthongs or adscript-iota pairs.
    """
    # Normalise individual letters: expand the double consonants ζ, ξ, ψ and
    # fold positional/diacritic variants (ς, ῥ) onto a single form.
    expanded = []
    for char in word:
        if char == 'ζ':
            expanded.extend(['δ', 'σ'])
        elif char == 'ς':
            expanded.append('σ')
        elif char == 'ῥ':
            expanded.append('ρ')
        elif char == 'ξ':
            expanded.extend(['κ', 'σ'])
        elif char == 'ψ':
            expanded.extend(['π', 'σ'])
        else:
            expanded.append(char)

    # Merge letter pairs that belong to one token (diphthongs, adscript iota).
    combined = []
    i = 0
    while i < len(expanded):
        a = expanded[i]
        b = expanded[i + 1] if i + 1 < len(expanded) else ''

        if a in diphth_y and b in upsilon_forms:
            combined.append(a + b)
            i += 2
        elif a in diphth_i and b in iota_forms:
            combined.append(a + b)
            i += 2
        elif a in adscr_i_first and b in adscr_i_second:
            combined.append(a + b)
            i += 2
        else:
            combined.append(a)
            i += 1

    return combined
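
# Illustrative examples of process_word (traced from the rules above):
#     process_word('παῖς')  -> ['π', 'αῖ', 'σ']                # αι merged into one diphthong token
#     process_word('ξίφος') -> ['κ', 'σ', 'ί', 'φ', 'ο', 'σ']  # ξ expanded to κσ, final ς folded to σ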
|
def replace_oxia_with_tonos(text):
    return ''.join(OXIA_TO_TONOS.get(ch, ch) for ch in text)
|
|
|
def preprocess_greek_line(line):
    """
    Lower-case a line of polytonic Greek, extract the Greek words, and return
    a flat list of letter/diphthong tokens ready for syllabification.
    """
    # Normalise oxia-accented vowels to their tonos equivalents first.
    line = replace_oxia_with_tonos(line)

    # Keep only runs of Greek characters (with and without diacritics).
    words = re.findall(
        r"[ΆΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"
        r"ἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎ"
        r"ἐἑἒἓἔἕἘἙἜἝ"
        r"ἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮ"
        r"ἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾ"
        r"ὀὁὂὃὄὅὈὉὊὋὌὍ"
        r"ὐὑὒὓὔὕὖὗὙὛὝ"
        r"ὠὡὢὣὤὥὦὧὨὩὪὫὬὭὮὯ"
        r"ὰὲὴὶὸὺὼᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍ"
        r"ᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝ"
        r"ᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯ"
        r"ᾲᾳᾴᾶᾷῂῃῄῆῇῒῖῗῢῤῥῦῧῬῲῳῴῶῷ]+",
        line.lower()
    )

    # Tokenise each word and flatten the result into a single token list.
    token_lists = [process_word(word) for word in words]
    return [token for tokens in token_lists for token in tokens]
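
# Illustrative example (traced through the functions above):
#     preprocess_greek_line("Μῆνιν ἄειδε")
#     -> ['μ', 'ῆ', 'ν', 'ι', 'ν', 'ἄ', 'ει', 'δ', 'ε']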
|
|
|
|
|
class GreekSyllableTokenizer(PreTrainedTokenizer):
    """Hugging Face tokenizer that splits Ancient Greek text into syllables."""

    vocab_files_names = {"vocab_file": "vocab.json"}

    def __init__(self, vocab_file: str, **kwargs):
        # Load the syllable-to-id mapping before calling the parent constructor,
        # which may already need vocabulary lookups.
        with Path(vocab_file).open(encoding="utf-8") as f:
            self.vocab = json.load(f)
        self.ids_to_tokens = {idx: tok for tok, idx in self.vocab.items()}

        # Default special tokens (BERT-style names).
        kwargs.setdefault("pad_token", "[PAD]")
        kwargs.setdefault("unk_token", "[UNK]")
        kwargs.setdefault("bos_token", "[CLS]")
        kwargs.setdefault("eos_token", "[SEP]")
        kwargs.setdefault("cls_token", "[CLS]")
        kwargs.setdefault("sep_token", "[SEP]")
        kwargs.setdefault("mask_token", "[MASK]")

        # Make sure every special token has an id in the vocabulary.
        for sp in [kwargs["bos_token"], kwargs["eos_token"],
                   kwargs["unk_token"], kwargs["pad_token"], kwargs["mask_token"]]:
            if sp not in self.vocab:
                self.vocab[sp] = len(self.vocab)
                self.ids_to_tokens[self.vocab[sp]] = sp

        super().__init__(**kwargs)
|
|
|
|
|
    def _tokenize(self, text):
        # Pre-process the raw text into letter/diphthong tokens, then group
        # those tokens into syllable strings.
        return syllabify_joined(preprocess_greek_line(text))

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, idx):
        return self.ids_to_tokens.get(idx, self.unk_token)
|
|
|
|
|
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        [CLS] tokens_0 [SEP]                 (single sequence)
        [CLS] tokens_0 [SEP] tokens_1 [SEP]  (sequence pair)
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        return ([self.cls_token_id] +
                token_ids_0 +
                [self.sep_token_id] +
                token_ids_1 +
                [self.sep_token_id])

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Generate token_type_ids to distinguish sequences if token_ids_1 is given.
        As in RoBERTa, token_type_ids are not used here, so every position is 0.
        """
        if token_ids_1 is None:
            return [0] * (len(token_ids_0) + 2)
        return [0] * (len(token_ids_0) + 2) + [0] * (len(token_ids_1) + 1)

    def get_special_tokens_mask(self,
                                token_ids_0,
                                token_ids_1=None,
                                already_has_special_tokens=False):
        if already_has_special_tokens:
            return [
                1 if tid in (self.cls_token_id, self.sep_token_id) else 0
                for tid in (token_ids_0 + (token_ids_1 or []))
            ]
        if token_ids_1 is None:
            return [1] + [0] * len(token_ids_0) + [1]
        return [1] + [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]
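
    # Illustrative layout (assuming cls_token_id=0, sep_token_id=1, and real token ids 5, 6, 7):
    #     build_inputs_with_special_tokens([5, 6], [7]) -> [0, 5, 6, 1, 7, 1]
    #     get_special_tokens_mask([5, 6], [7])          -> [1, 0, 0, 1, 0, 1]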
|
|
|
    def save_vocabulary(self, save_directory, filename_prefix=None):
        # Write the vocabulary as UTF-8 JSON, the same format expected by __init__.
        path = Path(save_directory) / (("" if filename_prefix is None else filename_prefix) + "vocab.json")
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("w", encoding="utf-8") as f:
            json.dump(
                {str(k): v for k, v in self.vocab.items()},
                f,
                ensure_ascii=False,
                indent=2
            )
        return (str(path),)

    def get_vocab(self):
        return self.vocab

    @property
    def vocab_size(self):
        return len(self.vocab)
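

# Usage sketch (illustrative only): build a tiny syllable vocabulary from a
# sample line and round-trip it through GreekSyllableTokenizer.  The sample
# text, variable names, and temporary file layout below are assumptions for
# demonstration, not part of the module's API, and a reasonably recent
# `transformers` release is assumed.
if __name__ == '__main__':
    import tempfile

    sample_line = "μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος"
    sample_syllables = syllabify_joined(preprocess_greek_line(sample_line))

    # Assign an id to every distinct syllable seen in the sample.
    sample_vocab = {syl: idx for idx, syl in enumerate(dict.fromkeys(sample_syllables))}

    with tempfile.TemporaryDirectory() as tmp_dir:
        vocab_path = Path(tmp_dir) / "vocab.json"
        vocab_path.write_text(json.dumps(sample_vocab, ensure_ascii=False), encoding="utf-8")

        tokenizer = GreekSyllableTokenizer(vocab_file=str(vocab_path))
        encoded = tokenizer(sample_line)

        print("\nUsage sketch:")
        print(encoded["input_ids"])
        print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))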