Levanti Transliterator
This model converts diacritics in Palestinian colloquial Arabic to their estimated pronunciation via Hebrew vowels. It can be used to transliterate diacritized Palestinian Arabic text into Hebrew or English. The model is trained on a special subset of the Levanti dataset (to be released later).
The model is fine-tuned from Google's CANINE-s character level LM with a token classification head.
Each token (letter) of the input is classified into one of 7 classes: 'O' if it is not a diacritic, or one of 6 Hebrew vowels (see model.config.id2label).
This model can be used in conjunction with Levanti Diacritizer, which adds diacritics to raw Palestinian Arabic text.
Example Usage
from transformers import CanineForTokenClassification, AutoTokenizer
import torch
# The classifier weights and the tokenizer are both loaded from the same
# Hub repository, so the identifier is spelled out only once.
checkpoint = "guymorlan/levanti_diacritics2translit"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = CanineForTokenClassification.from_pretrained(checkpoint)
def diacritics2hebrew_vowels(text, model, tokenizer):
    """Replace each Arabic diacritic in *text* with the predicted Hebrew vowel.

    The model assigns one label per input position. Positions labelled "O"
    keep their original character; every other position is replaced by the
    label string found in ``model.config.id2label``.

    Args:
        text: Diacritized Arabic text.
        model: Token-classification model whose config exposes ``label2id``
            (containing the key "O") and ``id2label``.
        tokenizer: Tokenizer matching *model*.
            NOTE(review): the predictions are zipped against the characters
            of *text*, so this assumes the tokenizer emits exactly one token
            per character plus one leading and one trailing special token --
            confirm for the tokenizer in use.

    Returns:
        A string in which the characters at non-"O" positions have been
        replaced by their predicted labels.
    """
    tokens = tokenizer(text, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**tokens).logits

    # Highest-scoring class per position; keep the first (only) batch item
    # and drop the first and last positions (CLS and SEP).
    pred = logits.argmax(-1).tolist()[0][1:-1]

    outside_id = model.config.label2id["O"]
    id2label = model.config.id2label
    # NOTE(review): presumably every non-"O" label is the Hebrew vowel
    # character itself, so it can be spliced directly into the output.
    return "".join(
        c if p == outside_id else id2label[p] for p, c in zip(pred, text)
    )
# Convert Arabic diacritics to Hebrew diacritics (Tsere, Holam, Patah, Shva, Kubutz, Hiriq)
text = "لَازِم نِعْطِي رَشَّات وِقَائِيِّة لِلشَّجَر "
heb_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
Out[1]: 'لַازֵم نִعְطִي رַشַّات وִقַائִيֵّة لִلشַّجַر '
# Character-level map from Arabic to the Hebrew spelling used in the
# transliteration. Multi-character values (a letter followed by an
# apostrophe, or a doubled alef) are intentional.
arabic_to_hebrew = {
    # regular letters
    "ا": "א", "أ": "א", "إ": "א", "ء": "א", "ئ": "א", "ؤ": "א",
    "آ": "אא", "ى": "א", "ب": "ב", "ت": "ת", "ث": "ת'", "ج": "ג'",
    "ح": "ח", "خ": "ח'", "د": "ד", "ذ": "ד'", "ر": "ר", "ز": "ז",
    "س": "ס", "ش": "ש", "ص": "צ", "ض": "צ'", "ط": "ט", "ظ": "ט'",
    "ع": "ע", "غ": "ע'", "ف": "פ", "ق": "ק", "ك": "כ", "ل": "ל",
    "م": "מ", "ن": "נ", "ه": "ה", "و": "ו", "ي": "י", "ة": "ה",
    # special characters
    "،": ",", "َ": "ַ", "ُ": "ֻ", "ِ": "ִ",
}

# Word-final Hebrew forms; these take precedence over arabic_to_hebrew when
# the letter is the last character or is directly followed by a separator.
final_letters = {
    "ن": "ן", "م": "ם", "ص": "ץ", "ض": "ץ'", "ف": "ף",
}


def to_taatik(arabic):
    """Transliterate *arabic* into Hebrew script, character by character.

    A character that has an entry in ``final_letters`` and is either the
    last character of the input or immediately followed by a space, a
    period or an Arabic comma is written with its word-final form. Any
    other character is looked up in ``arabic_to_hebrew``; characters with
    no entry there are copied through unchanged.

    Args:
        arabic: The text to transliterate.

    Returns:
        The transliterated string; the empty string for empty input.
    """
    # Only these three characters count as "end of word" after a letter.
    word_boundaries = {" ", ".", "،"}
    last_index = len(arabic) - 1

    taatik = []
    for index, letter in enumerate(arabic):
        # The last-index test comes first so arabic[index + 1] is never
        # evaluated past the end of the string.
        at_word_end = (
            index == last_index or arabic[index + 1] in word_boundaries
        )
        if at_word_end and letter in final_letters:
            taatik.append(final_letters[letter])
        else:
            # Unmapped characters fall back to themselves.
            taatik.append(arabic_to_hebrew.get(letter, letter))
    return "".join(taatik)
# to convert consonants and create full hebrew transliteration (Taatik)
Out[2]: "לַאזֵם נִעְטִי רַשַّאת וִקַאאִיֵّה לִלשַّג'ַר "
# Character-level map to Latin output. Keys are Arabic letters, Arabic
# diacritics, the Arabic comma and Hebrew vowel points. The value "SHADDA"
# is a sentinel handled by to_translit, not text that is ever emitted.
arabic_to_english = {
    "ا": "a", "أ": "a", "إ": "a", "ء": "a", "ئ": "a", "ؤ": "a",
    "آ": "aa", "ى": "a", "ب": "b", "ت": "t", "ث": "th", "ج": "j",
    "ح": "h", "خ": "kh", "د": "d", "ذ": "dh", "ر": "r", "ز": "z",
    "س": "s", "ش": "sh", "ص": "s", "ض": "d", "ط": "t", "ظ": "z",
    "ع": "a", "غ": "gh", "ف": "f", "ق": "q", "ك": "k", "ل": "l",
    "م": "m", "ن": "n", "ه": "h", "و": "w", "ي": "y", "ة": "h",
    "َ": "a", "ُ": "u", "ِ": "i",
    "،": ",",
    "ֹ": "o",  # holam
    "ַ": "a",  # patah
    "ִ": "i",  # hiriq
    "ְ": "",  # shva
    "ֻ": "u",  # kubutz
    'ֵ': "e",
    "ّ": "SHADDA",  # shadda
}

# Characters a shadda looks past to find the element it should upper-case.
vowels = ["،", ",", "َ", "ַ", "ُ", "ֻ", "ِ", "ִ", 'ֵ']


def to_translit(arabic):
    """Transliterate *arabic* into Latin letters, character by character.

    Characters without an entry in ``arabic_to_english`` are copied through
    unchanged. A shadda emits nothing itself; instead it upper-cases the
    output of the preceding element, or of the element before that when the
    immediately preceding character is listed in ``vowels``. A shadda with
    no such element before it (for example at the very start of the input)
    is ignored rather than raising ``IndexError``.

    Args:
        arabic: The text to transliterate.

    Returns:
        The transliterated string; the empty string for empty input.
    """
    # [source character, latin output] pairs. Lists rather than tuples
    # because a later shadda rewrites the output of an earlier pair.
    translit = []
    for letter in arabic:
        latin = arabic_to_english.get(letter)
        if latin is None:
            translit.append([letter, letter])
        elif latin == "SHADDA":
            # Step over one vowel, if present, to reach the element to mark.
            steps_back = 2 if translit and translit[-1][0] in vowels else 1
            if len(translit) >= steps_back:
                target = translit[-steps_back]
                target[1] = target[1].upper()
        else:
            translit.append([letter, latin])

    return "".join(pair[1] for pair in translit)
# to convert letters to latin representation (English transliteration)
Out[3]: 'laazem niatiy raSHaat wiqaaaiYeh lilSHajar '
Created by Guy Mor-Lan.
Contact: guy.mor AT mail.huji.ac.il
- Downloads last month
- 15
Inference Providers
This model is not currently available via any of the supported third-party Inference Providers, and
the model is not deployed on the HF Inference API.