import functools

from transformers import MBartForConditionalGeneration, MBartTokenizer


@functools.lru_cache(maxsize=2)
def _load_model(model_name):
    """Load the tokenizer/model pair for *model_name*, caching the result.

    ``from_pretrained`` is expensive (disk or network access plus weight
    initialisation), so the pair is kept in a small LRU cache and reused by
    subsequent calls with the same name. The cache is deliberately tiny
    because each entry holds a full model in memory.

    Args:
        model_name: Model identifier or local path passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple with the tokenizer's language codes set.
    """
    tokenizer = MBartTokenizer.from_pretrained(model_name)
    model = MBartForConditionalGeneration.from_pretrained(model_name)

    # The language pair never changes between calls, so configure it once
    # at load time instead of on every request.
    tokenizer.src_lang = "en_XX"
    tokenizer.tgt_lang = "si_LK"
    return tokenizer, model


def transliterate(
    text: str,
    model_name: str = "deshanksuman/swabhashambert50SinhalaTransliteration",
    *,
    max_length: int = 128,
    num_beams: int = 5,
) -> str:
    """Run *text* through the seq2seq transliteration model and decode the result.

    The tokenizer is configured with source language ``en_XX`` and target
    language ``si_LK``; the default checkpoint name indicates romanized
    Sinhala -> Sinhala script transliteration.

    Args:
        text: Input text. Empty or whitespace-only strings short-circuit to
            ``""`` without loading the model.
        model_name: Model identifier or local path; the tokenizer and model
            are loaded once per name and cached.
        max_length: Token limit applied both to the (truncated, padded)
            input and to the generated sequence.
        num_beams: Beam width used during generation.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Nothing to transliterate: avoid a model load and a beam search.
    if isinstance(text, str) and not text.strip():
        return ""

    tokenizer, model = _load_model(model_name)

    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Only the first sequence in the batch is decoded and returned.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Example usage
def main():
    """Transliterate a sample romanized sentence and print input and output."""
    test_text = "mama oyata adare karanawa"
    result = transliterate(test_text)
    print(f"Romanized: {test_text}")
    print(f"Transliterated: {result}")


if __name__ == "__main__":
    main()