Spaces:

saadustto2007
/

English-to-Cyrillic-Farsi

Running

App Files Files Community

English-to-Cyrillic-Farsi / app.py

saadustto2007

Update app.py

d59e3a5 verified 2 months ago

raw

history blame contribute delete

8.52 kB

	from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
	import gradio as gr
	import torch

	# Checkpoint used for the English -> Farsi translation.
	model_name = "facebook/m2m100_418M"

	# Load tokenizer and model once, at import time. A failure here is fatal
	# because every translation request below depends on both objects.
	try:
	    tokenizer = M2M100Tokenizer.from_pretrained(model_name)
	    model = M2M100ForConditionalGeneration.from_pretrained(model_name)
	except Exception as e:
	    # Raise SystemExit directly instead of calling the `exit()` helper: that
	    # helper is injected by the `site` module for interactive sessions and is
	    # not guaranteed to exist. A string argument is written to stderr and the
	    # process terminates with exit status 1.
	    raise SystemExit(f"Error loading model or tokenizer: {e}") from e

	# Move model to GPU if available
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	model = model.to(device)

	# Predefined common English-to-Farsi phrase mappings.
	# Keys must be plain ASCII: translate_to_cyrillic_farsi() rejects any input
	# containing non-ASCII characters, so a key with e.g. a typographic
	# apostrophe (U+2019) could never be matched.
	common_phrases = {
	    "Hello": "سلام",
	    "Hi!": "سلام!",
	    "Good morning": "صبح بخیر",
	    "Good afternoon": "عصر بخیر",
	    "Good evening": "شب بخیر",
	    "Goodbye": "خداحافظ",
	    "Good night": "شب خوش",
	    "How are you?": "حالت چطوره؟",
	    "I am fine, thank you. And you?": "خوبم، متشکرم. و شما؟",
	    "Thank you (very much)": "متشکرم (خیلی ممنون)",
	    "You're welcome": "خواهش میکنم",
	    "Excuse me": "ببخشید",
	    "Pardon me": "معذرت می‌خواهم",
	    "I'm sorry": "متأسفم",
	    "Congratulations": "تبریک می‌گویم",
	    "Please sit down": "لطفاً بنشینید",
	    "Good luck": "موفق باشید",
	    "Have a good trip": "سفر خوبی داشته باشید",
	    "What is your name?": "اسم شما چیست؟",
	    "My name is Sara": "اسم من سارا است",
	    "Where are you from?": "اهل کجا هستید؟",
	    "I am from Iran": "من اهل ایران هستم",
	    "Do you speak English?": "آیا انگلیسی صحبت می‌کنید؟",
	    "I don't understand": "من متوجه نمی‌شوم",
	    "Please speak slowly": "لطفاً آهسته صحبت کنید",
	    "Do you have a Persian-English dictionary?": "آیا دیکشنری فارسی-انگلیسی دارید؟",
	    "How do you say this in English?": "این را در انگلیسی چگونه می‌گویند؟",
	    "How much is this?": "این چقدر قیمت دارد؟",
	    "Where is the bathroom?": "دستشویی کجاست؟",
	    "Help!": "کمک!",
	    "I am lost": "من گم شده‌ام",
	    "Can you help me?": "می‌توانید به من کمک کنید؟",
	    "What time is it?": "ساعت چند است؟",
	    "Where is the hospital?": "بیمارستان کجاست؟",
	    "I love you": "دوستت دارم",
	    "How can I get to the airport?": "چطور می‌توانم به فرودگاه بروم؟",
	    "I need a doctor": "به یک پزشک نیاز دارم",
	    "Where can I buy a ticket?": "از کجا می‌توانم بلیط بخرم؟",
	    "I am hungry": "گرسنه‌ام",
	    "Can I have some water?": "می‌توانم کمی آب بگیرم؟",
	    "It's very beautiful": "خیلی زیباست",
	    "See you later": "بعداً می‌بینمت",
	    "What is this?": "این چیست؟",
	    "I am happy": "خوشحالم",
	    "It is very chilly today": "امروز خیلی سرد است",
	    "I hope we have better weather tomorrow": "امیدوارم فردا هوا بهتر شود",
	}

	# Break text into punctuation-delimited phrases
	def split_into_phrases(text):
	    """Split *text* at every ",", ".", "?" and "!" and return the pieces.

	    Each piece is stripped of surrounding whitespace; pieces that end up
	    empty are dropped. The separators themselves are not kept.
	    """
	    # Collapse all separators onto "." so a single split handles them all.
	    unified = text
	    for mark in (",", "?", "!"):
	        unified = unified.replace(mark, ".")

	    trimmed = (piece.strip() for piece in unified.split("."))
	    return [piece for piece in trimmed if piece]

	# Farsi (Perso-Arabic script) to Tajik-style Cyrillic transliteration
	def transliterate_farsi_to_cyrillic(farsi_text):
	    """Convert Farsi text written in Perso-Arabic script to Cyrillic script.

	    Lookup order:
	      1. If the whole input is a key of the phrase/word table, return its
	         curated Cyrillic form.
	      2. Otherwise split on whitespace; each word found in the table is
	         replaced by its curated form, and every other word is converted
	         character by character.

	    Characters without a mapping (Latin letters, digits, brackets, ...) are
	    passed through unchanged. Short vowels are not written in Perso-Arabic
	    script, so the character-level fallback cannot restore them and its
	    output is only an approximation.

	    Args:
	        farsi_text: Text in Perso-Arabic script (may contain other characters).

	    Returns:
	        The transliterated text, with words joined by single spaces.
	    """
	    word_map = {
	        "سلام": "Салом",
	        "خداحافظ": "Худоҳафиз",
	        "شب بخیر": "Шаб хайр",
	        "صبح بخیر": "Субҳ хайр",
	        "ممنون": "Ташаккур",
	        "خواهش میکنم": "Илтимос",
	        "چطور هستی؟": "Чӣ тур ҳастӣ?",
	        "چطور هستید؟": "Шумо чӣ туред?",
	        "بله": "Ҳа",
	        "نه": "Не",
	        "ایران": "Эрон",
	        "تشکر": "Ташаккур",
	        "فارسی": "Форсӣ",
	        "اسم من": "Номи ман",
	        "لطفا": "Илтимос",
	        "کمک": "Кумак",
	        "هستی": "ҳастӣ",
	        "هستید": "ҳастед",
	        "است": "аст",
	        "امروز": "Имрӯз",
	        "خیلی": "Хеле",
	        "سرد": "Сард",
	        "امیدوارم": "Умидворам",
	        "فردا": "Фардо",
	        "هوا": "Ҳаво",
	        "بهتر": "Беҳтар",
	        "شود": "Шавад",
	    }

	    char_map = {
	        "ا": "а",
	        "\u0622": "о",  # alef with madda
	        "ب": "б",
	        "پ": "п",
	        "ت": "т",
	        "\u062b": "с",  # theh
	        # Tajik Cyrillic writes jeem as "ҷ"; "ж" is reserved for zheh below.
	        "ج": "ҷ",
	        "چ": "ч",
	        "ح": "ҳ",
	        "خ": "х",
	        "د": "д",
	        "\u0630": "з",  # thal
	        "ر": "р",
	        "ز": "з",
	        "\u0698": "ж",  # zheh
	        "س": "с",
	        "ش": "ш",
	        "ص": "с",
	        "\u0636": "з",  # dad
	        "ط": "т",
	        "\u0638": "з",  # zah
	        "ع": "ъ",
	        "غ": "ғ",
	        "ف": "ф",
	        "ق": "қ",
	        "ک": "к",
	        "\u0643": "к",  # Arabic kaf, sometimes emitted instead of Persian keheh
	        "گ": "г",
	        "ل": "л",
	        "م": "м",
	        "ن": "н",
	        "و": "в",
	        "ه": "ҳ",
	        "ی": "й",
	        "\u064a": "й",  # Arabic yeh, sometimes emitted instead of Persian yeh
	        "؟": "?",
	        "،": ",",
	        " ": " ",
	        # Zero-width non-joiner is a Perso-Arabic shaping hint with no meaning
	        # in Cyrillic text; drop it instead of leaking an invisible character.
	        "\u200c": "",
	    }

	    def transliterate_name(word):
	        # Map every character unconditionally. Unmapped characters fall
	        # through unchanged, so no script check is needed, and short words
	        # (one or two letters) are no longer left in Arabic script.
	        return "".join(char_map.get(c, c) for c in word)

	    # Whole-input match first, so multi-word table entries can be found.
	    if farsi_text in word_map:
	        return word_map[farsi_text]

	    return " ".join(
	        word_map[word] if word in word_map else transliterate_name(word)
	        for word in farsi_text.split()
	    )

	# Characters whose presence marks a string as Farsi (Perso-Arabic script).
	# Includes the Persian-specific letters and alef with madda as well.
	_FARSI_CHARS = frozenset("ابتثجحخدذرزسشصضطظعغفقکگلمنوهیءأؤئء،؟" "پچژآ")


	def _contains_farsi(text):
	    """Return True if *text* contains at least one recognised Farsi character."""
	    return any(c in _FARSI_CHARS for c in text)


	def _translate_en_to_fa(text):
	    """Run one English -> Farsi model pass and trim ASCII edge punctuation."""
	    tokenizer.src_lang = "en"
	    encoded_text = tokenizer(text, return_tensors="pt", padding=True).to(device)
	    translated = model.generate(**encoded_text, forced_bos_token_id=tokenizer.get_lang_id("fa"))
	    farsi_text = tokenizer.decode(translated[0], skip_special_tokens=True)
	    return farsi_text.strip(".!?, ")


	# Translation function with input validation and cleaning
	def translate_to_cyrillic_farsi(text):
	    """Translate English text to Farsi and transliterate it to Cyrillic.

	    The whole input is sent through the model first. If the result contains
	    no Farsi characters, a fallback is used: the trimmed input is looked up
	    in ``common_phrases``; failing that, the input is split into phrases and
	    each phrase is taken from ``common_phrases`` or translated on its own.
	    Phrases that still yield no Farsi are emitted as
	    ``[UNTRANSLATED: <phrase>]``.

	    Args:
	        text: English input; must be non-blank and ASCII-only.

	    Returns:
	        A ``(farsi_text, cyrillic_text)`` tuple. On invalid input the first
	        element is an error message and the second is an empty string.
	    """
	    if not text or not text.strip():
	        return "Error: Please enter a valid English text.", ""
	    if not all(ord(char) < 128 for char in text):
	        return "Error: Please enter text in English (ASCII characters only).", ""

	    # Try full sentence translation first
	    farsi_text = _translate_en_to_fa(text)

	    if not _contains_farsi(farsi_text):
	        # Look up the whole input before splitting: keys such as "Hi!" or
	        # "How are you?" contain separator characters and can never match
	        # once the text has been split on them.
	        whole_text = text.strip()
	        if whole_text in common_phrases:
	            farsi_text = common_phrases[whole_text]
	        else:
	            # Fall back to phrase-by-phrase translation
	            farsi_translations = []
	            for phrase in split_into_phrases(text):
	                if phrase in common_phrases:
	                    farsi_translations.append(common_phrases[phrase])
	                    continue
	                translated_text = _translate_en_to_fa(phrase)
	                if _contains_farsi(translated_text):
	                    farsi_translations.append(translated_text)
	                else:
	                    farsi_translations.append(f"[UNTRANSLATED: {phrase}]")
	            farsi_text = " ".join(farsi_translations)

	    cyrillic_text = transliterate_farsi_to_cyrillic(farsi_text)
	    return farsi_text, cyrillic_text

	# Gradio UI: one English input box and two output boxes, one for the
	# native-script Farsi translation and one for its Cyrillic transliteration.
	interface = gr.Interface(
	    title="English to Cyrillic Farsi Translator",
	    description="Enter an English word or sentence, and this tool will translate it to Farsi in both native and Cyrillic scripts.",
	    fn=translate_to_cyrillic_farsi,
	    inputs=gr.Textbox(label="Enter Text in English"),
	    outputs=[
	        gr.Textbox(label="Farsi Translation (Native Script)"),
	        gr.Textbox(label="Farsi Translation (Cyrillic Script)"),
	    ],
	)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    interface.launch()