Spaces:

anisgtboi
/

my-dialect-translator-app

Sleeping

App Files Files Community

my-dialect-translator-app / app.py

anisgtboi

Update app.py

0afbfd4 verified 12 days ago

raw

history blame

23.4 kB

	# app.py
	"""
	Enhanced Dialect Bengali Translator with Semantic Search
	Uses both text similarity and semantic pattern matching
	Updated to include new dialect patterns and polite/negative 'des/dis' behavior
	"""

	import difflib
	import traceback
	import gradio as gr
	from collections import defaultdict
	import re

	# === Phrase data: [Dialect Latin, Dialect Bengali Script, Actual Bengali (Std), Benglish] ===
	# Each row is one known phrase. Column 0 is the lookup key (compared
	# case-insensitively), columns 1-3 are what the UI displays.
	# Review fixes applied to this table:
	#   * "jaibe" / "faibe" rows contained Devanagari code points mixed into the
	#     Bengali text; they are now pure Bengali script.
	#   * "kaimu" / "kaibay" / "kaibe" rows now agree with their own Benglish
	#     column and with the lexicon ("khabo" for 1st person, "খাবে" for 2nd).
	#   * The duplicated "Ami bazaro jaimu" row was dropped so the phrase is not
	#     listed twice in suggestion lists.
	#   * The last row's Benglish column was an English gloss; it is now a
	#     romanization of the standard Bengali like every other row.
	phrases_data = [
	    # Questions / common
	    ["gesle ni", "গেসলে নি", "গিয়েছিলে কি?", "giese chile ki?"],
	    ["oislo ni", "ওইস্লো নি", "হয়েছে কি?", "hoyeche ki?"],
	    ["oigese ni", "ওইগেসে নি", "হয়ে গেছে কি?", "hoyegese ki?"],
	    ["oise", "ওইসে", "হয়েছে", "hoyeche"],
	    ["bala oise", "বালা ওইসে", "ভালো হয়েছে", "bhalo hoyeche"],
	    ["kub bala oise", "কুব বালা ওইসে", "অনেক ভালো হয়েছে", "onek bhalo hoyeche"],
	    ["oise jen", "ওইসে জেন", "হয়েছিল যে", "hoyechilo je"],
	    ["jaite ni", "জাইতে নি", "যাবে কি?", "jabe ki?"],
	    ["or ni", "ওর নি", "হচ্ছে কি?", "hocche ki?"],
	    ["or", "ওর", "হচ্ছে", "hocche"],
	    ["bala or", "বালা ওর", "ভালো হচ্ছে", "bhalo hocche"],
	    ["bala ni", "বালা নি", "ভালো কি?", "bhalo ki?"],
	    ["or je", "ওর যে", "হচ্ছে যে", "hocche je"],
	    ["jaibe ni", "জাইবে নি", "যাবে কি?", "jabe ki?"],
	    ["jare ni", "জারে নি", "যাচ্ছো কি?", "jaccho ki?"],
	    ["Kita kobor?", "কিতা খবর?", "কি খবর?", "ki khobor?"],
	    ["Kita korde?", "কিতা কোর্দে?", "কি করছে?", "ki korchho?"],
	    ["acha oibo-tik ase", "আচা ওইবো-তিক আসে", "ঠিক আছে", "thik ache"],
	    ["tew", "তেও", "তাহলে", "tahole"],
	    ["tente", "তেনতে", "তাহলে", "tahole"],
	    ["to", "তো", "তাহলে", "tahole"],
	    ["se hole", "সে হলে", "তাহলে", "tahole"],
	    ["Sob bala asoin ni", "সব বালা আসইন নি", "সব ভালো আছে কি?", "sob bhalo ache ki?"],
	    ["Sob bala ase", "সব বালা আসে", "সব ভালো আছে", "sob bhalo ache"],
	    ["asoini", "আসইনি", "আছে কি?", "ache ki?"],
	    ["ase", "আসে", "আছে", "ache"],

	    # Future / Present / Past core verbs (ja / de / fawa / ka)
	    ["jaimu", "জাইমু", "যাব", "jabo"],
	    ["jaibay", "জাইবে", "তুমি যাবে", "tumi jabe (dialect)"],
	    ["jaibe", "জাইবে", "তুমি যাবে (friend)", "tumi jabe (friend form)"],
	    ["jaibo", "জাইবো", "যাবে", "jabe"],
	    ["jaiba", "জাইবা", "তারা যাবে", "tara jabe"],
	    ["oibo", "ওইবো", "হবে", "hobe"],
	    ["oibo jen", "ওইবো জেন", "হবে যে", "hobe je"],

	    ["ami jaimu", "আমি জাইমু", "আমি যাব", "ami jabo"],
	    ["Ami bazaro jaimu", "আমি বাজারো জাইমু", "আমি বাজারে যাব", "ami bazar e jabo"],
	    ["He rit aise", "হে রিত আসে", "সে রাতে এসেছে", "se rate esheche"],

	    # Give (de) family
	    ["des", "দেস", "দাও (মৃদু)", "des (give, friendly)"],
	    ["des na", "দেস না", "দাও (দয়া করে, মৃদু অনুরোধ)", "des na (please give)"],
	    ["dis", "দিস", "না দাও / নিষেধ", "dis (don't give)"],
	    ["dis na", "দিস না", "দেও না", "dis na (don't give)"],
	    ["dilaisi", "দিলাইসি", "দিয়েছি", "diyechi (I gave)"],
	    ["dilaise", "দিলাইসে", "দিয়েছে", "diyeche (he gave)"],
	    ["dilaisoin", "দিলাইসইন", "দিয়েছেন (সম্মানভাষা)", "diyechen (honorific)"],
	    ["dise na", "দিসে না", "দেয়নি", "deni (didn't give)"],
	    ["dibo", "দিবো", "দেব", "debo (will give)"],
	    ["der amare", "দের আমিরে", "সে আমাকে দেয়", "se amake dey"],
	    ["dibo amare", "দিবো আমিরে", "সে আমাকে দেবে", "se amake debe"],

	    # Get / receive (fawa) family
	    ["faisi", "ফাইসি", "পেয়েছি", "peyechi (I got)"],
	    ["faisi na", "ফাইসি না", "পাইনি", "pelam na (didn't get)"],
	    ["faisot ni", "ফাইসোট নি", "পেলে কি?", "pele ki?"],
	    ["faislo", "ফাইসলো", "পেয়ে গেল/লাভ করল (3sg past)", "pelo (he got)"],
	    ["faislam", "ফাইসলাম", "পেয়েছিলাম", "pelam (I got past)"],
	    ["faisla", "ফাইসলা", "পেয়েছিল (they)", "pela (they got)"],
	    ["faisly", "ফাইসলাই", "তুমি পেয়েছ", "tumi pele (you got)"],
	    ["faimu", "ফাইমু", "পাব", "pabo (I will get)"],
	    ["faibay", "ফাইবে", "তুমি পাবে (dialect)", "tumi pabe"],
	    ["faibe", "ফাইবে", "তুমি পাবে (friend)", "tumi pabe (friend)"],
	    ["faibo", "ফাইবো", "সে পাবে", "se pabe"],
	    ["faiba", "ফাইবা", "তারা পাবে", "tara pabe"],

	    # Eat (ka) family
	    ["kaimu", "কাইমু", "খাব", "khabo (I will eat)"],
	    ["kaibay", "কাইবে", "তুমি খাবে (dialect)", "tumi khabe"],
	    ["kaibe", "কাইবে", "তুমি খাবে (friend)", "tumi khabe (friend)"],
	    ["kaibo", "কাইবো", "সে খাবে", "se khabe"],
	    ["kaiba", "কাইবা", "তারা খাবে", "tara khabe"],

	    # Other sample sentences from user's corpus
	    ["Ami faisi ekta notun jinish", "আমি ফাইসি একটা নতুন জিনিস", "আমি একটা নতুন জিনিস পেয়েছি", "ami ekta notun jinish peyechi"],
	    ["Tumi taka faiso ni", "তুমি টাকা ফাইসো নি", "তুমি টাকা পেয়েছ কি?", "tumi taka peyecho ki?"],
	    ["He sobsomoy amare teka dey", "হে সবসময় আমিারে তেকা দেয়", "সে সবসময় আমাকে টাকা দেয়", "se shobshomoy amake taka dey"],
	    ["Tara bazaro bohut jinish faisoin", "তারা বাজারো বহুত জিনিস ফাইসইন", "তারা বাজারে অনেক জিনিস পেয়েছে", "tara bazar e onek jinish peyechhe"],
	    ["Tumi boi diso ni", "তুমি বই দিসো নি", "আপনি কি বই দিয়েছেন?", "apni boi diyechen?"],
	    ["Tuin boi disot ni", "তুইন বই দিসট নি", "তুই বই দিয়েছ কি?", "tui boi diyechish?"],
	    ["Bifodo asi", "বিফোডো আছি", "বিপদে আছি", "bipode achi"],
	    ["Kotobil bade fawa gese", "কোটবিল বাদে ফাওয়া গেসে", "অনেকদিন পরে পেয়েছি", "onekdin pore peyechi"],
	]

	# =============================================================================
	# NEW RULE-BASED TRANSLATION ENGINE
	# =============================================================================

	# 1. DIALECT LEXICON
	# Word-level dictionary used by the rule engine: each key is a single
	# lower-case dialect word (Latin spelling) and each value is the romanized
	# standard-Bengali word that replaces it.
	dialect_lexicon = {
	    # people
	    "fua": "chele",
	    "furi": "meye",
	    "tara": "tara",  # identical in dialect and standard
	    "ora": "tara",
	    "beta": "purush",
	    "sogra": "chele",
	    "bakka": "pagol",
	    # question words
	    "kun": "ke",  # who
	    "ki": "ki",  # what
	    "kano": "kothay",  # where
	    "kuno": "kothay",  # where (alternative spelling)
	    # common future forms
	    "jaibo": "jabe",
	    "oibo": "hobe",
	    "dibo": "debe",
	    # respectful / question verb forms
	    "lowna": "nen",  # you take (respectful)
	    "koin": "bolun",  # you say (respectful)
	    "disot": "diyechile",  # you gave (question form)
	    "faisot": "peyechile",  # you got (question form)
	    # state, quality and particles
	    "ase": "ache",
	    "or": "hocche",
	    "oise": "hoyeche",
	    "bala": "bhalo",
	    "kub": "onek",
	    "tik": "thik",
	    "acha": "thik",
	    "jen": "je",
	    "ni": "ki",
	    "kobor": "khobor",
	    "korde": "korchho",
	    "gesle": "giyechile",
	    "oislo": "hoyeche",
	    "oigese": "hoyegese",
	    # go
	    "jaimu": "jabo",
	    "jaibay": "jabe",
	    "jaibe": "jabe",
	    "jaiba": "jabe",
	    # give
	    "des": "dao",
	    "dis": "na dio",
	    "dilaisi": "diyechi",
	    "dilaise": "diyeche",
	    "dilaisoin": "diyechen",
	    "dise": "dey",
	    # get
	    "faisi": "peyechi",
	    "faislo": "pelo",
	    "faislam": "pelam",
	    "faisla": "pela",
	    "faimu": "pabo",
	    "faibay": "pabe",
	    "faibe": "pabe",
	    "faibo": "pabe",
	    "faiba": "pabe",
	    # eat
	    "kaimu": "khabo",
	    "kaibay": "khabe",
	    "kaibe": "khabe",
	    "kaibo": "khabe",
	    "kaiba": "khabe",
	}

	# 2. THE TRANSLATION FUNCTION
	def translate_with_rules(user_input, lexicon=None):
	    """Translate a sentence word by word using a dialect lexicon.

	    The sentence is lower-cased and split on whitespace. Sentence punctuation
	    attached to a word (for example the "?" in "ni?") is set aside for the
	    lookup and re-attached to the translated word, so punctuated input can
	    still be resolved.

	    Args:
	        user_input: The sentence typed by the user. ``None`` is treated as "".
	        lexicon: Optional mapping of dialect word -> standard word. Defaults
	            to the module-level ``dialect_lexicon``.

	    Returns:
	        A 4-tuple ``(dialect_sentence, standard_sentence, benglish_sentence,
	        explanation)``. The dialect and Benglish entries are the input text
	        unchanged (no transliteration is performed here). ``explanation`` is a
	        " | "-separated trace with one entry per word; unknown words are shown
	        as ``'word' -> ?`` and kept as-is in ``standard_sentence``.
	    """
	    if lexicon is None:
	        lexicon = dialect_lexicon

	    text = user_input or ""
	    punctuation = ".,;:!?"
	    translated_std_words = []
	    explanation = []

	    for token in text.lower().split():
	        # Separate leading/trailing punctuation from the word itself.
	        left_trimmed = token.lstrip(punctuation)
	        lead = token[:len(token) - len(left_trimmed)]
	        core = left_trimmed.rstrip(punctuation)
	        trail = left_trimmed[len(core):]

	        if core in lexicon:
	            std_word = lexicon[core]
	            translated_std_words.append(f"{lead}{std_word}{trail}")
	            # Report the bare word so punctuation never looks like an
	            # "unknown" marker to callers that scan the explanation.
	            explanation.append(f"'{core}' -> '{std_word}'")
	        else:
	            # Unknown word: keep it verbatim (it may be a proper noun).
	            translated_std_words.append(token)
	            explanation.append(f"'{token}' -> ?")

	    standard_sentence = " ".join(translated_std_words)

	    # The separator used to be " \| ", which put a literal backslash in the
	    # text shown to the user.
	    return text, standard_sentence, text, " | ".join(explanation)

	# =============================================================================
	# END OF NEW TRANSLATION ENGINE
	# =============================================================================

	# Semantic mapping of dialect patterns to meanings + types.
	# Keys are regular expressions that are searched in the LOWER-CASED user
	# input, so every literal in a pattern must itself be lower-case.
	semantic_patterns = {
	    # question/particles
	    r"\bni\b": {"meaning": "কি", "type": "question"},
	    # NOTE(review): subset of the pattern above, so a trailing "ni" is
	    # reported twice; kept because removing it would change match scores.
	    r"\bni\b$": {"meaning": "কি", "type": "question"},
	    # verbs / roots
	    r"\bor\b": {"meaning": "হচ্ছে", "type": "verb"},
	    r"\boise\b": {"meaning": "হয়েছে", "type": "verb"},
	    r"\boibo\b": {"meaning": "হবে", "type": "verb"},
	    r"\bjaimu\b": {"meaning": "যাব", "type": "verb"},
	    r"\bjaib[aey]\b": {"meaning": "যাবে", "type": "verb"},
	    r"\bkobor\b": {"meaning": "খবর", "type": "noun"},
	    r"\bkorde\b": {"meaning": "করছে", "type": "verb"},
	    r"\bacha\b": {"meaning": "ঠিক", "type": "adjective"},
	    r"\bbala\b": {"meaning": "ভালো", "type": "adjective"},
	    r"\bkub\b": {"meaning": "অনেক", "type": "adverb"},
	    r"\bgesle\b": {"meaning": "গিয়েছিলে", "type": "verb"},
	    r"\boislo\b": {"meaning": "হয়েছে", "type": "verb"},
	    r"\boigese\b": {"meaning": "হয়েগেছে", "type": "verb"},
	    r"\bjen\b": {"meaning": "যে", "type": "conjunction"},
	    r"\bje\b": {"meaning": "যে", "type": "conjunction"},
	    r"\btik\b": {"meaning": "ঠিক", "type": "adjective"},
	    r"\base\b": {"meaning": "আছে", "type": "verb"},
	    r"\basoin\b": {"meaning": "আছে", "type": "verb"},
	    r"\basoini\b": {"meaning": "আছে কি", "type": "verb+question"},
	    # Was r"\bGoto\b": the capital G could never match lower-cased input.
	    r"\bgoto\b": {"meaning": "গত", "type": "adjective"},
	    r"\bkali\b": {"meaning": "কাল", "type": "noun"},
	    r"\bkita\b": {"meaning": "কি", "type": "question"},
	    r"\btew\b": {"meaning": "তাহলে", "type": "conjunction"},
	    # give/get polarity (important dialect contrast)
	    r"\bdes\b": {"meaning": "দান/দাও (বন্ধু-মৃদু)", "type": "give_positive"},
	    r"\bdes\s+na\b": {"meaning": "মৃদু অনুরোধ: দাও", "type": "give_positive"},
	    r"\bdis\b": {"meaning": "না দাও / নিষেধ", "type": "give_negative"},
	    r"\bdis\s+na\b": {"meaning": "না দাও (নিষেধ)", "type": "give_negative"},
	    # fawa/get variants
	    r"\bfaisi\b": {"meaning": "পেয়েছি", "type": "verb"},
	    r"\bfaisl[ao]m\b": {"meaning": "পেয়েছিলাম/পেয়েছি(past)", "type": "verb"},
	    r"\bfaimu\b": {"meaning": "পাব", "type": "verb"},
	    r"\bfaib[ae]y?\b": {"meaning": "পাবে", "type": "verb"},
	    # Future-tense markers. These are word ENDINGS (jai-mu, jai-bay, ...), so
	    # they are anchored as suffixes; the former r"\bmu\b" style only matched
	    # the marker standing alone as its own word.
	    r"\w+mu\b": {"meaning": "ভবিষ্যৎ: 1sg", "type": "tense_future"},
	    r"\w+bay\b": {"meaning": "ভবিষ্যৎ: 2sg (tumi)", "type": "tense_future"},
	    r"\w+bo\b": {"meaning": "ভবিষ্যৎ: 3sg", "type": "tense_future"},
	    r"\w+ba\b": {"meaning": "ভবিষ্যৎ: plural/3pl", "type": "tense_future"},
	}

	# Lookup tables derived once from phrases_data at import time.
	dialects = [row[0] for row in phrases_data]  # dialect phrases as written
	dialects_lower = list(map(str.lower, dialects))  # lower-cased, same order
	actual_bengali_list = [row[2] for row in phrases_data]  # standard Bengali column

	# Lower-cased dialect phrase -> its full 4-column row (a later duplicate key
	# overwrites an earlier one).
	dialect_to_all = dict(zip(dialects_lower, phrases_data))

	def semantic_analysis(user_input):
	    """Report which entries of ``semantic_patterns`` occur in the input.

	    The input is lower-cased and every pattern is tried with ``re.search``.

	    Returns:
	        ``(detected_patterns, meaning_components)`` where ``detected_patterns``
	        is a list of ``(pattern, meaning, type)`` tuples in dictionary order
	        and ``meaning_components`` is the list of just the meanings, in the
	        same order.
	    """
	    lowered = user_input.lower()
	    hits = []

	    for regex, details in semantic_patterns.items():
	        try:
	            found = re.search(regex, lowered)
	            if found:
	                hits.append((regex, details["meaning"], details["type"]))
	        except re.error:
	            # A malformed pattern is ignored rather than aborting the analysis.
	            continue

	    meanings = [meaning for _, meaning, _ in hits]
	    return hits, meanings

	def find_semantic_matches(user_input, threshold=0.35):
	    """Score every known phrase against the patterns detected in the input.

	    For each pattern found in ``user_input`` a phrase earns:
	      * +0.35 if the pattern's meaning occurs in the phrase's standard
	        Bengali text, and
	      * +0.25 if the same pattern also occurs in the phrase's dialect text.
	    Half of the ``difflib`` similarity between the input and the dialect
	    phrase is added on top. Scores are not capped and may exceed 1.0.

	    Args:
	        user_input: The phrase typed by the user.
	        threshold: A phrase is kept only if its total score is strictly
	            greater than this value.

	    Returns:
	        A list of ``(row_index, score, "semantic")`` tuples in table order;
	        empty when no pattern is detected in the input.
	    """
	    user_lower = user_input.lower()
	    detected_patterns, _meanings = semantic_analysis(user_input)

	    # Without any detected pattern there is nothing semantic to compare.
	    if not detected_patterns:
	        return []

	    matches = []
	    for i, (dialect, _dialect_bengali, actual, _benglish) in enumerate(phrases_data):
	        dialect_lower = dialect.lower()
	        match_score = 0.0

	        for pattern, meaning, _kind in detected_patterns:
	            if meaning in actual:
	                match_score += 0.35
	            # The dialect column is Latin script while meanings are Bengali
	            # script, so the old ``meaning in dialect`` test could never be
	            # true. Compare the pattern itself against the dialect phrase.
	            if re.search(pattern, dialect_lower):
	                match_score += 0.25

	        text_similarity = difflib.SequenceMatcher(None, user_lower, dialect_lower).ratio()
	        total_score = match_score + (text_similarity * 0.5)

	        if total_score > threshold:
	            matches.append((i, total_score, "semantic"))

	    return matches

	def format_suggestions_from_indices(indices, match_type="text", scores=None):
	    """Render rows of ``phrases_data`` as bullet entries for the suggestions box.

	    Args:
	        indices: Row indices into ``phrases_data``, in display order.
	        match_type: Label placed in front of "-match" in the score suffix.
	        scores: Optional sequence parallel to ``indices``; each value is
	            multiplied by 100 and truncated to an integer percentage. Entries
	            without a corresponding score get no suffix.

	    Returns:
	        The entries joined by blank lines.
	    """
	    entries = []
	    for position, row_index in enumerate(indices):
	        latin, script, standard, romanized = phrases_data[row_index]

	        has_score = scores is not None and position < len(scores)
	        if has_score:
	            percent = int(scores[position] * 100)
	            suffix = f" ({match_type}-match: {percent}%)"
	        else:
	            suffix = ""

	        entries.append(
	            f"• {latin}{suffix}\n Dialect Bengali: {script}\n Actual Bengali: {standard}\n Benglish: {romanized}"
	        )
	    return "\n\n".join(entries)

	def _normalize_phrase(text):
	    """Lower-case ``text`` and strip surrounding whitespace and sentence punctuation."""
	    return text.lower().strip(" \t.,;:!?")


	def translate_text(user_text, top_k: int = 6):
	    """Translate a dialect phrase, trying progressively looser strategies.

	    Strategies, in order; the first one that produces a result wins:
	      1. exact phrase match (ignoring case and surrounding punctuation),
	      2. word-by-word rule engine, if it knows every word,
	      3. several punctuation-separated phrases, each matched exactly,
	      4. semantic pattern matches,
	      5. substring matches against the known dialect phrases,
	      6. fuzzy ``difflib`` matches,
	      7. rule-engine output even though some words are unknown.

	    Args:
	        user_text: Text from the input box; ``None`` is treated as empty.
	        top_k: Maximum number of candidates listed in the suggestions.

	    Returns:
	        ``(dialect_out, actual_out, benglish_out, suggestions_out)``, all
	        strings. Any exception is caught and reported in the fourth element
	        so the UI never sees a raised error.
	    """
	    try:
	        q = (user_text or "").strip()
	        if not q:
	            return "", "", "", "Please enter a phrase or question."

	        q_lower = q.lower()

	        # Normalized phrase -> row. setdefault keeps the FIRST row for a key,
	        # matching the old first-hit-wins linear scan.
	        exact_lookup = {}
	        for row in phrases_data:
	            exact_lookup.setdefault(_normalize_phrase(row[0]), row)

	        # 1) Exact match. Normalizing both sides lets "kita kobor" find the
	        #    stored "Kita kobor?" (and vice versa).
	        exact_row = exact_lookup.get(_normalize_phrase(q))
	        if exact_row is not None:
	            _, dialect_bengali, actual, benglish = exact_row
	            return dialect_bengali, actual, benglish, "✅ EXACT MATCH (100%)"

	        # 2) Rule engine: accept its answer only when every word was known.
	        dialect_out, actual_out, benglish_out, explanation = translate_with_rules(q)

	        # Unknown words are reported as "'word' -> ?"; looking for the whole
	        # marker avoids being fooled by a "?" that is part of the input.
	        if "-> ?" not in explanation:
	            return dialect_out, actual_out, benglish_out, f"🔧 RULE-BASED TRANSLATION:\n{explanation}"

	        # 3) Several phrases separated by punctuation. The split removes the
	        #    punctuation, so the comparison must use normalized phrases too;
	        #    otherwise "Kita kobor? ..." can never match the stored
	        #    "Kita kobor?".
	        potential_phrases = [p.strip() for p in re.split(r'[.,;!?]\s*', q) if p.strip()]
	        if len(potential_phrases) > 1:
	            results = []
	            for phrase in potential_phrases:
	                row = exact_lookup.get(_normalize_phrase(phrase))
	                if row is not None:
	                    _, dialect_bengali, actual, benglish = row
	                    results.append(f"{dialect_bengali} → {actual} → {benglish}")
	                else:
	                    results.append(f"'{phrase}' → No match found")
	            return "", "", "", "Multiple phrases detected:\n\n" + "\n\n".join(results)

	        # 4) Semantic matches: best candidate becomes the primary output.
	        semantic_matches = find_semantic_matches(q)
	        if semantic_matches:
	            semantic_matches.sort(key=lambda x: x[1], reverse=True)
	            top = semantic_matches[:top_k]
	            indices = [idx for idx, _score, _kind in top]
	            scores = [score for _idx, score, _kind in top]
	            suggestions = "🔍 Semantic matches found:\n\n" + format_suggestions_from_indices(indices, "semantic", scores)
	            _, dialect_bengali, actual, benglish = phrases_data[indices[0]]
	            return dialect_bengali, actual, benglish, suggestions

	        # 5) Partial matches: the input contains a known phrase or vice versa.
	        partial_matches = []
	        for i, (dialect, _dialect_bengali, _actual, _benglish) in enumerate(phrases_data):
	            dialect_lower = dialect.lower()
	            if q_lower in dialect_lower or dialect_lower in q_lower:
	                similarity = difflib.SequenceMatcher(None, q_lower, dialect_lower).ratio()
	                partial_matches.append((i, similarity))

	        if partial_matches:
	            partial_matches.sort(key=lambda x: x[1], reverse=True)
	            top = partial_matches[:top_k]
	            indices = [idx for idx, _score in top]
	            scores = [score for _idx, score in top]
	            _, dialect_bengali, actual, benglish = phrases_data[indices[0]]
	            suggestions = "🔍 Partial matches in dialect:\n\n" + format_suggestions_from_indices(indices, "text", scores)
	            return dialect_bengali, actual, benglish, suggestions

	        # 6) Close textual matches using difflib
	        close_matches = difflib.get_close_matches(q_lower, dialects_lower, n=top_k, cutoff=0.3)
	        if close_matches:
	            indices = [dialects_lower.index(m) for m in close_matches]
	            text_sim_scores = [difflib.SequenceMatcher(None, q_lower, m).ratio() for m in close_matches]
	            _, dialect_bengali, actual, benglish = phrases_data[indices[0]]
	            suggestions = "🔍 Similar dialect phrases:\n\n" + format_suggestions_from_indices(indices, "text", text_sim_scores)
	            return dialect_bengali, actual, benglish, suggestions

	        # 7) Rule engine fallback (even with some unknown words)
	        samples = "\n".join(f"• {p[0]}" for p in phrases_data[:5])
	        return dialect_out, actual_out, benglish_out, f"🤖 RULE ENGINE ATTEMPT (some unknown words):\n{explanation}\n\n💡 Try these sample phrases:\n" + samples

	    except Exception as ex:
	        # UI boundary: show the failure in the suggestions box instead of
	        # letting the handler raise.
	        tb = traceback.format_exc()
	        return "", "", "", f"Runtime error:\n{str(ex)}\n\nTraceback:\n{tb}"

	def show_semantic_analysis(user_text):
	    """Describe which semantic patterns occur in the text.

	    Args:
	        user_text: Current content of the input box. ``None`` is treated as
	            empty text (the same guard ``translate_text`` applies) instead of
	            raising ``AttributeError`` on ``.strip()``.

	    Returns:
	        "" for empty or whitespace-only input, a "Detected patterns: ..." line
	        listing ``pattern → meaning`` pairs when something matched, otherwise
	        "No specific patterns detected".
	    """
	    text = user_text or ""
	    if not text.strip():
	        return ""
	    patterns, _meanings = semantic_analysis(text)
	    if patterns:
	        listing = ', '.join(f'{p} → {m}' for p, m, _t in patterns)
	        return f"Detected patterns: {listing}"
	    return "No specific patterns detected"

	# Custom CSS for a softer, less blinding color scheme.
	# This string is handed to gr.Blocks(css=...) when the UI is built below.
	# It sets the page font, gives `.gr-box` elements a light rounded border, and
	# colors `.gr-button` elements green with a darker green on hover.
	css = """
	body {
	font-family: Arial, sans-serif;
	}
	.gr-box {
	border: 1px solid #e0e0e0;
	border-radius: 8px;
	}
	.gr-button {
	background: #4CAF50;
	color: white;
	}
	.gr-button:hover {
	background: #45a049;
	}
	"""

	# Assemble the Gradio interface (Soft theme plus the custom CSS above).
	with gr.Blocks(title="Enhanced Dialect Translator", css=css, theme=gr.themes.Soft()) as demo:
	    gr.Markdown("# 🌍 Dialect Bengali → Actual Bengali → Benglish")
	    gr.Markdown("Type a phrase in your dialect. The app uses both text and semantic matching to find similar phrases.")

	    # The input box is created first because the examples widget refers to it.
	    inp = gr.Textbox(
	        label="Type phrase in Dialect Bengali",
	        placeholder="e.g. Kita kobor? Sob bala asoin ni",
	    )

	    # Left column: clickable examples. Right column: the translate button.
	    with gr.Row():
	        with gr.Column(scale=1):
	            gr.Markdown("### Examples to try:")
	            examples = gr.Examples(
	                examples=["tew", "bala or", "Kita kobor?", "tente", "to", "se hole", "Sob bala asoin ni", "kano tara"],
	                inputs=inp,
	                label="Try these examples",
	            )
	        with gr.Column(scale=2):
	            btn = gr.Button("Translate / Find", variant="primary")

	    # The three translation outputs, side by side.
	    with gr.Row():
	        out_dialect = gr.Textbox(label="Dialect Bengali (Bengali Script)")
	        out_actual = gr.Textbox(label="Actual Bengali (Standard)")
	        out_benglish = gr.Textbox(label="Benglish (Phonetic English)")

	    with gr.Row():
	        semantic_info = gr.Textbox(label="Semantic Analysis", lines=2)

	    suggestions = gr.Textbox(label="Status / Suggestions / Top Candidates", lines=8)

	    # Button click runs the full translation pipeline.
	    btn.click(
	        fn=translate_text,
	        inputs=[inp],
	        outputs=[out_dialect, out_actual, out_benglish, suggestions],
	    )

	    # Every edit of the input refreshes the semantic-analysis box.
	    inp.change(
	        fn=show_semantic_analysis,
	        inputs=[inp],
	        outputs=[semantic_info],
	    )

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()