# NOTE: the next six lines are Hugging Face file-viewer residue, not program source.
# anisgtboi's picture
# Update app.py
# 0afbfd4 verified
# raw
# history blame
# 23.4 kB
# app.py
"""
Enhanced Dialect Bengali Translator with Semantic Search
Uses both text similarity and semantic pattern matching
Updated to include new dialect patterns and polite/negative 'des/dis' behavior
"""
import difflib
import traceback
import gradio as gr
from collections import defaultdict
import re
# === Phrase data ===
# Each row is [Dialect Latin, Dialect Bengali Script, Actual Bengali (Std), Benglish].
# Column 0 is what users type and what every matcher compares against; the
# other three columns are what the UI displays for a matched row.
phrases_data = [
    # Questions / common
    ["gesle ni", "গেসলে নি", "গিয়েছিলে কি?", "giese chile ki?"],
    ["oislo ni", "ওইস্লো নি", "হয়েছে কি?", "hoyeche ki?"],
    ["oigese ni", "ওইগেসে নি", "হয়ে গেছে কি?", "hoyegese ki?"],
    ["oise", "ওইসে", "হয়েছে", "hoyeche"],
    ["bala oise", "বালা ওইসে", "ভালো হয়েছে", "bhalo hoyeche"],
    ["kub bala oise", "কুব বালা ওইসে", "অনেক ভালো হয়েছে", "onek bhalo hoyeche"],
    ["oise jen", "ওইসে জেন", "হয়েছিল যে", "hoyechilo je"],
    ["jaite ni", "জাইতে নি", "যাবে কি?", "jabe ki?"],
    ["or ni", "ওর নি", "হচ্ছে কি?", "hocche ki?"],
    ["or", "ওর", "হচ্ছে", "hocche"],
    ["bala or", "বালা ওর", "ভালো হচ্ছে", "bhalo hocche"],
    ["bala ni", "বালা নি", "ভালো কি?", "bhalo ki?"],
    ["or je", "ওর যে", "হচ্ছে যে", "hocche je"],
    ["jaibe ni", "জাইবে নি", "যাবে কি?", "jabe ki?"],
    ["jare ni", "জারে নি", "যাচ্ছো কি?", "jaccho ki?"],
    ["Kita kobor?", "কিতা খবর?", "কি খবর?", "ki khobor?"],
    ["Kita korde?", "কিতা কোর্দে?", "কি করছে?", "ki korchho?"],
    ["acha oibo-tik ase", "আচা ওইবো-তিক আসে", "ঠিক আছে", "thik ache"],
    ["tew", "তেও", "তাহলে", "tahole"],
    ["tente", "তেনতে", "তাহলে", "tahole"],
    ["to", "তো", "তাহলে", "tahole"],
    ["se hole", "সে হলে", "তাহলে", "tahole"],
    ["Sob bala asoin ni", "সব বালা আসইন নি", "সব ভালো আছে কি?", "sob bhalo ache ki?"],
    ["Sob bala ase", "সব বালা আসে", "সব ভালো আছে", "sob bhalo ache"],
    ["asoini", "আসইনি", "আছে কি?", "ache ki?"],
    ["ase", "আসে", "আছে", "ache"],
    # Future / Present / Past core verbs (ja / de / fawa / ka)
    ["jaimu", "জাইমু", "যাব", "jabo"],
    ["jaibay", "জাইবে", "তুমি যাবে", "tumi jabe (dialect)"],
    # Was written with Devanagari letters mixed into the Bengali word.
    ["jaibe", "জাইবে", "তুমি যাবে (friend)", "tumi jabe (friend form)"],
    ["jaibo", "জাইবো", "যাবে", "jabe"],
    ["jaiba", "জাইবা", "তারা যাবে", "tara jabe"],
    ["oibo", "ওইবো", "হবে", "hobe"],
    ["oibo jen", "ওইবো জেন", "হবে যে", "hobe je"],
    ["ami jaimu", "আমি জাইমু", "আমি যাব", "ami jabo"],
    ["Ami bazaro jaimu", "আমি বাজারো জাইমু", "আমি বাজারে যাব", "ami bazar e jabo"],
    ["He rit aise", "হে রিত আসে", "সে রাতে এসেছে", "se rate esheche"],
    # Give (de) family
    ["des", "দেস", "দাও (মৃদু)", "des (give, friendly)"],
    ["des na", "দেস না", "দাও (দয়া করে, মৃদু অনুরোধ)", "des na (please give)"],
    ["dis", "দিস", "না দাও / নিষেধ", "dis (don't give)"],
    ["dis na", "দিস না", "দেও না", "dis na (don't give)"],
    ["dilaisi", "দিলাইসি", "দিয়েছি", "diyechi (I gave)"],
    ["dilaise", "দিলাইসে", "দিয়েছে", "diyeche (he gave)"],
    ["dilaisoin", "দিলাইসইন", "দিয়েছেন (সম্মানভাষা)", "diyechen (honorific)"],
    ["dise na", "দিসে না", "দেয়নি", "deni (didn't give)"],
    ["dibo", "দিবো", "দেব", "debo (will give)"],
    ["der amare", "দের আমিরে", "সে আমাকে দেয়", "se amake dey"],
    ["dibo amare", "দিবো আমিরে", "সে আমাকে দেবে", "se amake debe"],
    # Get / receive (fawa) family
    ["faisi", "ফাইসি", "পেয়েছি", "peyechi (I got)"],
    ["faisi na", "ফাইসি না", "পাইনি", "pelam na (didn't get)"],
    ["faisot ni", "ফাইসোট নি", "পেলে কি?", "pele ki?"],
    ["faislo", "ফাইসলো", "পেয়ে গেল/লাভ করল (3sg past)", "pelo (he got)"],
    ["faislam", "ফাইসলাম", "পেয়েছিলাম", "pelam (I got past)"],
    ["faisla", "ফাইসলা", "পেয়েছিল (they)", "pela (they got)"],
    ["faisly", "ফাইসলাই", "তুমি পেয়েছ", "tumi pele (you got)"],
    ["faimu", "ফাইমু", "পাব", "pabo (I will get)"],
    ["faibay", "ফাইবে", "তুমি পাবে (dialect)", "tumi pabe"],
    # Was written with Devanagari letters mixed into both Bengali columns.
    ["faibe", "ফাইবে", "তুমি পাবে (friend)", "tumi pabe (friend)"],
    ["faibo", "ফাইবো", "সে পাবে", "se pabe"],
    ["faiba", "ফাইবা", "তারা পাবে", "tara pabe"],
    # Eat (ka) family
    ["kaimu", "কাইমু", "খাব", "khaimu (I will eat)"],
    # Standard form aligned with the Benglish column ("tumi khabe") and with
    # the parallel jaibay/faibay rows; it previously read "তুমি খাব".
    ["kaibay", "কাইবে", "তুমি খাবে (dialect)", "tumi khabe"],
    ["kaibe", "কাইবে", "তুমি খাবে (friend)", "tumi khabe (friend)"],
    ["kaibo", "কাইবো", "সে খাবে", "se khabe"],
    ["kaiba", "কাইবা", "তারা খাবে", "tara khabe"],
    # Other sample sentences from user's corpus
    ["Ami faisi ekta notun jinish", "আমি ফাইসি একটা নতুন জিনিস", "আমি একটা নতুন জিনিস পেয়েছি", "ami ekta notun jinish peyechi"],
    ["Tumi taka faiso ni", "তুমি টাকা ফাইসো নি", "তুমি টাকা পেয়েছ কি?", "tumi taka peyecho ki?"],
    ["He sobsomoy amare teka dey", "হে সবসময় আমিারে তেকা দেয়", "সে সবসময় আমাকে টাকা দেয়", "se shobshomoy amake taka dey"],
    # NOTE(review): identical to the "Ami bazaro jaimu" row in the core-verbs
    # section above, so it can appear twice in suggestion lists. Kept because
    # list positions are used as row indices; confirm before removing.
    ["Ami bazaro jaimu", "আমি বাজারো জাইমু", "আমি বাজারে যাব", "ami bazar e jabo"],
    ["Tara bazaro bohut jinish faisoin", "তারা বাজারো বহুত জিনিস ফাইসইন", "তারা বাজারে অনেক জিনিস পেয়েছে", "tara bazar e onek jinish peyechhe"],
    ["Tumi boi diso ni", "তুমি বই দিসো নি", "আপনি কি বই দিয়েছেন?", "apni boi diyechen?"],
    ["Tuin boi disot ni", "তুইন বই দিসট নি", "তুই বই দিয়েছ কি?", "tui boi diyechish?"],
    ["Bifodo asi", "বিফোডো আছি", "বিপদে আছি", "bipode achi"],
    ["Kotobil bade fawa gese", "কোটবিল বাদে ফাওয়া গেসে", "অনেকদিন পরে পেয়েছি", "got after long time got"],
]
# =============================================================================
# NEW RULE-BASED TRANSLATION ENGINE
# =============================================================================
# 1. DIALECT LEXICON (a dictionary for unique words)
# Maps a romanized dialect word to the romanized ("Benglish") spelling of its
# standard Bengali equivalent. Keys are lower-case; lookups lower-case the
# input first.
dialect_lexicon = {
    'fua': 'chele',
    'furi': 'meye',
    'tara': 'tara',  # It's the same!
    'ora': 'tara',
    'beta': 'purush',
    'sogra': 'chele',
    'bakka': 'pagol',
    'kun': 'ke',  # who
    'ki': 'ki',  # what
    'kano': 'kothay',  # where
    'kuno': 'kothay',  # where (alternative spelling)
    'jaibo': 'jabe',
    'oibo': 'hobe',
    'dibo': 'debe',
    'lowna': 'nen',  # you take (respectful)
    'koin': 'bolun',  # you say (respectful)
    'disot': 'diyechile',  # you gave (question form)
    'faisot': 'peyechile',  # you got (question form)
    'ase': 'ache',
    'or': 'hocche',
    'oise': 'hoyeche',
    'bala': 'bhalo',
    'kub': 'onek',
    'tik': 'thik',
    'acha': 'thik',
    'jen': 'je',
    'ni': 'ki',
    'kobor': 'khobor',
    'korde': 'korchho',
    'gesle': 'giyechile',
    'oislo': 'hoyeche',
    'oigese': 'hoyegese',
    'jaimu': 'jabo',
    'jaibay': 'jabe',
    'jaibe': 'jabe',
    'jaiba': 'jabe',
    'des': 'dao',
    'dis': 'na dio',
    'dilaisi': 'diyechi',
    'dilaise': 'diyeche',
    'dilaisoin': 'diyechen',
    'dise': 'dey',
    'faisi': 'peyechi',
    'faislo': 'pelo',
    'faislam': 'pelam',
    'faisla': 'pela',
    'faimu': 'pabo',
    'faibay': 'pabe',
    'faibe': 'pabe',
    'faibo': 'pabe',
    'faiba': 'pabe',
    'kaimu': 'khabo',
    'kaibay': 'khabe',
    'kaibe': 'khabe',
    'kaibo': 'khabe',
    'kaiba': 'khabe',
}

# Characters removed from the edges of a token before the lexicon lookup, so
# that "or?" or "bala," still resolve to their lexicon entries.
_EDGE_PUNCTUATION = ".,;:!?\"'()[]{}…।"


# 2. THE TRANSLATION FUNCTION
def translate_with_rules(user_input):
    """
    Translate a sentence word by word through ``dialect_lexicon``.

    Each whitespace-separated token is lower-cased and stripped of edge
    punctuation before the lookup; the punctuation is re-attached to the
    translated word. Tokens with no lexicon entry are passed through
    unchanged (they may be proper nouns).

    Args:
        user_input: The dialect sentence in Latin script. ``None`` is treated
            as an empty sentence.

    Returns:
        A 4-tuple ``(dialect, standard, benglish, explanation)``:
          * dialect: ``user_input`` exactly as given.
          * standard: the translated sentence. The lexicon only stores
            romanized words, so this is romanized text, not Bengali script.
          * benglish: the same romanized sentence as ``standard``.
          * explanation: ``"'word' -> 'translation'"`` entries joined with
            ``" | "``. An unknown token is reported as ``"'token' -> ?"``;
            callers detect unknown words by looking for that ``?``.
    """
    translated_std_words = []  # translated (or passed-through) tokens
    explanation = []  # one entry per token

    for token in (user_input or "").lower().split():
        # Split the token into leading punctuation, core word and trailing
        # punctuation so that only the core is looked up.
        without_lead = token.lstrip(_EDGE_PUNCTUATION)
        lead = token[:len(token) - len(without_lead)]
        core = without_lead.rstrip(_EDGE_PUNCTUATION)
        trail = without_lead[len(core):]

        std_word = dialect_lexicon.get(core)
        if std_word is not None:
            translated_std_words.append(f"{lead}{std_word}{trail}")
            explanation.append(f"'{core}' -> '{std_word}'")
        else:
            # Unknown word (or a token made only of punctuation): keep it
            # verbatim and flag it so callers know the result is partial.
            translated_std_words.append(token)
            explanation.append(f"'{token}' -> ?")

    standard_sentence = " ".join(translated_std_words)
    # The user typed the dialect form, so it is returned untouched.
    dialect_sentence = user_input
    # The lexicon output already is the romanized standard sentence, which is
    # what the Benglish column holds elsewhere in this file; previously the
    # raw dialect input was returned here.
    benglish_sentence = standard_sentence
    return dialect_sentence, standard_sentence, benglish_sentence, " | ".join(explanation)
# =============================================================================
# END OF NEW TRANSLATION ENGINE
# =============================================================================
# Semantic mapping of dialect patterns to meanings + types.
# Keys are regular expressions searched in the LOWER-CASED user input, so
# every literal in a pattern must itself be lower-case. Values carry the
# standard-Bengali meaning and a coarse grammatical type.
semantic_patterns = {
    # question/particles
    r"\bni\b": {"meaning": "কি", "type": "question"},
    # NOTE(review): a sentence-final "ni" is also matched by the pattern
    # above, so it is reported (and scored) twice -- confirm this double
    # weighting is intended.
    r"\bni\b$": {"meaning": "কি", "type": "question"},
    # verbs / roots
    r"\bor\b": {"meaning": "হচ্ছে", "type": "verb"},
    r"\boise\b": {"meaning": "হয়েছে", "type": "verb"},
    r"\boibo\b": {"meaning": "হবে", "type": "verb"},
    r"\bjaimu\b": {"meaning": "যাব", "type": "verb"},
    r"\bjaib[aey]\b": {"meaning": "যাবে", "type": "verb"},
    r"\bkobor\b": {"meaning": "খবর", "type": "noun"},
    r"\bkorde\b": {"meaning": "করছে", "type": "verb"},
    r"\bacha\b": {"meaning": "ঠিক", "type": "adjective"},
    r"\bbala\b": {"meaning": "ভালো", "type": "adjective"},
    r"\bkub\b": {"meaning": "অনেক", "type": "adverb"},
    r"\bgesle\b": {"meaning": "গিয়েছিলে", "type": "verb"},
    r"\boislo\b": {"meaning": "হয়েছে", "type": "verb"},
    r"\boigese\b": {"meaning": "হয়েগেছে", "type": "verb"},
    r"\bjen\b": {"meaning": "যে", "type": "conjunction"},
    r"\bje\b": {"meaning": "যে", "type": "conjunction"},
    r"\btik\b": {"meaning": "ঠিক", "type": "adjective"},
    r"\base\b": {"meaning": "আছে", "type": "verb"},
    r"\basoin\b": {"meaning": "আছে", "type": "verb"},
    r"\basoini\b": {"meaning": "আছে কি", "type": "verb+question"},
    # Was r"\bGoto\b": the capital G could never match lower-cased input.
    r"\bgoto\b": {"meaning": "গত", "type": "adjective"},
    r"\bkali\b": {"meaning": "কাল", "type": "noun"},
    r"\bkita\b": {"meaning": "কি", "type": "question"},
    r"\btew\b": {"meaning": "তাহলে", "type": "conjunction"},
    # give/get polarity (important dialect contrast)
    r"\bdes\b": {"meaning": "দান/দাও (বন্ধু-মৃদু)", "type": "give_positive"},
    r"\bdes\s+na\b": {"meaning": "মৃদু অনুরোধ: দাও", "type": "give_positive"},
    r"\bdis\b": {"meaning": "না দাও / নিষেধ", "type": "give_negative"},
    r"\bdis\s+na\b": {"meaning": "না দাও (নিষেধ)", "type": "give_negative"},
    # fawa/get variants
    r"\bfaisi\b": {"meaning": "পেয়েছি", "type": "verb"},
    r"\bfaisl[ao]m\b": {"meaning": "পেয়েছিলাম/পেয়েছি(past)", "type": "verb"},
    r"\bfaimu\b": {"meaning": "পাব", "type": "verb"},
    r"\bfaib[ae]y?\b": {"meaning": "পাবে", "type": "verb"},
    # future pattern markers
    # NOTE(review): the leading \b means these only match "mu"/"bay"/"bo"/"ba"
    # as standalone words, not as verb endings (e.g. in "jaimu") -- confirm
    # whether suffix matching was intended.
    r"\bmu\b": {"meaning": "ভবিষ্যৎ: 1sg", "type": "tense_future"},
    r"\bbay\b": {"meaning": "ভবিষ্যৎ: 2sg (tumi)", "type": "tense_future"},
    r"\bbo\b": {"meaning": "ভবিষ্যৎ: 3sg", "type": "tense_future"},
    r"\bba\b": {"meaning": "ভবিষ্যৎ: plural/3pl", "type": "tense_future"},
}
# Lookup tables derived once from phrases_data, in row order.
dialects = [row[0] for row in phrases_data]
dialects_lower = list(map(str.lower, dialects))
actual_bengali_list = [row[2] for row in phrases_data]
# Lower-cased dialect phrase -> its complete four-column row. When a phrase
# occurs more than once, the last row wins.
dialect_to_all = dict(zip(dialects_lower, phrases_data))
def semantic_analysis(user_input):
    """
    Find which entries of ``semantic_patterns`` occur in the input.

    Matching is case-insensitive: the input is lower-cased and the patterns
    are additionally searched with ``re.IGNORECASE``, so a pattern written
    with capital letters still matches.

    Args:
        user_input: Text typed by the user; ``None`` is treated as empty.

    Returns:
        ``(detected_patterns, meaning_components)`` where
        ``detected_patterns`` is a list of ``(pattern, meaning, type)``
        tuples in ``semantic_patterns`` order and ``meaning_components`` is
        the list of the corresponding meanings (one per matched pattern, so
        a meaning can repeat).
    """
    user_lower = (user_input or "").lower()
    detected_patterns = []
    meaning_components = []
    for pattern, info in semantic_patterns.items():
        try:
            found = re.search(pattern, user_lower, re.IGNORECASE)
        except re.error:
            # A malformed pattern must not break the whole analysis.
            continue
        if found:
            detected_patterns.append((pattern, info["meaning"], info["type"]))
            meaning_components.append(info["meaning"])
    return detected_patterns, meaning_components
def find_semantic_matches(user_input, threshold=0.35):
    """
    Score every phrase row by shared meaning plus textual similarity.

    For each meaning detected in the input (see ``semantic_analysis``) a row
    gains 0.35 if the meaning occurs in its standard-Bengali column and 0.25
    if it occurs in its dialect Bengali-script column. Half of the difflib
    similarity between the input and the row's Latin dialect phrase is then
    added.

    Args:
        user_input: Text typed by the user.
        threshold: A row is returned only if its total score is strictly
            greater than this value.

    Returns:
        A list of ``(row_index, score, "semantic")`` tuples in row order
        (unsorted). Empty when no semantic pattern was detected in the input.
    """
    user_lower = user_input.lower()
    matches = []
    detected_patterns, meaning_components = semantic_analysis(user_input)
    if not meaning_components:
        return matches

    for i, (dialect, dialect_bengali, actual, benglish) in enumerate(phrases_data):
        match_score = 0.0
        for meaning in meaning_components:
            if meaning in actual:
                match_score += 0.35
            # Meanings are Bengali script, so they are compared with the
            # Bengali-script dialect column. The previous comparison against
            # the Latin-script phrase could never succeed.
            if meaning in dialect_bengali:
                match_score += 0.25
        # Text similarity between the user's input and the Latin dialect form.
        text_similarity = difflib.SequenceMatcher(None, user_lower, dialect.lower()).ratio()
        total_score = match_score + (text_similarity * 0.5)
        if total_score > threshold:
            matches.append((i, total_score, "semantic"))
    return matches
def format_suggestions_from_indices(indices, match_type="text", scores=None):
    """
    Render phrase rows as a bulleted, blank-line-separated text block.

    Args:
        indices: Row indices into ``phrases_data``, in display order.
        match_type: Label placed in front of "-match" in the score tag.
        scores: Optional scores parallel to ``indices``. Each is shown as a
            truncated integer percentage; positions beyond its length get no
            score tag.

    Returns:
        One four-line entry per index, joined by blank lines.
    """
    have_scores = scores is not None
    entries = []
    for position, row_index in enumerate(indices):
        latin, script, standard, romanized = phrases_data[row_index]
        if have_scores and position < len(scores):
            percent = int(scores[position] * 100)
            tag = f" ({match_type}-match: {percent}%)"
        else:
            tag = ""
        entries.append(
            f"• {latin}{tag}\n Dialect Bengali: {script}\n Actual Bengali: {standard}\n Benglish: {romanized}"
        )
    return "\n\n".join(entries)
def translate_text(user_text, top_k: int = 6):
    """
    Translate a dialect phrase, trying progressively looser strategies.

    The first strategy that yields a result wins:
      1. case-insensitive exact match against ``phrases_data``;
      2. the word-by-word rule engine, accepted only if every word was known;
      3. punctuation-separated input: exact lookup of each part;
      4. semantic pattern matches (``find_semantic_matches``);
      5. substring matches in either direction;
      6. difflib close matches;
      7. the rule-engine result even though some words were unknown.

    Args:
        user_text: Text typed by the user; ``None`` or blank yields a prompt.
        top_k: Maximum number of candidates listed in the suggestions.

    Returns:
        ``(dialect_out, actual_out, benglish_out, suggestions_out)``. The
        first three are the best translation (empty for strategy 3 and on
        error); the last is a status message or candidate list. Exceptions
        are not raised: they are reported in ``suggestions_out``.
    """
    try:
        q = (user_text or "").strip()
        if not q:
            return "", "", "", "Please enter a phrase or question."
        q_lower = q.lower()

        # 1) Exact match (case-insensitive)
        for dialect, dialect_bengali, actual, benglish in phrases_data:
            if q_lower == dialect.lower():
                return dialect_bengali, actual, benglish, "✅ EXACT MATCH (100%)"

        # 2) Rule engine. It marks every unknown word with "-> ?", and known
        # words never contain "?", so a "?"-free explanation means the whole
        # sentence was translated.
        dialect_out, actual_out, benglish_out, explanation = translate_with_rules(q)
        if "?" not in explanation:
            return dialect_out, actual_out, benglish_out, f"🔧 RULE-BASED TRANSLATION:\n{explanation}"

        # 3) Several phrases separated by punctuation: look each one up.
        potential_phrases = [p.strip() for p in re.split(r'[.,;!?]\s*', q) if p.strip()]
        if len(potential_phrases) > 1:
            results = []
            for phrase in potential_phrases:
                phrase_lower = phrase.lower()
                for d, dialect_bengali, actual, benglish in phrases_data:
                    if phrase_lower == d.lower():
                        # Separators keep the three forms readable; they used
                        # to be concatenated with nothing between them.
                        results.append(f"'{phrase}' → {dialect_bengali} | {actual} | {benglish}")
                        break
                else:
                    results.append(f"'{phrase}' → No match found")
            return "", "", "", "Multiple phrases detected:\n\n" + "\n\n".join(results)

        # 4) Semantic matches, best score first
        semantic_matches = find_semantic_matches(q)
        if semantic_matches:
            semantic_matches.sort(key=lambda x: x[1], reverse=True)
            indices = [idx for idx, score, mt in semantic_matches[:top_k]]
            scores = [score for idx, score, mt in semantic_matches[:top_k]]
            suggestions = "🔍 Semantic matches found:\n\n" + format_suggestions_from_indices(indices, "semantic", scores)
            # The best candidate becomes the primary output.
            best_idx = indices[0]
            d, dialect_bengali, actual, benglish = phrases_data[best_idx]
            return dialect_bengali, actual, benglish, suggestions

        # 5) Partial matches: the input contains a phrase or vice versa
        partial_matches = []
        for i, (dialect, dialect_bengali, actual, benglish) in enumerate(phrases_data):
            dialect_lower = dialect.lower()
            if q_lower in dialect_lower or dialect_lower in q_lower:
                similarity = difflib.SequenceMatcher(None, q_lower, dialect_lower).ratio()
                partial_matches.append((i, similarity))
        if partial_matches:
            partial_matches.sort(key=lambda x: x[1], reverse=True)
            indices = [idx for idx, score in partial_matches[:top_k]]
            scores = [score for idx, score in partial_matches[:top_k]]
            best_idx = indices[0]
            d, dialect_bengali, actual, benglish = phrases_data[best_idx]
            suggestions = "🔍 Partial matches in dialect:\n\n" + format_suggestions_from_indices(indices, "text", scores)
            return dialect_bengali, actual, benglish, suggestions

        # 6) Close textual matches using difflib. The phrase list can contain
        # the same string twice and .index() maps both to the first row, so
        # repeated strings are dropped (order kept) to avoid listing a row
        # twice.
        close_matches = list(dict.fromkeys(
            difflib.get_close_matches(q_lower, dialects_lower, n=top_k, cutoff=0.3)
        ))
        if close_matches:
            indices = [dialects_lower.index(m) for m in close_matches]
            text_sim_scores = [difflib.SequenceMatcher(None, q_lower, m).ratio() for m in close_matches]
            best_idx = indices[0]
            d, dialect_bengali, actual, benglish = phrases_data[best_idx]
            suggestions = "🔍 Similar dialect phrases:\n\n" + format_suggestions_from_indices(indices, "text", text_sim_scores)
            return dialect_bengali, actual, benglish, suggestions

        # 7) Rule engine fallback (even with some unknown words)
        return dialect_out, actual_out, benglish_out, f"🤖 RULE ENGINE ATTEMPT (some unknown words):\n{explanation}\n\n💡 Try these sample phrases:\n" + "\n".join([f"• {p[0]}" for p in phrases_data[:5]])
    except Exception as ex:
        # UI boundary: report the failure in the status box instead of
        # letting the exception escape to the caller.
        tb = traceback.format_exc()
        return "", "", "", f"Runtime error:\n{str(ex)}\n\nTraceback:\n{tb}"
def show_semantic_analysis(user_text):
    """
    Summarise the semantic patterns detected in the input as one line.

    Args:
        user_text: Text typed by the user; ``None`` or blank returns ``""``.

    Returns:
        ``"Detected patterns: <pattern> → <meaning>, ..."`` when at least one
        pattern matched, otherwise ``"No specific patterns detected"``.
    """
    if not (user_text or "").strip():
        return ""
    patterns, meanings = semantic_analysis(user_text)
    if patterns:
        # An arrow separates each pattern from its meaning; the two used to
        # be printed run together.
        return f"Detected patterns: {', '.join([f'{p} → {m}' for p, m, t in patterns])}"
    return "No specific patterns detected"
# Stylesheet passed to gr.Blocks: a softer, less blinding colour scheme with
# bordered boxes and green buttons.
css = (
    "\n"
    "body {\n"
    "font-family: Arial, sans-serif;\n"
    "}\n"
    ".gr-box {\n"
    "border: 1px solid #e0e0e0;\n"
    "border-radius: 8px;\n"
    "}\n"
    ".gr-button {\n"
    "background: #4CAF50;\n"
    "color: white;\n"
    "}\n"
    ".gr-button:hover {\n"
    "background: #45a049;\n"
    "}\n"
)
# Build Gradio UI with a softer theme.
# Components are created in display order inside the Blocks context, so the
# order of the statements below determines the page layout.
# NOTE(review): the nesting of the rows/columns is reconstructed from syntax
# (each `with` needs a body) -- confirm against the deployed layout.
with gr.Blocks(title="Enhanced Dialect Translator", css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌍 Dialect Bengali → Actual Bengali → Benglish")
    gr.Markdown("Type a phrase in your dialect. The app uses both text and semantic matching to find similar phrases.")
    # Define input component first: the examples widget and both event
    # handlers below reference it.
    inp = gr.Textbox(label="Type phrase in Dialect Bengali", placeholder="e.g. Kita kobor? Sob bala asoin ni")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Examples to try:")
            # Example phrases offered for the input box (`inputs=inp`).
            examples = gr.Examples(
                examples=["tew", "bala or", "Kita kobor?", "tente", "to", "se hole", "Sob bala asoin ni", "kano tara"],
                inputs=inp,
                label="Try these examples"
            )
        with gr.Column(scale=2):
            btn = gr.Button("Translate / Find", variant="primary")
    # The three output boxes receive the first three values returned by
    # translate_text.
    with gr.Row():
        out_dialect = gr.Textbox(label="Dialect Bengali (Bengali Script)")
        out_actual = gr.Textbox(label="Actual Bengali (Standard)")
        out_benglish = gr.Textbox(label="Benglish (Phonetic English)")
    with gr.Row():
        semantic_info = gr.Textbox(label="Semantic Analysis", lines=2)
        suggestions = gr.Textbox(label="Status / Suggestions / Top Candidates", lines=8)
    # Set up event handlers
    # Button click: run the translation; the four returned values go to the
    # four outputs in this order.
    btn.click(
        fn=translate_text,
        inputs=[inp],
        outputs=[out_dialect, out_actual, out_benglish, suggestions]
    )
    # Input change: refresh the semantic-analysis box.
    inp.change(
        fn=show_semantic_analysis,
        inputs=[inp],
        outputs=[semantic_info]
    )
# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()