# app.py
"""
Enhanced Dialect Bengali Translator with Semantic Search.

Uses both text similarity and semantic pattern matching.
Updated to include new dialect patterns and polite/negative 'des/dis' behavior.
"""
import difflib
import re
import string
import traceback
from collections import defaultdict

import gradio as gr

# === Phrase data: [Dialect Latin, Dialect Bengali Script, Actual Bengali (Std), Benglish] ===
phrases_data = [
    # Questions / common
    ["gesle ni", "গেসলে নি", "গিয়েছিলে কি?", "giese chile ki?"],
    ["oislo ni", "ওইস্লো নি", "হয়েছে কি?", "hoyeche ki?"],
    ["oigese ni", "ওইগেসে নি", "হয়ে গেছে কি?", "hoyegese ki?"],
    ["oise", "ওইসে", "হয়েছে", "hoyeche"],
    ["bala oise", "বালা ওইসে", "ভালো হয়েছে", "bhalo hoyeche"],
    ["kub bala oise", "কুব বালা ওইসে", "অনেক ভালো হয়েছে", "onek bhalo hoyeche"],
    ["oise jen", "ওইসে জেন", "হয়েছিল যে", "hoyechilo je"],
    ["jaite ni", "জাইতে নি", "যাবে কি?", "jabe ki?"],
    ["or ni", "ওর নি", "হচ্ছে কি?", "hocche ki?"],
    ["or", "ওর", "হচ্ছে", "hocche"],
    ["bala or", "বালা ওর", "ভালো হচ্ছে", "bhalo hocche"],
    ["bala ni", "বালা নি", "ভালো কি?", "bhalo ki?"],
    ["or je", "ওর যে", "হচ্ছে যে", "hocche je"],
    ["jaibe ni", "জাইবে নি", "যাবে কি?", "jabe ki?"],
    ["jare ni", "জারে নি", "যাচ্ছো কি?", "jaccho ki?"],
    ["Kita kobor?", "কিতা খবর?", "কি খবর?", "ki khobor?"],
    ["Kita korde?", "কিতা কোর্দে?", "কি করছে?", "ki korchho?"],
    ["acha oibo-tik ase", "আচা ওইবো-তিক আসে", "ঠিক আছে", "thik ache"],
    ["tew", "তেও", "তাহলে", "tahole"],
    ["tente", "তেনতে", "তাহলে", "tahole"],
    ["to", "তো", "তাহলে", "tahole"],
    ["se hole", "সে হলে", "তাহলে", "tahole"],
    ["Sob bala asoin ni", "সব বালা আসইন নি", "সব ভালো আছে কি?", "sob bhalo ache ki?"],
    ["Sob bala ase", "সব বালা আসে", "সব ভালো আছে", "sob bhalo ache"],
    ["asoini", "আসইনি", "আছে কি?", "ache ki?"],
    ["ase", "আসে", "আছে", "ache"],

    # Future / Present / Past core verbs (ja / de / fawa / ka)
    ["jaimu", "জাইমু", "যাব", "jabo"],
    ["jaibay", "জাইবে", "তুমি যাবে", "tumi jabe (dialect)"],
    ["jaibe", "জাইবে", "তুমি যাবে (friend)", "tumi jabe (friend form)"],
    ["jaibo", "জাইবো", "যাবে", "jabe"],
    ["jaiba", "জাইবা", "তারা যাবে", "tara jabe"],
    ["oibo", "ওইবো", "হবে", "hobe"],
    ["oibo jen", "ওইবো জেন", "হবে যে", "hobe je"],
    ["ami jaimu", "আমি জাইমু", "আমি যাব", "ami jabo"],
    ["Ami bazaro jaimu", "আমি বাজারো জাইমু", "আমি বাজারে যাব", "ami bazar e jabo"],
    ["He rit aise", "হে রিত আসে", "সে রাতে এসেছে", "se rate esheche"],

    # Give (de) family
    ["des", "দেস", "দাও (মৃদু)", "des (give, friendly)"],
    ["des na", "দেস না", "দাও (দয়া করে, মৃদু অনুরোধ)", "des na (please give)"],
    ["dis", "দিস", "না দাও / নিষেধ", "dis (don't give)"],
    ["dis na", "দিস না", "দেও না", "dis na (don't give)"],
    ["dilaisi", "দিলাইসি", "দিয়েছি", "diyechi (I gave)"],
    ["dilaise", "দিলাইসে", "দিয়েছে", "diyeche (he gave)"],
    ["dilaisoin", "দিলাইসইন", "দিয়েছেন (সম্মানভাষা)", "diyechen (honorific)"],
    ["dise na", "দিসে না", "দেয়নি", "deni (didn't give)"],
    ["dibo", "দিবো", "দেব", "debo (will give)"],
    ["der amare", "দের আমিরে", "সে আমাকে দেয়", "se amake dey"],
    ["dibo amare", "দিবো আমিরে", "সে আমাকে দেবে", "se amake debe"],

    # Get / receive (fawa) family
    ["faisi", "ফাইসি", "পেয়েছি", "peyechi (I got)"],
    ["faisi na", "ফাইসি না", "পাইনি", "pelam na (didn't get)"],
    ["faisot ni", "ফাইসোট নি", "পেলে কি?", "pele ki?"],
    ["faislo", "ফাইসলো", "পেয়ে গেল/লাভ করল (3sg past)", "pelo (he got)"],
    ["faislam", "ফাইসলাম", "পেয়েছিলাম", "pelam (I got past)"],
    ["faisla", "ফাইসলা", "পেয়েছিল (they)", "pela (they got)"],
    ["faisly", "ফাইসলাই", "তুমি পেয়েছ", "tumi pele (you got)"],
    ["faimu", "ফাইমু", "পাব", "pabo (I will get)"],
    ["faibay", "ফাইবে", "তুমি পাবে (dialect)", "tumi pabe"],
    ["faibe", "ফাইবে", "তুমি পাবে (friend)", "tumi pabe (friend)"],
    ["faibo", "ফাইবো", "সে পাবে", "se pabe"],
    ["faiba", "ফাইবা", "তারা পাবে", "tara pabe"],

    # Eat (ka) family
    ["kaimu", "কাইমু", "খাব", "khaimu (I will eat)"],
    ["kaibay", "কাইবে", "তুমি খাবে (dialect)", "tumi khabe"],
    ["kaibe", "কাইবে", "তুমি খাবে (friend)", "tumi khabe (friend)"],
    ["kaibo", "কাইবো", "সে খাবে", "se khabe"],
    ["kaiba", "কাইবা", "তারা খাবে", "tara khabe"],

    # Other sample sentences from user's corpus
    ["Ami faisi ekta notun jinish", "আমি ফাইসি একটা নতুন জিনিস", "আমি একটা নতুন জিনিস পেয়েছি", "ami ekta notun jinish peyechi"],
    ["Tumi taka faiso ni", "তুমি টাকা ফাইসো নি", "তুমি টাকা পেয়েছ কি?", "tumi taka peyecho ki?"],
    ["He sobsomoy amare teka dey", "হে সবসময় আমিারে তেকা দেয়", "সে সবসময় আমাকে টাকা দেয়", "se shobshomoy amake taka dey"],
    ["Tara bazaro bohut jinish faisoin", "তারা বাজারো বহুত জিনিস ফাইসইন", "তারা বাজারে অনেক জিনিস পেয়েছে", "tara bazar e onek jinish peyechhe"],
    ["Tumi boi diso ni", "তুমি বই দিসো নি", "আপনি কি বই দিয়েছেন?", "apni boi diyechen?"],
    ["Tuin boi disot ni", "তুইন বই দিসট নি", "তুই বই দিয়েছ কি?", "tui boi diyechish?"],
    ["Bifodo asi", "বিফোডো আছি", "বিপদে আছি", "bipode achi"],
    ["Kotobil bade fawa gese", "কোটবিল বাদে ফাওয়া গেসে", "অনেকদিন পরে পেয়েছি", "got after long time got"],
]

# =============================================================================
# NEW RULE-BASED TRANSLATION ENGINE
# =============================================================================

# 1. DIALECT LEXICON (A dictionary for unique words)
# Maps a romanised dialect word to its romanised standard Bengali equivalent.
dialect_lexicon = {
    'fua': 'chele',
    'furi': 'meye',
    'tara': 'tara',  # identical in both varieties
    'ora': 'tara',
    'beta': 'purush',
    'sogra': 'chele',
    'bakka': 'pagol',
    'kun': 'ke',  # who
    'ki': 'ki',  # what
    'kano': 'kothay',  # where
    'kuno': 'kothay',  # where (alternative spelling)
    'jaibo': 'jabe',
    'oibo': 'hobe',
    'dibo': 'debe',
    'lowna': 'nen',  # you take (respectful)
    'koin': 'bolun',  # you say (respectful)
    'disot': 'diyechile',  # you gave (question form)
    'faisot': 'peyechile',  # you got (question form)
    'ase': 'ache',
    'or': 'hocche',
    'oise': 'hoyeche',
    'bala': 'bhalo',
    'kub': 'onek',
    'tik': 'thik',
    'acha': 'thik',
    'jen': 'je',
    'ni': 'ki',
    'kobor': 'khobor',
    'korde': 'korchho',
    'gesle': 'giyechile',
    'oislo': 'hoyeche',
    'oigese': 'hoyegese',
    'jaimu': 'jabo',
    'jaibay': 'jabe',
    'jaibe': 'jabe',
    'jaiba': 'jabe',
    'des': 'dao',
    'dis': 'na dio',
    'dilaisi': 'diyechi',
    'dilaise': 'diyeche',
    'dilaisoin': 'diyechen',
    'dise': 'dey',
    'faisi': 'peyechi',
    'faislo': 'pelo',
    'faislam': 'pelam',
    'faisla': 'pela',
    'faimu': 'pabo',
    'faibay': 'pabe',
    'faibe': 'pabe',
    'faibo': 'pabe',
    'faiba': 'pabe',
    'kaimu': 'khabo',
    'kaibay': 'khabe',
    'kaibe': 'khabe',
    'kaibo': 'khabe',
    'kaiba': 'khabe',
}

# Characters ignored at the edges of a word or phrase when looking it up
# (ASCII punctuation plus the Bengali danda).
_EDGE_PUNCTUATION = string.punctuation + "।"


def _normalize_phrase(text):
    """Return *text* lower-cased, whitespace-collapsed and without edge punctuation.

    Used as the lookup key so that "Kita kobor", "kita  kobor?" and
    "Kita kobor?" all resolve to the same phrase row.
    """
    return " ".join(text.lower().split()).strip(_EDGE_PUNCTUATION + " ")


def _contains_words(haystack, needle):
    """Return True when *needle* occurs in *haystack* on whole-word boundaries.

    A plain substring test would let short entries such as "or" or "to"
    match inside unrelated words. An empty needle never matches.
    """
    if not needle:
        return False
    return re.search(rf"(?<!\w){re.escape(needle)}(?!\w)", haystack) is not None


def _rule_translate(user_input):
    """Translate *user_input* word by word through ``dialect_lexicon``.

    Returns:
        (dialect, standard, benglish, explanation, all_known) where the first
        four items are what ``translate_with_rules`` returns and ``all_known``
        is True only when at least one word was looked up and none was missing.
    """
    std_words = []
    notes = []
    unknown_count = 0

    for token in user_input.lower().split():
        # Look the word up without its surrounding punctuation ("ni?" -> "ni").
        core = token.strip(_EDGE_PUNCTUATION)
        if not core:
            # Bare punctuation: pass through, nothing to translate or explain.
            std_words.append(token)
            continue

        std_word = dialect_lexicon.get(core)
        if std_word is None:
            # Unknown word: keep it as typed (it might be a proper noun).
            std_words.append(token)
            notes.append(f"'{core}' -> ?")
            unknown_count += 1
        else:
            # Re-attach whatever punctuation surrounded the word.
            head, _, tail = token.partition(core)
            std_words.append(f"{head}{std_word}{tail}")
            notes.append(f"'{core}' -> '{std_word}'")

    standard_sentence = " ".join(std_words)
    all_known = bool(notes) and unknown_count == 0
    # The engine has no script or phonetic data of its own, so the dialect and
    # Benglish outputs simply echo what the user typed.
    return user_input, standard_sentence, user_input, " | ".join(notes), all_known


# 2. THE TRANSLATION FUNCTION
def translate_with_rules(user_input):
    """Translate a sentence word by word using the dialect lexicon.

    Args:
        user_input: Romanised dialect sentence.

    Returns:
        (dialect_bengali, actual_bengali, benglish, explanation).
        ``dialect_bengali`` and ``benglish`` are the input unchanged;
        ``actual_bengali`` is the romanised standard sentence; ``explanation``
        lists each word mapping joined by " | ", with "?" marking unknown words.
    """
    dialect_sentence, standard_sentence, benglish_sentence, explanation, _ = _rule_translate(user_input)
    return dialect_sentence, standard_sentence, benglish_sentence, explanation


# =============================================================================
# END OF NEW TRANSLATION ENGINE
# =============================================================================

# Semantic mapping of dialect patterns to meanings + types.
# Patterns are matched against LOWER-CASED input, so they must be lower-case.
semantic_patterns = {
    # question/particles
    r"\bni\b": {"meaning": "কি", "type": "question"},
    r"\bni\b$": {"meaning": "কি", "type": "question"},

    # verbs / roots
    r"\bor\b": {"meaning": "হচ্ছে", "type": "verb"},
    r"\boise\b": {"meaning": "হয়েছে", "type": "verb"},
    r"\boibo\b": {"meaning": "হবে", "type": "verb"},
    r"\bjaimu\b": {"meaning": "যাব", "type": "verb"},
    r"\bjaib[aey]\b": {"meaning": "যাবে", "type": "verb"},
    r"\bkobor\b": {"meaning": "খবর", "type": "noun"},
    r"\bkorde\b": {"meaning": "করছে", "type": "verb"},
    r"\bacha\b": {"meaning": "ঠিক", "type": "adjective"},
    r"\bbala\b": {"meaning": "ভালো", "type": "adjective"},
    r"\bkub\b": {"meaning": "অনেক", "type": "adverb"},
    r"\bgesle\b": {"meaning": "গিয়েছিলে", "type": "verb"},
    r"\boislo\b": {"meaning": "হয়েছে", "type": "verb"},
    r"\boigese\b": {"meaning": "হয়েগেছে", "type": "verb"},
    r"\bjen\b": {"meaning": "যে", "type": "conjunction"},
    r"\bje\b": {"meaning": "যে", "type": "conjunction"},
    r"\btik\b": {"meaning": "ঠিক", "type": "adjective"},
    r"\base\b": {"meaning": "আছে", "type": "verb"},
    r"\basoin\b": {"meaning": "আছে", "type": "verb"},
    r"\basoini\b": {"meaning": "আছে কি", "type": "verb+question"},
    r"\bgoto\b": {"meaning": "গত", "type": "adjective"},
    r"\bkali\b": {"meaning": "কাল", "type": "noun"},
    r"\bkita\b": {"meaning": "কি", "type": "question"},
    r"\btew\b": {"meaning": "তাহলে", "type": "conjunction"},

    # give/get polarity (important dialect contrast)
    r"\bdes\b": {"meaning": "দান/দাও (বন্ধু-মৃদু)", "type": "give_positive"},
    r"\bdes\s+na\b": {"meaning": "মৃদু অনুরোধ: দাও", "type": "give_positive"},
    r"\bdis\b": {"meaning": "না দাও / নিষেধ", "type": "give_negative"},
    r"\bdis\s+na\b": {"meaning": "না দাও (নিষেধ)", "type": "give_negative"},

    # fawa/get variants
    r"\bfaisi\b": {"meaning": "পেয়েছি", "type": "verb"},
    r"\bfaisl[ao]m\b": {"meaning": "পেয়েছিলাম/পেয়েছি(past)", "type": "verb"},
    r"\bfaimu\b": {"meaning": "পাব", "type": "verb"},
    r"\bfaib[ae]y?\b": {"meaning": "পাবে", "type": "verb"},

    # future pattern markers
    r"\bmu\b": {"meaning": "ভবিষ্যৎ: 1sg", "type": "tense_future"},
    r"\bbay\b": {"meaning": "ভবিষ্যৎ: 2sg (tumi)", "type": "tense_future"},
    r"\bbo\b": {"meaning": "ভবিষ্যৎ: 3sg", "type": "tense_future"},
    r"\bba\b": {"meaning": "ভবিষ্যৎ: plural/3pl", "type": "tense_future"},
}

# Precompute data structures for matching
dialects = [p[0] for p in phrases_data]
dialects_lower = [d.lower() for d in dialects]
actual_bengali_list = [p[2] for p in phrases_data]

# Create a mapping from dialect to full row
dialect_to_all = {p[0].lower(): p for p in phrases_data}

# Punctuation-insensitive lookup used for exact matching; the first row wins
# when two phrases normalise to the same key.
_normalized_rows = {}
for _row in phrases_data:
    _normalized_rows.setdefault(_normalize_phrase(_row[0]), _row)


def semantic_analysis(user_input):
    """Find every semantic pattern that occurs in *user_input*.

    Returns:
        (detected_patterns, meaning_components): a list of
        (pattern, meaning, type) tuples and the parallel list of meanings,
        both in ``semantic_patterns`` order.
    """
    user_lower = user_input.lower()
    detected_patterns = []
    meaning_components = []

    # Use regex-based whole-word matching for patterns
    for pattern, info in semantic_patterns.items():
        try:
            found = re.search(pattern, user_lower)
        except re.error:
            # If pattern is bad, skip it safely
            continue
        if found:
            detected_patterns.append((pattern, info["meaning"], info["type"]))
            meaning_components.append(info["meaning"])

    return detected_patterns, meaning_components


def find_semantic_matches(user_input, threshold=0.35):
    """Score phrase rows by shared meanings plus text similarity.

    Args:
        user_input: Romanised dialect text typed by the user.
        threshold: Minimum total score (exclusive) for a row to be returned.

    Returns:
        List of (row_index, score, "semantic") tuples in ``phrases_data``
        order; empty when no semantic pattern was detected in the input.
        Scores are not capped and may exceed 1.0.
    """
    user_lower = user_input.lower()
    matches = []

    # Get semantic patterns from user input
    _, meaning_components = semantic_analysis(user_input)
    if not meaning_components:
        return matches

    for i, (dialect, _dialect_bengali, actual, _benglish) in enumerate(phrases_data):
        dialect_lower = dialect.lower()
        match_score = 0.0

        # Boost for every detected meaning that appears in the row.
        for meaning in meaning_components:
            if meaning in actual:
                match_score += 0.35
            # NOTE(review): meanings are Bengali-script strings while this
            # column is romanised, so this branch looks unreachable with the
            # current data - confirm which column was intended.
            if meaning in dialect_lower:
                match_score += 0.25

        # Text similarity between user and dialect form
        text_similarity = difflib.SequenceMatcher(None, user_lower, dialect_lower).ratio()
        total_score = match_score + (text_similarity * 0.5)
        if total_score > threshold:
            matches.append((i, total_score, "semantic"))

    return matches


def format_suggestions_from_indices(indices, match_type="text", scores=None):
    """Format phrase rows as a bullet list for the suggestions box.

    Args:
        indices: Row indices into ``phrases_data``.
        match_type: Label shown next to the score ("text", "semantic", ...).
        scores: Optional scores parallel to *indices*; shown as a percentage
            capped at 100 because semantic scores can exceed 1.0.

    Returns:
        The formatted entries joined by blank lines.
    """
    lines = []
    for i, idx in enumerate(indices):
        d, dialect_bengali, actual, benglish = phrases_data[idx]
        score_str = ""
        if scores is not None and i < len(scores):
            s_pct = min(100, int(scores[i] * 100))
            score_str = f" ({match_type}-match: {s_pct}%)"
        lines.append(f"• {d}{score_str}\n Dialect Bengali: {dialect_bengali}\n Actual Bengali: {actual}\n Benglish: {benglish}")
    return "\n\n".join(lines)


def translate_text(user_text, top_k: int = 6):
    """Translate a dialect phrase, trying progressively looser strategies.

    Order: exact phrase match, multi-phrase lookup, word-by-word rule engine,
    semantic match, whole-word partial match, fuzzy text match, and finally
    the rule engine's partial result.

    Args:
        user_text: Text typed by the user (may be None or empty).
        top_k: Maximum number of suggestions to list.

    Returns:
        (dialect_out, actual_out, benglish_out, suggestions_out). Any
        unexpected exception is reported in ``suggestions_out`` rather than
        raised, so the UI always gets four strings back.
    """
    try:
        q = (user_text or "").strip()
        if not q:
            return "", "", "", "Please enter a phrase or question."

        q_lower = q.lower()
        q_norm = _normalize_phrase(q)

        # 1) Exact match (case-, whitespace- and edge-punctuation-insensitive)
        row = _normalized_rows.get(q_norm)
        if row is not None:
            _, dialect_bengali, actual, benglish = row
            return dialect_bengali, actual, benglish, "✅ EXACT MATCH (100%)"

        # 2) Input containing multiple phrases separated by punctuation.
        # This runs before the rule engine so that known phrases keep their
        # curated Bengali-script translations.
        potential_phrases = [p.strip() for p in re.split(r'[.,;!?]\s*', q) if p.strip()]
        if len(potential_phrases) > 1:
            results = []
            for phrase in potential_phrases:
                row = _normalized_rows.get(_normalize_phrase(phrase))
                if row is not None:
                    _, dialect_bengali, actual, benglish = row
                    results.append(f"{dialect_bengali} → {actual} → {benglish}")
                else:
                    results.append(f"'{phrase}' → No match found")
            return "", "", "", "Multiple phrases detected:\n\n" + "\n\n".join(results)

        # 3) Rule engine: used outright only when every word was in the lexicon
        dialect_out, actual_out, benglish_out, explanation, all_known = _rule_translate(q)
        if all_known:
            return dialect_out, actual_out, benglish_out, f"🔧 RULE-BASED TRANSLATION:\n{explanation}"

        # 4) Semantic matches
        semantic_matches = find_semantic_matches(q)
        if semantic_matches:
            # Sort and return top semantic candidates
            semantic_matches.sort(key=lambda x: x[1], reverse=True)
            top = semantic_matches[:top_k]
            indices = [idx for idx, _score, _mt in top]
            scores = [score for _idx, score, _mt in top]
            suggestions = "🔍 Semantic matches found:\n\n" + format_suggestions_from_indices(indices, "semantic", scores)
            # Return best match as primary output
            _, dialect_bengali, actual, benglish = phrases_data[indices[0]]
            return dialect_bengali, actual, benglish, suggestions

        # 5) Partial matches: query inside a phrase or a phrase inside the query
        partial_matches = []
        for i, (dialect, _db, _actual, _benglish) in enumerate(phrases_data):
            dialect_norm = _normalize_phrase(dialect)
            if _contains_words(dialect_norm, q_norm) or _contains_words(q_norm, dialect_norm):
                similarity = difflib.SequenceMatcher(None, q_lower, dialect.lower()).ratio()
                partial_matches.append((i, similarity))

        if partial_matches:
            partial_matches.sort(key=lambda x: x[1], reverse=True)
            top = partial_matches[:top_k]
            indices = [idx for idx, _score in top]
            scores = [score for _idx, score in top]
            _, dialect_bengali, actual, benglish = phrases_data[indices[0]]
            suggestions = "🔍 Partial matches in dialect:\n\n" + format_suggestions_from_indices(indices, "text", scores)
            return dialect_bengali, actual, benglish, suggestions

        # 6) Close textual matches using difflib
        close_matches = difflib.get_close_matches(q_lower, dialects_lower, n=top_k, cutoff=0.3)
        if close_matches:
            indices = [dialects_lower.index(m) for m in close_matches]
            text_sim_scores = [difflib.SequenceMatcher(None, q_lower, m).ratio() for m in close_matches]
            _, dialect_bengali, actual, benglish = phrases_data[indices[0]]
            suggestions = "🔍 Similar dialect phrases:\n\n" + format_suggestions_from_indices(indices, "text", text_sim_scores)
            return dialect_bengali, actual, benglish, suggestions

        # 7) Rule engine fallback (even with some unknown words)
        return dialect_out, actual_out, benglish_out, f"🤖 RULE ENGINE ATTEMPT (some unknown words):\n{explanation}\n\n💡 Try these sample phrases:\n" + "\n".join([f"• {p[0]}" for p in phrases_data[:5]])

    except Exception as ex:
        # Top-level UI boundary: surface the failure in the status box.
        tb = traceback.format_exc()
        return "", "", "", f"Runtime error:\n{str(ex)}\n\nTraceback:\n{tb}"


def show_semantic_analysis(user_text):
    """Return a one-line summary of the semantic patterns found in *user_text*.

    Returns an empty string for None, empty or whitespace-only input.
    """
    if not (user_text or "").strip():
        return ""

    patterns, _meanings = semantic_analysis(user_text)
    if patterns:
        return f"Detected patterns: {', '.join([f'{p} → {m}' for p, m, t in patterns])}"
    return "No specific patterns detected"


# Custom CSS for a softer, less blinding color scheme
css = """
body { font-family: Arial, sans-serif; }
.gr-box { border: 1px solid #e0e0e0; border-radius: 8px; }
.gr-button { background: #4CAF50; color: white; }
.gr-button:hover { background: #45a049; }
"""

# Build Gradio UI with a softer theme
with gr.Blocks(title="Enhanced Dialect Translator", css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌍 Dialect Bengali → Actual Bengali → Benglish")
    gr.Markdown("Type a phrase in your dialect. The app uses both text and semantic matching to find similar phrases.")

    # Define input component first
    inp = gr.Textbox(label="Type phrase in Dialect Bengali", placeholder="e.g. Kita kobor? Sob bala asoin ni")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Examples to try:")
            examples = gr.Examples(
                examples=["tew", "bala or", "Kita kobor?", "tente", "to", "se hole", "Sob bala asoin ni", "kano tara"],
                inputs=inp,
                label="Try these examples"
            )
        with gr.Column(scale=2):
            btn = gr.Button("Translate / Find", variant="primary")

    with gr.Row():
        out_dialect = gr.Textbox(label="Dialect Bengali (Bengali Script)")
        out_actual = gr.Textbox(label="Actual Bengali (Standard)")
        out_benglish = gr.Textbox(label="Benglish (Phonetic English)")

    with gr.Row():
        semantic_info = gr.Textbox(label="Semantic Analysis", lines=2)
        suggestions = gr.Textbox(label="Status / Suggestions / Top Candidates", lines=8)

    # Set up event handlers
    btn.click(
        fn=translate_text,
        inputs=[inp],
        outputs=[out_dialect, out_actual, out_benglish, suggestions]
    )
    inp.change(
        fn=show_semantic_analysis,
        inputs=[inp],
        outputs=[semantic_info]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()