Spaces:

anisgtboi
/

my-dialect-translator-app

Sleeping

App Files Files Community

anisgtboi committed 11 days ago

Commit

0afbfd4

verified ·

1 Parent(s): 5b2f305

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -10

app.py CHANGED Viewed

@@ -44,7 +44,7 @@ phrases_data = [
     # Future / Present / Past core verbs (ja / de / fawa / ka)
     ["jaimu", "জাইমু", "যাব", "jabo"],
     ["jaibay", "জাইবে", "তুমি যাবে", "tumi jabe (dialect)"],
-    ["jaibe", "জাইবে", "তুমি যাবে (friend)", "tumi jabe (friend form)"],
     ["jaibo", "জাইবো", "যাবে", "jabe"],
     ["jaiba", "জাইবা", "তারা যাবে", "tara jabe"],
     ["oibo", "ওইবো", "হবে", "hobe"],
@@ -77,7 +77,7 @@ phrases_data = [
     ["faisly", "ফাইসলাই", "তুমি পেয়েছ", "tumi pele (you got)"],
     ["faimu", "ফাইমু", "পাব", "pabo (I will get)"],
     ["faibay", "ফাইবে", "তুমি পাবে (dialect)", "tumi pabe"],
-    ["faibe", "ফাইবে", "তুমি পাবে (friend)", "tumi pabe (friend)"],
     ["faibo", "ফাইবো", "সে পাবে", "se pabe"],
     ["faiba", "ফাইবা", "তারা পাবে", "tara pabe"],
@@ -100,6 +100,109 @@ phrases_data = [
     ["Kotobil bade fawa gese", "কোটবিল বাদে ফাওয়া গেসে", "অনেকদিন পরে পেয়েছি", "got after long time got"]
 ]
 # Semantic mapping of dialect patterns to meanings + types
 semantic_patterns = {
     # question/particles
@@ -230,7 +333,15 @@ def translate_text(user_text, top_k: int = 6):
             if q_lower == dialect.lower():
                 return dialect_bengali, actual, benglish, "✅ EXACT MATCH (100%)"
-        # 2) If input contains multiple phrases separated by punctuation
         potential_phrases = [p.strip() for p in re.split(r'[.,;!?]\s*', q) if p.strip()]
         if len(potential_phrases) > 1:
             results = []
@@ -245,7 +356,7 @@ def translate_text(user_text, top_k: int = 6):
                     results.append(f"'{phrase}' → No match found")
             return "", "", "", "Multiple phrases detected:\n\n" + "\n\n".join(results)
-        # 3) Semantic matches
         semantic_matches = find_semantic_matches(q)
         if semantic_matches:
             # sort and return top semantic candidates
@@ -258,7 +369,7 @@ def translate_text(user_text, top_k: int = 6):
             d, dialect_bengali, actual, benglish = phrases_data[best_idx]
             return dialect_bengali, actual, benglish, suggestions
-        # 4) Partial matches in dialect strings
         partial_matches = []
         for i, (dialect, dialect_bengali, actual, benglish) in enumerate(phrases_data):
             if q_lower in dialect.lower() or dialect.lower() in q_lower:
@@ -274,7 +385,7 @@ def translate_text(user_text, top_k: int = 6):
             suggestions = "🔍 Partial matches in dialect:\n\n" + format_suggestions_from_indices(indices, "text", scores)
             return dialect_bengali, actual, benglish, suggestions
-        # 5) Close textual matches using difflib
         close_matches = difflib.get_close_matches(q_lower, dialects_lower, n=top_k, cutoff=0.3)
         if close_matches:
             indices = [dialects_lower.index(m) for m in close_matches]
@@ -284,9 +395,8 @@ def translate_text(user_text, top_k: int = 6):
             suggestions = "🔍 Similar dialect phrases:\n\n" + format_suggestions_from_indices(indices, "text", text_sim_scores)
             return dialect_bengali, actual, benglish, suggestions
-        # 6) Nothing found — give sample suggestions
-        sample_phrases = [p[0] for p in phrases_data[:10]]
-        return "", "", "", "❓ NO MATCH FOUND\n\nTry these sample phrases:\n" + "\n".join([f"• {ph}" for ph in sample_phrases])
     except Exception as ex:
         tb = traceback.format_exc()
@@ -331,7 +441,7 @@ with gr.Blocks(title="Enhanced Dialect Translator", css=css, theme=gr.themes.Sof
         with gr.Column(scale=1):
             gr.Markdown("### Examples to try:")
             examples = gr.Examples(
-                examples=["tew", "bala or", "Kita kobor?", "tente", "to", "se hole", "Sob bala asoin ni"],
                 inputs=inp,
                 label="Try these examples"
             )

     # Future / Present / Past core verbs (ja / de / fawa / ka)
     ["jaimu", "জাইমু", "যাব", "jabo"],
     ["jaibay", "জাইবে", "তুমি যাবে", "tumi jabe (dialect)"],
+    ["jaibe", "জাইবে", "তুমি যাবে (friend)", "tumi jabe (friend form)"],
     ["jaibo", "জাইবো", "যাবে", "jabe"],
     ["jaiba", "জাইবা", "তারা যাবে", "tara jabe"],
     ["oibo", "ওইবো", "হবে", "hobe"],
     ["faisly", "ফাইসলাই", "তুমি পেয়েছ", "tumi pele (you got)"],
     ["faimu", "ফাইমু", "পাব", "pabo (I will get)"],
     ["faibay", "ফাইবে", "তুমি পাবে (dialect)", "tumi pabe"],
+    ["faibe", "ফাইবে", "তুমি পাবে (friend)", "tumi pabe (friend)"],
     ["faibo", "ফাইবো", "সে পাবে", "se pabe"],
     ["faiba", "ফাইবা", "তারা পাবে", "tara pabe"],
     ["Kotobil bade fawa gese", "কোটবিল বাদে ফাওয়া গেসে", "অনেকদিন পরে পেয়েছি", "got after long time got"]
 ]
+# =============================================================================
+# NEW RULE-BASED TRANSLATION ENGINE
+# =============================================================================
+# 1. DIALECT LEXICON (word-level dictionary used by translate_with_rules)
+# Maps a lowercase, Latin-script dialect word to its Latin-script (romanized) standard form; no Bengali script is stored here.
+dialect_lexicon = {
+    'fua': 'chele',
+    'furi': 'meye',
+    'tara': 'tara', # identity mapping: same word in both forms
+    'ora': 'tara',
+    'beta': 'purush',
+    'sogra': 'chele',
+    'bakka': 'pagol',
+    'kun': 'ke', # who
+    'ki': 'ki', # what (identity mapping)
+    'kano': 'kothay', # where - needed for the "kano tara" example phrase
+    'kuno': 'kothay', # where (alternative spelling)
+    'jaibo': 'jabe',
+    'oibo': 'hobe',
+    'dibo': 'debe',
+    'lowna': 'nen', # you take (respectful)
+    'koin': 'bolun', # you say (respectful)
+    'disot': 'diyechile', # you gave (question form)
+    'faisot': 'peyechile', # you got (question form)
+    'ase': 'ache',
+    'or': 'hocche',
+    'oise': 'hoyeche',
+    'bala': 'bhalo',
+    'kub': 'onek',
+    'tik': 'thik',
+    'acha': 'thik',
+    'jen': 'je',
+    'ni': 'ki',
+    'kobor': 'khobor',
+    'korde': 'korchho',
+    'gesle': 'giyechile',
+    'oislo': 'hoyeche',
+    'oigese': 'hoyegese',
+    'jaimu': 'jabo',
+    'jaibay': 'jabe',
+    'jaibe': 'jabe',
+    'jaiba': 'jabe',
+    'des': 'dao',
+    'dis': 'na dio', # NOTE(review): the only two-word value in this table - confirm intended
+    'dilaisi': 'diyechi',
+    'dilaise': 'diyeche',
+    'dilaisoin': 'diyechen',
+    'dise': 'dey',
+    'faisi': 'peyechi',
+    'faislo': 'pelo',
+    'faislam': 'pelam',
+    'faisla': 'pela',
+    'faimu': 'pabo',
+    'faibay': 'pabe',
+    'faibe': 'pabe',
+    'faibo': 'pabe',
+    'faiba': 'pabe',
+    'kaimu': 'khabo',
+    'kaibay': 'khabe',
+    'kaibe': 'khabe',
+    'kaibo': 'khabe',
+    'kaiba': 'khabe',
+}
+# 2. THE TRANSLATION FUNCTION
def translate_with_rules(user_input, lexicon=None):
    """Translate a dialect sentence word by word using a lexicon.

    The input is lowercased and split on whitespace. Each token has its
    leading and trailing punctuation stripped before the lexicon lookup, so
    "tara?" matches the entry for "tara". The stripped punctuation is put
    back around the translated word in the output sentence.

    Args:
        user_input: The sentence typed by the user (romanized dialect).
        lexicon: Optional mapping of dialect word -> standard word. Defaults
            to the module-level ``dialect_lexicon``.

    Returns:
        A 4-tuple ``(dialect_sentence, standard_sentence, benglish_sentence,
        explanation)``. ``dialect_sentence`` and ``benglish_sentence`` are
        the unmodified ``user_input``. ``explanation`` joins one entry per
        token with " | "; unknown tokens are reported as ``'<token>' -> ?``.
        Known words are reported without their punctuation, so the
        explanation contains "?" only when some token was not translated
        (callers rely on that marker).
    """
    import string  # local import keeps this block self-contained

    if lexicon is None:
        lexicon = dialect_lexicon

    # ASCII punctuation plus the Bengali danda sentence terminator.
    edge_chars = string.punctuation + "।"

    translated_std_words = []
    explanation = []

    for token in user_input.lower().split():
        core = token.strip(edge_chars)
        std_word = lexicon.get(core) if core else None

        if std_word is None:
            # Unknown (or punctuation-only) token: pass it through unchanged
            # and flag it with the "?" marker.
            translated_std_words.append(token)
            explanation.append(f"'{token}' -> ?")
            continue

        # Re-attach whatever punctuation surrounded the word.
        prefix_len = len(token) - len(token.lstrip(edge_chars))
        prefix = token[:prefix_len]
        suffix = token[prefix_len + len(core):]
        translated_std_words.append(f"{prefix}{std_word}{suffix}")
        explanation.append(f"'{core}' -> '{std_word}'")

    standard_sentence = " ".join(translated_std_words)
    # The user's own text doubles as the dialect and Benglish forms for now.
    dialect_sentence = user_input
    benglish_sentence = user_input
    return dialect_sentence, standard_sentence, benglish_sentence, " | ".join(explanation)
+# =============================================================================
+# END OF NEW TRANSLATION ENGINE
+# =============================================================================
 # Semantic mapping of dialect patterns to meanings + types
 semantic_patterns = {
     # question/particles
             if q_lower == dialect.lower():
                 return dialect_bengali, actual, benglish, "✅ EXACT MATCH (100%)"
+        # 2) NEW: Try to translate it using the RULE ENGINE
+        # Check if this is a simple phrase that can be broken down
+        dialect_out, actual_out, benglish_out, explanation = translate_with_rules(q)
+        # If the rule engine found translations for all words, use it!
+        if "?" not in explanation:  # Basic check - if no unknown words
+            return dialect_out, actual_out, benglish_out, f"🔧 RULE-BASED TRANSLATION:\n{explanation}"
+        # 3) If input contains multiple phrases separated by punctuation
         potential_phrases = [p.strip() for p in re.split(r'[.,;!?]\s*', q) if p.strip()]
         if len(potential_phrases) > 1:
             results = []
                     results.append(f"'{phrase}' → No match found")
             return "", "", "", "Multiple phrases detected:\n\n" + "\n\n".join(results)
+        # 4) Semantic matches
         semantic_matches = find_semantic_matches(q)
         if semantic_matches:
             # sort and return top semantic candidates
             d, dialect_bengali, actual, benglish = phrases_data[best_idx]
             return dialect_bengali, actual, benglish, suggestions
+        # 5) Partial matches in dialect strings
         partial_matches = []
         for i, (dialect, dialect_bengali, actual, benglish) in enumerate(phrases_data):
             if q_lower in dialect.lower() or dialect.lower() in q_lower:
             suggestions = "🔍 Partial matches in dialect:\n\n" + format_suggestions_from_indices(indices, "text", scores)
             return dialect_bengali, actual, benglish, suggestions
+        # 6) Close textual matches using difflib
         close_matches = difflib.get_close_matches(q_lower, dialects_lower, n=top_k, cutoff=0.3)
         if close_matches:
             indices = [dialects_lower.index(m) for m in close_matches]
             suggestions = "🔍 Similar dialect phrases:\n\n" + format_suggestions_from_indices(indices, "text", text_sim_scores)
             return dialect_bengali, actual, benglish, suggestions
+        # 7) Rule engine fallback (even with some unknown words)
+        return dialect_out, actual_out, benglish_out, f"🤖 RULE ENGINE ATTEMPT (some unknown words):\n{explanation}\n\n💡 Try these sample phrases:\n" + "\n".join([f"• {p[0]}" for p in phrases_data[:5]])
     except Exception as ex:
         tb = traceback.format_exc()
         with gr.Column(scale=1):
             gr.Markdown("### Examples to try:")
             examples = gr.Examples(
+                examples=["tew", "bala or", "Kita kobor?", "tente", "to", "se hole", "Sob bala asoin ni", "kano tara"],
                 inputs=inp,
                 label="Try these examples"
             )