# app.py
"""
Enhanced Dialect Bengali Translator with Semantic Search
Uses both text similarity and semantic pattern matching
Updated to include new dialect patterns and polite/negative 'des/dis' behavior
"""

import difflib
import traceback
import gradio as gr
from collections import defaultdict
import re

# === Phrase data ===
# Each row is [Dialect Latin, Dialect Bengali Script, Actual Bengali (Std), Benglish].
# Rows are matched on the first column; the other three columns are what the
# UI displays. Every row is unique so a phrase is never suggested twice.
phrases_data = [
    # Questions / common
    ["gesle ni", "গেসলে নি", "গিয়েছিলে কি?", "giese chile ki?"],
    ["oislo ni", "ওইস্লো নি", "হয়েছে কি?", "hoyeche ki?"],
    ["oigese ni", "ওইগেসে নি", "হয়ে গেছে কি?", "hoyegese ki?"],
    ["oise", "ওইসে", "হয়েছে", "hoyeche"],
    ["bala oise", "বালা ওইসে", "ভালো হয়েছে", "bhalo hoyeche"],
    ["kub bala oise", "কুব বালা ওইসে", "অনেক ভালো হয়েছে", "onek bhalo hoyeche"],
    ["oise jen", "ওইসে জেন", "হয়েছিল যে", "hoyechilo je"],
    ["jaite ni", "জাইতে নি", "যাবে কি?", "jabe ki?"],
    ["or ni", "ওর নি", "হচ্ছে কি?", "hocche ki?"],
    ["or", "ওর", "হচ্ছে", "hocche"],
    ["bala or", "বালা ওর", "ভালো হচ্ছে", "bhalo hocche"],
    ["bala ni", "বালা নি", "ভালো কি?", "bhalo ki?"],
    ["or je", "ওর যে", "হচ্ছে যে", "hocche je"],
    ["jaibe ni", "জাইবে নি", "যাবে কি?", "jabe ki?"],
    ["jare ni", "জারে নি", "যাচ্ছো কি?", "jaccho ki?"],
    ["Kita kobor?", "কিতা খবর?", "কি খবর?", "ki khobor?"],
    ["Kita korde?", "কিতা কোর্দে?", "কি করছে?", "ki korchho?"],
    ["acha oibo-tik ase", "আচা ওইবো-তিক আসে", "ঠিক আছে", "thik ache"],
    ["tew", "তেও", "তাহলে", "tahole"],
    ["tente", "তেনতে", "তাহলে", "tahole"],
    ["to", "তো", "তাহলে", "tahole"],
    ["se hole", "সে হলে", "তাহলে", "tahole"],
    ["Sob bala asoin ni", "সব বালা আসইন নি", "সব ভালো আছে কি?", "sob bhalo ache ki?"],
    ["Sob bala ase", "সব বালা আসে", "সব ভালো আছে", "sob bhalo ache"],
    ["asoini", "আসইনি", "আছে কি?", "ache ki?"],
    ["ase", "আসে", "আছে", "ache"],

    # Future / Present / Past core verbs (ja / de / fawa / ka)
    ["jaimu", "জাইমু", "যাব", "jabo"],
    ["jaibay", "জাইবে", "তুমি যাবে", "tumi jabe (dialect)"],
    ["jaibe", "জাইবে", "তুমি যাবে (friend)", "tumi jabe (friend form)"],
    ["jaibo", "জাইবো", "যাবে", "jabe"],
    ["jaiba", "জাইবা", "তারা যাবে", "tara jabe"],
    ["oibo", "ওইবো", "হবে", "hobe"],
    ["oibo jen", "ওইবো জেন", "হবে যে", "hobe je"],

    ["ami jaimu", "আমি জাইমু", "আমি যাব", "ami jabo"],
    ["Ami bazaro jaimu", "আমি বাজারো জাইমু", "আমি বাজারে যাব", "ami bazar e jabo"],
    ["He rit aise", "হে রিত আসে", "সে রাতে এসেছে", "se rate esheche"],

    # Give (de) family
    ["des", "দেস", "দাও (মৃদু)", "des (give, friendly)"],
    ["des na", "দেস না", "দাও (দয়া করে, মৃদু অনুরোধ)", "des na (please give)"],
    ["dis", "দিস", "না দাও / নিষেধ", "dis (don't give)"],
    ["dis na", "দিস না", "দেও না", "dis na (don't give)"],
    ["dilaisi", "দিলাইসি", "দিয়েছি", "diyechi (I gave)"],
    ["dilaise", "দিলাইসে", "দিয়েছে", "diyeche (he gave)"],
    ["dilaisoin", "দিলাইসইন", "দিয়েছেন (সম্মানভাষা)", "diyechen (honorific)"],
    ["dise na", "দিসে না", "দেয়নি", "deni (didn't give)"],
    ["dibo", "দিবো", "দেব", "debo (will give)"],
    ["der amare", "দের আমিরে", "সে আমাকে দেয়", "se amake dey"],
    ["dibo amare", "দিবো আমিরে", "সে আমাকে দেবে", "se amake debe"],

    # Get / receive (fawa) family
    ["faisi", "ফাইসি", "পেয়েছি", "peyechi (I got)"],
    ["faisi na", "ফাইসি না", "পাইনি", "pelam na (didn't get)"],
    ["faisot ni", "ফাইসোট নি", "পেলে কি?", "pele ki?"],
    ["faislo", "ফাইসলো", "পেয়ে গেল/লাভ করল (3sg past)", "pelo (he got)"],
    ["faislam", "ফাইসলাম", "পেয়েছিলাম", "pelam (I got past)"],
    ["faisla", "ফাইসলা", "পেয়েছিল (they)", "pela (they got)"],
    ["faisly", "ফাইসলাই", "তুমি পেয়েছ", "tumi pele (you got)"],
    ["faimu", "ফাইমু", "পাব", "pabo (I will get)"],
    ["faibay", "ফাইবে", "তুমি পাবে (dialect)", "tumi pabe"],
    ["faibe", "ফাইবে", "তুমি পাবে (friend)", "tumi pabe (friend)"],
    ["faibo", "ফাইবো", "সে পাবে", "se pabe"],
    ["faiba", "ফাইবা", "তারা পাবে", "tara pabe"],

    # Eat (ka) family
    ["kaimu", "কাইমু", "খাব", "khaimu (I will eat)"],
    ["kaibay", "কাইবে", "তুমি খাব (dialect)", "tumi khabe"],
    ["kaibe", "কাইবে", "তুমি খাব (friend)", "tumi khabe (friend)"],
    ["kaibo", "কাইবো", "সে খাবে", "se khabe"],
    ["kaiba", "কাইবা", "তারা খাবে", "tara khabe"],

    # Other sample sentences from user's corpus
    ["Ami faisi ekta notun jinish", "আমি ফাইসি একটা নতুন জিনিস", "আমি একটা নতুন জিনিস পেয়েছি", "ami ekta notun jinish peyechi"],
    ["Tumi taka faiso ni", "তুমি টাকা ফাইসো নি", "তুমি টাকা পেয়েছ কি?", "tumi taka peyecho ki?"],
    ["He sobsomoy amare teka dey", "হে সবসময় আমিারে তেকা দেয়", "সে সবসময় আমাকে টাকা দেয়", "se shobshomoy amake taka dey"],
    ["Tara bazaro bohut jinish faisoin", "তারা বাজারো বহুত জিনিস ফাইসইন", "তারা বাজারে অনেক জিনিস পেয়েছে", "tara bazar e onek jinish peyechhe"],
    ["Tumi boi diso ni", "তুমি বই দিসো নি", "আপনি কি বই দিয়েছেন?", "apni boi diyechen?"],
    ["Tuin boi disot ni", "তুইন বই দিসট নি", "তুই বই দিয়েছ কি?", "tui boi diyechish?"],
    ["Bifodo asi", "বিফোডো আছি", "বিপদে আছি", "bipode achi"],
    ["Kotobil bade fawa gese", "কোটবিল বাদে ফাওয়া গেসে", "অনেকদিন পরে পেয়েছি", "got after long time got"],
]

# =============================================================================
# NEW RULE-BASED TRANSLATION ENGINE
# =============================================================================

# 1. DIALECT LEXICON
# Word-level lookup table used by the rule engine: each key is a lowercase,
# romanized dialect word and each value is the romanized standard form that
# replaces it. A value may hold more than one word (see "dis").
dialect_lexicon = {
    # --- nouns / pronouns / descriptors ---
    "fua": "chele",
    "furi": "meye",
    "tara": "tara",  # identical in both forms
    "ora": "tara",
    "beta": "purush",
    "sogra": "chele",
    "bakka": "pagol",

    # --- question words ---
    "kun": "ke",  # who
    "ki": "ki",  # what
    "kano": "kothay",  # where
    "kuno": "kothay",  # where (alternative spelling)

    # --- assorted verb forms ---
    "jaibo": "jabe",
    "oibo": "hobe",
    "dibo": "debe",
    "lowna": "nen",  # you take (respectful)
    "koin": "bolun",  # you say (respectful)
    "disot": "diyechile",  # you gave (question form)
    "faisot": "peyechile",  # you got (question form)
    "ase": "ache",
    "or": "hocche",
    "oise": "hoyeche",

    # --- modifiers and particles ---
    "bala": "bhalo",
    "kub": "onek",
    "tik": "thik",
    "acha": "thik",
    "jen": "je",
    "ni": "ki",

    # --- words taken from the sample phrases ---
    "kobor": "khobor",
    "korde": "korchho",
    "gesle": "giyechile",
    "oislo": "hoyeche",
    "oigese": "hoyegese",

    # --- "ja" forms ---
    "jaimu": "jabo",
    "jaibay": "jabe",
    "jaibe": "jabe",
    "jaiba": "jabe",

    # --- "de" forms ---
    "des": "dao",
    "dis": "na dio",
    "dilaisi": "diyechi",
    "dilaise": "diyeche",
    "dilaisoin": "diyechen",
    "dise": "dey",

    # --- "fawa" forms ---
    "faisi": "peyechi",
    "faislo": "pelo",
    "faislam": "pelam",
    "faisla": "pela",
    "faimu": "pabo",
    "faibay": "pabe",
    "faibe": "pabe",
    "faibo": "pabe",
    "faiba": "pabe",

    # --- "ka" forms ---
    "kaimu": "khabo",
    "kaibay": "khabe",
    "kaibe": "khabe",
    "kaibo": "khabe",
    "kaiba": "khabe",
}

# 2. THE TRANSLATION FUNCTION
def translate_with_rules(user_input, lexicon=None):
    """Translate a romanized dialect sentence word by word using a lexicon.

    The sentence is lower-cased and split on whitespace. Punctuation attached
    to the edges of a word (``.,;:!?``) is set aside for the lookup and put
    back around the translated word, so ``"ni?"`` is found under ``"ni"``.

    Args:
        user_input: The sentence typed by the user. ``None`` is treated as
            an empty string.
        lexicon: Optional mapping of dialect word -> standard word. Defaults
            to the module-level ``dialect_lexicon``.

    Returns:
        A 4-tuple ``(dialect_sentence, standard_sentence, benglish_sentence,
        explanation)``. The first and third items are the input text
        unchanged (no script conversion is attempted here). ``explanation``
        joins one entry per word with ``" | "``: ``'word' -> 'std'`` for a
        known word and ``'word' -> ?`` for an unknown one. Callers detect
        unknown words by the ``?`` marker, so known-word entries never
        contain a question mark.
    """
    if lexicon is None:
        lexicon = dialect_lexicon

    text = user_input or ""
    edge_punctuation = ".,;:!?"
    translated_std_words = []
    explanation = []

    for token in text.lower().split():
        # Separate leading/trailing punctuation from the word itself.
        without_lead = token.lstrip(edge_punctuation)
        lead = token[:len(token) - len(without_lead)]
        core = without_lead.rstrip(edge_punctuation)
        trail = without_lead[len(core):]

        if not core:
            # Punctuation-only token: nothing to translate, keep it as typed.
            translated_std_words.append(token)
            continue

        std_word = lexicon.get(core)
        if std_word is not None:
            translated_std_words.append(f"{lead}{std_word}{trail}")
            explanation.append(f"'{core}' -> '{std_word}'")
        else:
            # Unknown word (possibly a proper noun): keep it untouched.
            translated_std_words.append(token)
            explanation.append(f"'{core}' -> ?")

    standard_sentence = " ".join(translated_std_words)

    # The user already typed the dialect form; it doubles as the phonetic
    # (Benglish) output until a real transliteration step exists.
    dialect_sentence = text
    benglish_sentence = text

    return dialect_sentence, standard_sentence, benglish_sentence, " | ".join(explanation)

# =============================================================================
# END OF NEW TRANSLATION ENGINE
# =============================================================================

# Semantic mapping of dialect patterns to meanings + types.
# Keys are regular expressions that are searched in the LOWER-CASED user
# input, so every literal in a key must itself be lower-case. Values carry the
# standard-Bengali meaning and a coarse grammatical/type tag.
semantic_patterns = {
    # question/particles
    # NOTE(review): the anchored variant below also matches whenever the first
    # one does at end of input, so a trailing "ni" is reported twice — confirm
    # whether that extra weight is intended.
    r"\bni\b": {"meaning": "কি", "type": "question"},
    r"\bni\b$": {"meaning": "কি", "type": "question"},
    # verbs / roots
    r"\bor\b": {"meaning": "হচ্ছে", "type": "verb"},
    r"\boise\b": {"meaning": "হয়েছে", "type": "verb"},
    r"\boibo\b": {"meaning": "হবে", "type": "verb"},
    r"\bjaimu\b": {"meaning": "যাব", "type": "verb"},
    # Accepts jaiba / jaibe / jaiby and also the two-letter ending "jaibay".
    r"\bjaib(?:[aey]|ay)\b": {"meaning": "যাবে", "type": "verb"},
    r"\bkobor\b": {"meaning": "খবর", "type": "noun"},
    r"\bkorde\b": {"meaning": "করছে", "type": "verb"},
    r"\bacha\b": {"meaning": "ঠিক", "type": "adjective"},
    r"\bbala\b": {"meaning": "ভালো", "type": "adjective"},
    r"\bkub\b": {"meaning": "অনেক", "type": "adverb"},
    r"\bgesle\b": {"meaning": "গিয়েছিলে", "type": "verb"},
    r"\boislo\b": {"meaning": "হয়েছে", "type": "verb"},
    r"\boigese\b": {"meaning": "হয়েগেছে", "type": "verb"},
    r"\bjen\b": {"meaning": "যে", "type": "conjunction"},
    r"\bje\b": {"meaning": "যে", "type": "conjunction"},
    r"\btik\b": {"meaning": "ঠিক", "type": "adjective"},
    r"\base\b": {"meaning": "আছে", "type": "verb"},
    r"\basoin\b": {"meaning": "আছে", "type": "verb"},
    r"\basoini\b": {"meaning": "আছে কি", "type": "verb+question"},
    # Lower-case on purpose: the input is lower-cased before matching.
    r"\bgoto\b": {"meaning": "গত", "type": "adjective"},
    r"\bkali\b": {"meaning": "কাল", "type": "noun"},
    r"\bkita\b": {"meaning": "কি", "type": "question"},
    r"\btew\b": {"meaning": "তাহলে", "type": "conjunction"},
    # give/get polarity (important dialect contrast)
    r"\bdes\b": {"meaning": "দান/দাও (বন্ধু-মৃদু)", "type": "give_positive"},
    r"\bdes\s+na\b": {"meaning": "মৃদু অনুরোধ: দাও", "type": "give_positive"},
    r"\bdis\b": {"meaning": "না দাও / নিষেধ", "type": "give_negative"},
    r"\bdis\s+na\b": {"meaning": "না দাও (নিষেধ)", "type": "give_negative"},
    # fawa/get variants
    r"\bfaisi\b": {"meaning": "পেয়েছি", "type": "verb"},
    r"\bfaisl[ao]m\b": {"meaning": "পেয়েছিলাম/পেয়েছি(past)", "type": "verb"},
    r"\bfaimu\b": {"meaning": "পাব", "type": "verb"},
    r"\bfaib[ae]y?\b": {"meaning": "পাবে", "type": "verb"},
    # future pattern markers
    # NOTE(review): with \b on both sides these only match when the marker is
    # typed as a separate word, not as a suffix (e.g. inside "jaimu") —
    # confirm that is the intent.
    r"\bmu\b": {"meaning": "ভবিষ্যৎ: 1sg", "type": "tense_future"},
    r"\bbay\b": {"meaning": "ভবিষ্যৎ: 2sg (tumi)", "type": "tense_future"},
    r"\bbo\b": {"meaning": "ভবিষ্যৎ: 3sg", "type": "tense_future"},
    r"\bba\b": {"meaning": "ভবিষ্যৎ: plural/3pl", "type": "tense_future"},
}

# Lookup tables derived once, at import time, from phrases_data.
# dialects            -> column 0 (romanized dialect) in table order
# dialects_lower      -> the same column, lower-cased, index-aligned with phrases_data
# actual_bengali_list -> column 2 (standard Bengali) in table order
dialects = [dialect for dialect, *_ in phrases_data]
dialects_lower = list(map(str.lower, dialects))
actual_bengali_list = [entry[2] for entry in phrases_data]

# Lower-cased dialect phrase -> its full row; when two rows share a phrase the
# later row replaces the earlier one.
dialect_to_all = dict(zip(dialects_lower, phrases_data))

def semantic_analysis(user_input, patterns=None):
    """Find which semantic patterns occur in the user's text.

    Matching is case-insensitive: the text is lower-cased and each pattern is
    searched with ``re.IGNORECASE``, so a pattern written with capital letters
    still matches.

    Args:
        user_input: Text typed by the user. ``None`` is treated as empty.
        patterns: Optional mapping of regex string -> ``{"meaning": ...,
            "type": ...}``. Defaults to the module-level ``semantic_patterns``.

    Returns:
        ``(detected_patterns, meaning_components)`` where ``detected_patterns``
        is a list of ``(pattern, meaning, type)`` tuples in mapping order and
        ``meaning_components`` is the list of the corresponding meanings.
    """
    if patterns is None:
        patterns = semantic_patterns

    user_lower = (user_input or "").lower()
    detected_patterns = []
    meaning_components = []

    for pattern, info in patterns.items():
        try:
            found = re.search(pattern, user_lower, re.IGNORECASE)
        except re.error:
            # A malformed pattern must not break analysis of the others.
            continue
        if found:
            detected_patterns.append((pattern, info["meaning"], info["type"]))
            meaning_components.append(info["meaning"])

    return detected_patterns, meaning_components

def find_semantic_matches(user_input, threshold=0.35):
    """Score every known phrase against the input and keep the strong ones.

    A phrase's score is a meaning boost plus half of the difflib similarity
    between the lower-cased input and the phrase's lower-cased dialect form.
    For each meaning detected in the input the boost grows by 0.35 when the
    meaning occurs in the phrase's standard-Bengali text and by 0.25 when it
    occurs in the lower-cased dialect text.

    Returns a list of ``(row_index, score, "semantic")`` tuples, in table
    order, for rows whose score is strictly above ``threshold``. The list is
    empty when the input triggers no semantic pattern.
    """
    query = user_input.lower()
    _, meanings = semantic_analysis(user_input)

    # Without any detected meaning there is nothing to score against.
    if not meanings:
        return []

    scored = []
    for row_index, (dialect, _script, actual, _benglish) in enumerate(phrases_data):
        dialect_key = dialect.lower()

        boost = 0.0
        for meaning in meanings:
            if meaning in actual:
                boost += 0.35
            # NOTE(review): the meanings in semantic_patterns appear to be
            # Bengali-script strings while this column is romanized, so this
            # check looks like it cannot succeed — confirm the intended column.
            if meaning in dialect_key:
                boost += 0.25

        similarity = difflib.SequenceMatcher(None, query, dialect_key).ratio()
        combined = boost + (similarity * 0.5)

        if combined > threshold:
            scored.append((row_index, combined, "semantic"))

    return scored

def format_suggestions_from_indices(indices, match_type="text", scores=None):
    """Render the phrase rows at ``indices`` as a bulleted suggestion list.

    Each entry shows the dialect phrase followed by its Bengali-script,
    standard-Bengali and Benglish columns. When ``scores`` supplies a value at
    the same position as an index, the bullet line also shows it as a whole
    percentage labelled with ``match_type``. Entries are separated by a blank
    line; an empty ``indices`` yields an empty string.
    """
    entries = []
    for position, row_index in enumerate(indices):
        latin, script, standard, roman = phrases_data[row_index]

        if scores is not None and position < len(scores):
            percent = int(scores[position] * 100)
            suffix = f" ({match_type}-match: {percent}%)"
        else:
            suffix = ""

        entries.append(
            f"• {latin}{suffix}\n"
            f"    Dialect Bengali: {script}\n"
            f"    Actual Bengali: {standard}\n"
            f"    Benglish: {roman}"
        )
    return "\n\n".join(entries)

def _normalize_phrase(text):
    """Lower-case ``text`` and drop surrounding whitespace and sentence punctuation."""
    return text.lower().strip(" \t\r\n.,;!?")


def _lookup_phrase(text):
    """Return the first phrases_data row matching ``text`` (ignoring case and edge punctuation), or None."""
    key = _normalize_phrase(text)
    if not key:
        return None
    for row in phrases_data:
        if _normalize_phrase(row[0]) == key:
            return row
    return None


def translate_text(user_text, top_k: int = 6):
    """Translate a dialect phrase, falling back through several strategies.

    Strategies are tried in order and the first that produces a result wins:

    1. exact phrase match (case-insensitive; punctuation at the edges of the
       phrase is ignored, so ``kita kobor`` finds ``Kita kobor?``);
    2. word-by-word rule engine, when every word is known;
    3. several punctuation-separated phrases, each looked up on its own;
    4. semantic pattern matches;
    5. substring matches against the known dialect phrases;
    6. close textual matches via difflib;
    7. the rule-engine attempt even though some words are unknown.

    Args:
        user_text: Text from the input box; ``None`` or blank input is
            answered with a prompt instead of a translation.
        top_k: Maximum number of candidates listed in the suggestions.

    Returns:
        ``(dialect_out, actual_out, benglish_out, suggestions_out)``. Any
        unexpected exception is caught and reported in ``suggestions_out``
        together with its traceback.
    """
    try:
        q = (user_text or "").strip()
        if not q:
            return "", "", "", "Please enter a phrase or question."

        q_lower = q.lower()

        # 1) Exact match (case- and edge-punctuation-insensitive)
        exact_row = _lookup_phrase(q)
        if exact_row is not None:
            _, dialect_bengali, actual, benglish = exact_row
            return dialect_bengali, actual, benglish, "✅ EXACT MATCH (100%)"

        # 2) Rule engine: accept it only when no word was left unknown.
        # Unknown words are marked with "?" in the explanation.
        dialect_out, actual_out, benglish_out, explanation = translate_with_rules(q)
        if "?" not in explanation:
            return dialect_out, actual_out, benglish_out, f"🔧 RULE-BASED TRANSLATION:\n{explanation}"

        # 3) Several phrases separated by punctuation: look each one up.
        # The split removes the punctuation, so the lookup must ignore it on
        # the stored phrases too (e.g. "Kita kobor?").
        potential_phrases = [p.strip() for p in re.split(r'[.,;!?]\s*', q) if p.strip()]
        if len(potential_phrases) > 1:
            results = []
            for phrase in potential_phrases:
                row = _lookup_phrase(phrase)
                if row is not None:
                    _, dialect_bengali, actual, benglish = row
                    results.append(f"{dialect_bengali} → {actual} → {benglish}")
                else:
                    results.append(f"'{phrase}' → No match found")
            return "", "", "", "Multiple phrases detected:\n\n" + "\n\n".join(results)

        # 4) Semantic matches
        semantic_matches = find_semantic_matches(q)
        if semantic_matches:
            # Highest score first; the best candidate becomes the main output.
            semantic_matches.sort(key=lambda x: x[1], reverse=True)
            indices = [idx for idx, score, mt in semantic_matches[:top_k]]
            scores = [score for idx, score, mt in semantic_matches[:top_k]]
            suggestions = "🔍 Semantic matches found:\n\n" + format_suggestions_from_indices(indices, "semantic", scores)
            best_idx = indices[0]
            d, dialect_bengali, actual, benglish = phrases_data[best_idx]
            return dialect_bengali, actual, benglish, suggestions

        # 5) Partial matches: the input contains a known phrase or vice versa.
        partial_matches = []
        for i, (dialect, dialect_bengali, actual, benglish) in enumerate(phrases_data):
            if q_lower in dialect.lower() or dialect.lower() in q_lower:
                similarity = difflib.SequenceMatcher(None, q_lower, dialect.lower()).ratio()
                partial_matches.append((i, similarity))

        if partial_matches:
            partial_matches.sort(key=lambda x: x[1], reverse=True)
            indices = [idx for idx, score in partial_matches[:top_k]]
            scores = [score for idx, score in partial_matches[:top_k]]
            best_idx = indices[0]
            d, dialect_bengali, actual, benglish = phrases_data[best_idx]
            suggestions = "🔍 Partial matches in dialect:\n\n" + format_suggestions_from_indices(indices, "text", scores)
            return dialect_bengali, actual, benglish, suggestions

        # 6) Close textual matches using difflib
        close_matches = difflib.get_close_matches(q_lower, dialects_lower, n=top_k, cutoff=0.3)
        if close_matches:
            indices = [dialects_lower.index(m) for m in close_matches]
            text_sim_scores = [difflib.SequenceMatcher(None, q_lower, m).ratio() for m in close_matches]
            best_idx = indices[0]
            d, dialect_bengali, actual, benglish = phrases_data[best_idx]
            suggestions = "🔍 Similar dialect phrases:\n\n" + format_suggestions_from_indices(indices, "text", text_sim_scores)
            return dialect_bengali, actual, benglish, suggestions

        # 7) Rule engine fallback (even with some unknown words)
        sample_lines = "\n".join(f"• {p[0]}" for p in phrases_data[:5])
        return dialect_out, actual_out, benglish_out, f"🤖 RULE ENGINE ATTEMPT (some unknown words):\n{explanation}\n\n💡 Try these sample phrases:\n" + sample_lines

    except Exception as ex:
        # UI boundary: report the failure in the status box instead of crashing.
        tb = traceback.format_exc()
        return "", "", "", f"Runtime error:\n{str(ex)}\n\nTraceback:\n{tb}"

def show_semantic_analysis(user_text):
    """Describe the semantic patterns detected in ``user_text`` for display.

    Returns an empty string for ``None`` or blank input, a
    ``"Detected patterns: ..."`` line listing each ``pattern → meaning`` pair
    when something matched, and ``"No specific patterns detected"`` otherwise.
    """
    # Guard against None as well as blank text, as translate_text does.
    if not (user_text or "").strip():
        return ""

    patterns, _ = semantic_analysis(user_text)
    if not patterns:
        return "No specific patterns detected"

    described = ", ".join(f"{pattern} → {meaning}" for pattern, meaning, _kind in patterns)
    return f"Detected patterns: {described}"

# Custom CSS for a softer, less blinding color scheme.
# Passed to gr.Blocks(css=...) below. It sets the page font, gives .gr-box a
# light rounded border, and makes .gr-button green with a darker hover shade.
css = """
body {
    font-family: Arial, sans-serif;
}
.gr-box {
    border: 1px solid #e0e0e0;
    border-radius: 8px;
}
.gr-button {
    background: #4CAF50;
    color: white;
}
.gr-button:hover {
    background: #45a049;
}
"""

# Build Gradio UI with a softer theme.
# The Blocks app is bound to `demo`; it is only launched by the __main__ guard
# at the bottom of this section.
with gr.Blocks(title="Enhanced Dialect Translator", css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌍 Dialect Bengali → Actual Bengali → Benglish")
    gr.Markdown("Type a phrase in your dialect. The app uses both text and semantic matching to find similar phrases.")

    # Define input component first: the examples widget and both event
    # handlers below refer to it.
    inp = gr.Textbox(label="Type phrase in Dialect Bengali", placeholder="e.g. Kita kobor? Sob bala asoin ni")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Examples to try:")
            # Sample inputs that are fed into `inp`.
            examples = gr.Examples(
                examples=["tew", "bala or", "Kita kobor?", "tente", "to", "se hole", "Sob bala asoin ni", "kano tara"],
                inputs=inp,
                label="Try these examples"
            )
        with gr.Column(scale=2):
            btn = gr.Button("Translate / Find", variant="primary")

    # Three result boxes; they receive the first three values returned by
    # translate_text.
    with gr.Row():
        out_dialect = gr.Textbox(label="Dialect Bengali (Bengali Script)")
        out_actual = gr.Textbox(label="Actual Bengali (Standard)")
        out_benglish = gr.Textbox(label="Benglish (Phonetic English)")

    # Shows the text returned by show_semantic_analysis.
    with gr.Row():
        semantic_info = gr.Textbox(label="Semantic Analysis", lines=2)

    # Receives the fourth value returned by translate_text (status/candidates).
    suggestions = gr.Textbox(label="Status / Suggestions / Top Candidates", lines=8)

    # Set up event handlers
    # Button click: run the full translation pipeline on the input text.
    btn.click(
        fn=translate_text,
        inputs=[inp],
        outputs=[out_dialect, out_actual, out_benglish, suggestions]
    )

    # Input change event: refresh the semantic-analysis box.
    inp.change(
        fn=show_semantic_analysis,
        inputs=[inp],
        outputs=[semantic_info]
    )

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()