import sys

import gradio as gr
import pandas as pd
import plotly.express as px
from gradio.themes.utils import colors

from results.parse import parse_agg, read_data
from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
from style.css_html_js import custom_css
from utils import filter_bench, filter_bench_all, filter_RTLRepo, handle_special_cases


def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
    subset = df.copy()

    # Filter by task specific benchmarks when 'All' benchmarks is selected
    if task == "Spec-to-RTL":
        valid_benchmarks = s2r_benchs
        if benchmark == "All":
            subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
    elif task == "Code Completion":
        valid_benchmarks = cc_benchs
        if benchmark == "All":
            subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
    elif task == "Line Completion":
        valid_benchmarks = lc_benchs
        if benchmark == "All":
            subset = subset[subset["Benchmark"].isin(valid_benchmarks)]

    if benchmark != "All":
        subset = df[df["Benchmark"] == benchmark]
    if model_type != "All":
        # without emojis
        subset = subset[subset["Model Type"] == model_type.split(" ")[0]]
    if search_query:
        subset = subset[
            subset["Model"].str.contains(search_query, case=False, na=False)
        ]

    max_params = float(max_params)
    subset = subset[subset["Params"] <= max_params]

    if benchmark == "All":
        if task == "Spec-to-RTL":
            return filter_bench_all(subset, df_agg, agg_column="Agg S2R")
        elif task == "Code Completion":
            return filter_bench_all(subset, df_agg, agg_column="Agg MC")
        elif task == "Line Completion":
            return filter_RTLRepo(subset)
    elif benchmark == "RTL-Repo":
        return filter_RTLRepo(subset)
    else:
        agg_column = None
        if benchmark == "VerilogEval S2R":
            agg_column = "Agg VerilogEval S2R"
        elif benchmark == "VerilogEval MC":
            agg_column = "Agg VerilogEval MC"
        elif benchmark == "RTLLM":
            agg_column = "Agg RTLLM"
        elif benchmark == "VeriGen":
            agg_column = "Agg VeriGen"
        return filter_bench(subset, df_agg, agg_column)


def update_benchmarks_by_task(task):
    if task == "Spec-to-RTL":
        new_benchmarks = ["All"] + s2r_benchs
    elif task == "Code Completion":
        new_benchmarks = ["All"] + cc_benchs
    elif task == "Line Completion":
        new_benchmarks = lc_benchs
    else:
        new_benchmarks = ["All"] + benchmarks
    benchmark_value = "All" if "All" in new_benchmarks else new_benchmarks[0]
    filtered = filter_leaderboard(
        task,
        benchmark_value,
        model_type_dropdown.value,
        search_box.value,
        params_slider.value,
    )
    return gr.update(value=benchmark_value, choices=new_benchmarks), filtered


def generate_scatter_plot(benchmark, metric):
    benchmark, metric = handle_special_cases(benchmark, metric)

    subset = df[df["Benchmark"] == benchmark]
    if benchmark == "RTL-Repo":
        subset = subset[subset["Metric"].str.contains("EM", case=False, na=False)]
        detailed_scores = subset.groupby("Model", as_index=False)["Score"].mean()
        detailed_scores.rename(columns={"Score": "Exact Matching (EM)"}, inplace=True)
    else:
        detailed_scores = subset.pivot_table(
            index="Model", columns="Metric", values="Score"
        ).reset_index()

    details = df[["Model", "Params", "Model Type"]].drop_duplicates("Model")
    scatter_data = pd.merge(detailed_scores, details, on="Model", how="left").dropna(
        subset=["Params", metric]
    )

    scatter_data["x"] = scatter_data["Params"]
    scatter_data["y"] = scatter_data[metric]
    scatter_data["size"] = (scatter_data["x"] ** 0.3) * 40

    type_colors = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
    scatter_data["color"] = scatter_data["Model Type"].map(type_colors).fillna("gray")

    y_axis_limits = {
        "Functionality (FNC)": [5, 90],
        "Syntax (STX)": [20, 100],
        "Synthesis (SYN)": [5, 90],
        "Power": [0, 50],
        "Performance": [0, 50],
        "Area": [0, 50],
        "Exact Matching (EM)": [0, 50],
    }
    y_range = y_axis_limits.get(metric, [0, 80])

    fig = px.scatter(
        scatter_data,
        x="x",
        y="y",
        log_x=True,
        size="size",
        color="Model Type",
        text="Model",
        hover_data={metric: ":.2f"},
        title=f"Params vs. {metric} for {benchmark}",
        labels={"x": "# Params (Log Scale)", "y": metric},
        template="plotly_white",
        height=600,
        width=1200,
    )
    fig.update_traces(
        textposition="top center",
        textfont_size=10,
        marker=dict(opacity=0.8, line=dict(width=0.5, color="black")),
    )
    fig.update_layout(
        xaxis=dict(
            showgrid=True,
            type="log",
            tickmode="array",
            tickvals=[8, 14, 32, 72, 200, 700],
            ticktext=["8", "14", "32", "72", "200", "700"],
        ),
        showlegend=False,
        yaxis=dict(range=y_range),
        margin=dict(l=50, r=50, t=50, b=50),
        plot_bgcolor="white",
    )
    return fig


js_func = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }
}
"""

with gr.Blocks(
    css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=colors.emerald)
) as app:
    df, benchmarks, metrics, default_metric = read_data()
    df_agg = parse_agg("./results/aggregated_scores.csv")

    tasks = ["Spec-to-RTL", "Code Completion", "Line Completion"]
    s2r_benchs = ["VerilogEval S2R", "RTLLM"]
    cc_benchs = ["VerilogEval MC", "VeriGen"]
    lc_benchs = ["RTL-Repo"]
    non_rtl_metrics = [
        "Syntax (STX)",
        "Functionality (FNC)",
        "Synthesis (SYN)",
        "Power",
        "Performance",
        "Area",
    ]
    rtl_metrics = ["Exact Matching (EM)"]
    model_types = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]
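
    # Sketch (assumption): the filtering and plotting code below expects `read_data()` to
    # return a long-format DataFrame with one row per (Model, Benchmark, Metric) score.
    # The column list here is inferred from how `df` is used in this file, not from a
    # documented read_data() contract; adjust it if the loader's schema differs.
    expected_columns = {"Model", "Benchmark", "Metric", "Score", "Params", "Model Type"}
    missing_columns = expected_columns - set(df.columns)
    if missing_columns:
        raise ValueError(f"read_data() is missing expected columns: {missing_columns}")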
    gr.HTML(
        """
        <img src="logo.png" alt="TuRTLe Logo">
        """
    )
    gr.HTML(
        """

If you have any inquiries or wish to collaborate: hpai@bsc.es

        """
    )
    gr.HTML(
        """

Welcome to the TuRTLe Model Leaderboard! TuRTLe is a unified evaluation framework designed to systematically assess Large Language Models (LLMs) in RTL (Register-Transfer Level) generation for hardware design. Evaluation criteria include syntax correctness, functional accuracy, synthesizability, and post-synthesis quality (PPA: Power, Performance, Area). TuRTLe integrates multiple benchmarks to highlight the strengths and weaknesses of available LLMs. Use the filters below to explore the different RTL benchmarks and models.

NEW UPDATE (JUNE 2025): Our framework is now open-source on GitHub, and we have added 7 recent models, bringing the total to 40 base and instruct models evaluated across 5 RTL benchmarks!

        """
    )

    with gr.Tabs():
        with gr.Tab("Leaderboard"):
            with gr.Row(equal_height=True):
                with gr.Column():
                    task_radio = gr.Radio(
                        choices=tasks, label="Select Task", value="Spec-to-RTL"
                    )
                with gr.Column():
                    benchmark_radio = gr.Radio(
                        choices=["All"] + s2r_benchs,
                        label="Select Benchmark",
                        value="All",
                    )
            with gr.Row(equal_height=True):
                search_box = gr.Textbox(
                    label="Search Model",
                    placeholder="Type model name...",
                    scale=2,
                )
                model_type_dropdown = gr.Radio(
                    choices=model_types,
                    label="Select Model Type",
                    value="All",
                    scale=3,
                )
                params_slider = gr.Slider(
                    minimum=df["Params"].min(),
                    maximum=700,
                    value=700,
                    label="Max Params",
                    step=1,
                    scale=2,
                )
            leaderboard = gr.DataFrame(
                value=filter_leaderboard("Spec-to-RTL", "All", "All", "", 700),
                headers="first row",
                show_row_numbers=True,
                wrap=True,
                datatype=[
                    "markdown",
                    "html",
                ],
                interactive=False,
                column_widths=[
                    "7%",
                    "24%",
                    "17%",
                    "10%",
                    "13%",
                    "10%",
                    "14%",
                ],
                elem_classes="dataframe-leaderboard",
            )
        with gr.Tab("Plot View"):
            with gr.Row(equal_height=True):
                default_benchmark = s2r_benchs[0]
                bubble_benchmark = gr.Dropdown(
                    choices=benchmarks,
                    label="Select Benchmark",
                    value=default_benchmark,
                    elem_classes="gr-dropdown",
                )
                default_metric = non_rtl_metrics[0]
                bubble_metric = gr.Dropdown(
                    choices=non_rtl_metrics,
                    label="Select Metric",
                    value=default_metric,
                )
            with gr.Row(equal_height=True):
                scatter_plot = gr.Plot(
                    value=generate_scatter_plot(default_benchmark, default_metric),
                    label="Bubble Chart",
                    elem_id="full-width-plot",
                )
        with gr.Tab("Metrics Information"):
            with open("./static/metrics.md", "r") as file:
                gr.Markdown(
                    file.read(),
                    latex_delimiters=[
                        {"left": "$$", "right": "$$", "display": True},
                        {"left": "$", "right": "$", "display": False},
                    ],
                    elem_classes="metrics-page",
                )
        with gr.Tab("About Us"):
            gr.HTML(
                """
                <img src="hpai_logo_grad.png" alt="HPAI Group Logo">
                <img src="bsc-logo.png" alt="BSC Logo">

The High-Performance Artificial Intelligence (HPAI) group is part of the Barcelona Supercomputing Center (BSC). This leaderboard is maintained by HPAI as part of our commitment to open science.

Feel free to contact us:

Email: hpai@bsc.es

                """
            )
        with gr.Tab("References"):
            gr.HTML(
                """

Feel free to contact us:

                """
            )
            with gr.Row():
                with gr.Accordion("📙 Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        lines=10,
                        elem_id="citation-button",
                        show_copy_button=True,
                    )

    # event handlers, ugly way but it works
    task_radio.change(
        fn=update_benchmarks_by_task,
        inputs=[task_radio],
        outputs=[benchmark_radio, leaderboard],
    )
    benchmark_radio.change(
        fn=filter_leaderboard,
        inputs=[
            task_radio,
            benchmark_radio,
            model_type_dropdown,
            search_box,
            params_slider,
        ],
        outputs=leaderboard,
    )
    model_type_dropdown.change(
        fn=filter_leaderboard,
        inputs=[
            task_radio,
            benchmark_radio,
            model_type_dropdown,
            search_box,
            params_slider,
        ],
        outputs=leaderboard,
    )
    search_box.change(
        fn=filter_leaderboard,
        inputs=[
            task_radio,
            benchmark_radio,
            model_type_dropdown,
            search_box,
            params_slider,
        ],
        outputs=leaderboard,
    )
    params_slider.change(
        fn=filter_leaderboard,
        inputs=[
            task_radio,
            benchmark_radio,
            model_type_dropdown,
            search_box,
            params_slider,
        ],
        outputs=leaderboard,
    )

    def on_benchmark_change(benchmark, _):
        if benchmark == "RTL-Repo":
            metric = "Exact Matching (EM)"
            return gr.update(choices=rtl_metrics, value=metric), generate_scatter_plot(
                benchmark, metric
            )
        else:
            metric = non_rtl_metrics[0]
            return gr.update(
                choices=non_rtl_metrics[:-1], value=metric
            ), generate_scatter_plot(benchmark, metric)

    def on_metric_change(benchmark, metric):
        benchmark, metric = handle_special_cases(benchmark, metric)
        fig = generate_scatter_plot(benchmark, metric)
        return gr.update(value=benchmark), fig

    bubble_benchmark.change(
        fn=on_benchmark_change,
        inputs=[bubble_benchmark, bubble_metric],
        outputs=[bubble_metric, scatter_plot],
        js="""
        // this is to avoid resetting user scroll each time a plot is re-generated
        (benchmark, metric) => {
            let scrollY = window.scrollY;
            const observer = new MutationObserver(() => {
                window.scrollTo(0, scrollY);
                observer.disconnect();
            });
            observer.observe(document.getElementById('full-width-plot'), { childList: true });
            return [benchmark, metric];
        }
        """,
    )
    bubble_metric.change(
        fn=on_metric_change,
        inputs=[bubble_benchmark, bubble_metric],
        outputs=[bubble_benchmark, scatter_plot],
        js="""
        // this is to avoid resetting user scroll each time a plot is re-generated
        (benchmark, metric) => {
            let scrollY = window.scrollY;
            const observer = new MutationObserver(() => {
                window.scrollTo(0, scrollY);
                observer.disconnect();
            });
            observer.observe(document.getElementById('full-width-plot'), { childList: true });
            return [benchmark, metric];
        }
        """,
    )

app.launch(
    allowed_paths=[
        "logo.png",
        "hpai_logo_grad.png",
        "bsc-logo.png",
    ]
)