import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css

### Space initialisation
# Load leaderboard data with a two-row header; do not set an index yet
LEADERBOARD_DF_ORIGINAL = pd.read_csv("leaderboard_data.csv", header=[0, 1])

# Calculate the average N-avg and the overall rank.
# Identify the N-avg columns (adjust if the names differ in the second CSV header row).
n_avg_cols_to_average = [
    ('Alignment', 'N-avg↑'),
    ('Descriptiveness', 'N-avg↑'),
    ('Complexity', 'N-avg↑'),
    ('Side effects', 'N-avg↑'),
]

# Ensure these columns are numeric, coercing errors to NaN (they should already be numbers)
for col_tuple in n_avg_cols_to_average:
    if col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
        LEADERBOARD_DF_ORIGINAL[col_tuple] = pd.to_numeric(LEADERBOARD_DF_ORIGINAL[col_tuple], errors='coerce')
    else:
        print(f"Warning: N-avg column {col_tuple} not found for averaging.")

# Calculate the average, handling cases where some N-avg columns might be missing
existing_n_avg_cols = [col for col in n_avg_cols_to_average if col in LEADERBOARD_DF_ORIGINAL.columns]
if existing_n_avg_cols:
    LEADERBOARD_DF_ORIGINAL[('Avg-', ' N-avg')] = LEADERBOARD_DF_ORIGINAL[existing_n_avg_cols].mean(axis=1)
    LEADERBOARD_DF_ORIGINAL[('Avg-', 'Rank')] = (
        LEADERBOARD_DF_ORIGINAL[('Avg-', ' N-avg')].rank(method='min', ascending=False).astype(int)
    )
else:
    LEADERBOARD_DF_ORIGINAL[('Avg-', ' N-avg')] = np.nan
    LEADERBOARD_DF_ORIGINAL[('Avg-', 'Rank')] = np.nan

# Reorder columns: Rank and Average N-avg first, then Model, then the rest
model_col_tuple = ('Model', 'Model')  # Original name of the model column
rank_col_tuple = ('Avg-', 'Rank')
avg_navg_col_tuple = ('Avg-', ' N-avg')

new_col_order = []
if rank_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
    new_col_order.append(rank_col_tuple)
if avg_navg_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
    new_col_order.append(avg_navg_col_tuple)
if model_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
    new_col_order.append(model_col_tuple)
for col in LEADERBOARD_DF_ORIGINAL.columns:
    if col not in new_col_order:
        new_col_order.append(col)
LEADERBOARD_DF_ORIGINAL = LEADERBOARD_DF_ORIGINAL[new_col_order]

# Sort by Rank, ascending
if rank_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
    LEADERBOARD_DF_ORIGINAL = LEADERBOARD_DF_ORIGINAL.sort_values(by=rank_col_tuple, ascending=True)


def format_leaderboard_df_for_display(df_orig):
    """Prepare a DataFrame for display: flatten the multi-level headers and format Rank / Average N-avg."""
    df_display = df_orig.copy()
    new_columns = []
    for col_tuple in df_display.columns:
        if col_tuple == ('Avg-', 'Rank'):
            new_columns.append('Overall Rank')
        elif col_tuple == ('Avg-', ' N-avg'):
            new_columns.append('Average N-avg')
        elif col_tuple == ('Model', 'Model'):
            new_columns.append('Model')
        else:
            new_columns.append(f"{col_tuple[0]}\n{col_tuple[1]}")
    df_display.columns = new_columns

    # Create a new DataFrame with the flattened column names for display
    # and format the 'Average N-avg' values if the column exists
    temp_formatted_df = pd.DataFrame(df_display.values, columns=new_columns, index=df_display.index)
    if 'Average N-avg' in temp_formatted_df.columns:
        # Ensure the column is numeric before formatting, in case it became object-typed
        temp_formatted_df['Average N-avg'] = pd.to_numeric(temp_formatted_df['Average N-avg'], errors='coerce')
        temp_formatted_df['Average N-avg'] = temp_formatted_df['Average N-avg'].map(
            lambda x: f"{x:.4f}" if pd.notnull(x) else '-'
        )

    # Convert 'Overall Rank' to an integer string (avoids a trailing '.0') and add medal emojis
    if 'Overall Rank' in temp_formatted_df.columns:
        def format_rank_with_emoji(rank_val):
            if pd.isnull(rank_val):
                return '-'
            try:
                rank_int = int(float(rank_val))  # Handle values stored as float strings
                if rank_int == 1:
                    return f"{rank_int} 🥇"
                elif rank_int == 2:
                    return f"{rank_int} 🥈"
                elif rank_int == 3:
                    return f"{rank_int} 🥉"
                else:
                    return f"{rank_int}"
            except ValueError:
                return str(rank_val)  # Return the original value if it is not convertible to int

        temp_formatted_df['Overall Rank'] = temp_formatted_df['Overall Rank'].map(format_rank_with_emoji)

    return temp_formatted_df


LEADERBOARD_DF_DISPLAY_INIT = format_leaderboard_df_for_display(LEADERBOARD_DF_ORIGINAL)

BIAS_DF = pd.read_csv("bias_evaluation_data.csv")
# Fill missing values before casting to str; after astype(str), NaN would become the literal string "nan"
BIAS_DF = BIAS_DF.fillna("-").astype(str)

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🧠 Unified perf eval VLM captioners", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Column():
                table_output = gr.DataFrame(
                    value=LEADERBOARD_DF_DISPLAY_INIT,
                    label="Leaderboard Results",
                    interactive=True,
                    wrap=True,
                )
                gr.Markdown("---")
                gr.Markdown("### Display Options")
                model_filter_choices = LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].unique().tolist()
                model_selector = gr.CheckboxGroup(
                    choices=model_filter_choices,
                    value=model_filter_choices,
                    label="Filter by Model types:",
                )

                def update_table(selected_models_from_filter):
                    # Keep only rows whose model is selected; an empty selection yields an empty table
                    filtered_df_orig = LEADERBOARD_DF_ORIGINAL.copy()
                    if not selected_models_from_filter:
                        filtered_df_orig = LEADERBOARD_DF_ORIGINAL[LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].isin([])]
                    else:
                        valid_selected_models = [
                            model for model in selected_models_from_filter if model in model_filter_choices
                        ]
                        if not valid_selected_models:
                            filtered_df_orig = LEADERBOARD_DF_ORIGINAL[LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].isin([])]
                        else:
                            filtered_df_orig = LEADERBOARD_DF_ORIGINAL[
                                LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].isin(valid_selected_models)
                            ]
                    df_to_display = format_leaderboard_df_for_display(filtered_df_orig)
                    # Return the DataFrame directly (component .update() methods were removed in Gradio 4)
                    return df_to_display

                model_selector.change(
                    fn=update_table,
                    inputs=[model_selector],
                    outputs=[table_output],
                )

        with gr.TabItem("📝 Bias-aware eval VLM ", elem_id="llm-benchmark-tab-table", id=2):
            with gr.Column():
                gr.Markdown("### Bias-Aware Evaluation Results")
                bias_table_output = gr.DataFrame(
                    value=BIAS_DF,
                    label="Bias Evaluation Results",
                    interactive=True,
                    wrap=True,
                )
                gr.Markdown("---")
                gr.Markdown("### Display Options for Bias Table")
                bias_all_columns_list = BIAS_DF.columns.tolist()
                bias_column_selector = gr.CheckboxGroup(
                    choices=bias_all_columns_list,
                    value=bias_all_columns_list,
                    label="Select Columns to Display:",
                )
                bias_type_filter_choices = (
                    BIAS_DF["Bias_Type"].unique().tolist() if "Bias_Type" in BIAS_DF.columns else []
                )
                bias_type_selector = gr.CheckboxGroup(
                    choices=bias_type_filter_choices,
                    value=bias_type_filter_choices,
                    label="Filter by Bias Type:",
                )
                bias_model_filter_choices = (
                    BIAS_DF["Model"].unique().tolist() if "Model" in BIAS_DF.columns else []
                )
                bias_model_selector_for_bias_tab = gr.CheckboxGroup(
                    choices=bias_model_filter_choices,
                    value=bias_model_filter_choices,
                    label="Filter by Model:",
                )

                def update_bias_table(selected_cols, selected_bias_types, selected_models):
                    temp_df = BIAS_DF.copy()
                    # Filter by bias type; an empty selection yields an empty table
                    if selected_bias_types and "Bias_Type" in temp_df.columns:
                        temp_df = temp_df[temp_df["Bias_Type"].isin(selected_bias_types)]
                    elif not selected_bias_types and "Bias_Type" in temp_df.columns:
                        temp_df = pd.DataFrame(columns=BIAS_DF.columns)

                    # Filter by model; an empty selection likewise yields an empty table
                    if selected_models and "Model" in temp_df.columns:
                        temp_df = temp_df[temp_df["Model"].isin(selected_models)]
                    elif not selected_models and "Model" in temp_df.columns:
                        if not selected_bias_types:
                            temp_df = pd.DataFrame(columns=BIAS_DF.columns)
                        elif "Bias_Type" in temp_df.columns and temp_df["Bias_Type"].isin(selected_bias_types).any():
                            temp_df = temp_df[~temp_df["Model"].isin(BIAS_DF["Model"].unique())]

                    # Keep only the selected columns (fall back sensibly when none of them exist)
                    valid_selected_cols = [col for col in selected_cols if col in temp_df.columns]
                    if not valid_selected_cols and not temp_df.empty:
                        final_df = temp_df
                    elif not valid_selected_cols and temp_df.empty:
                        final_df = pd.DataFrame(columns=selected_cols)
                    else:
                        final_df = temp_df[valid_selected_cols]
                    # Return the DataFrame directly (see update_table above)
                    return final_df

                bias_column_selector.change(
                    fn=update_bias_table,
                    inputs=[bias_column_selector, bias_type_selector, bias_model_selector_for_bias_tab],
                    outputs=[bias_table_output],
                )
                bias_type_selector.change(
                    fn=update_bias_table,
                    inputs=[bias_column_selector, bias_type_selector, bias_model_selector_for_bias_tab],
                    outputs=[bias_table_output],
                )
                bias_model_selector_for_bias_tab.change(
                    fn=update_bias_table,
                    inputs=[bias_column_selector, bias_type_selector, bias_model_selector_for_bias_tab],
                    outputs=[bias_table_output],
                )

        with gr.TabItem("🧑‍🍳 User Type & Preference-Oriented Scores ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                gr.Markdown("### Preference-Oriented Scores by User Type and Model")

                def create_preference_score_chart():
                    user_types = ['Detail-oriented', 'Risk-conscious', 'Accuracy-focused']
                    models = ['MiniGPT-4', 'InstructBLIP', 'LLaVA-1.5', 'mPLUG-Owl2', 'Qwen2-VL']
                    scores = np.array([
                        [0.20, 0.35, 0.45, 0.50, 0.85],  # Detail-oriented
                        [0.40, 0.55, 0.67, 0.53, 0.58],  # Risk-conscious
                        [0.20, 0.60, 0.72, 0.69, 0.75],  # Accuracy-focused
                    ])
                    x = np.arange(len(user_types))
                    width = 0.15

                    fig, ax = plt.subplots(figsize=(12, 7))
                    # One group of bars per user type, one bar per model, centred around each tick
                    for i, model in enumerate(models):
                        ax.bar(x + i * width - (width * (len(models) - 1) / 2), scores[:, i], width, label=model)

                    ax.set_xlabel('User type', fontsize=12)
                    ax.set_ylabel('Preference-oriented score', fontsize=12)
                    ax.set_title('Preference-oriented scores by User Type and Model', fontsize=14)
                    ax.set_xticks(x)
                    ax.set_xticklabels(user_types, fontsize=10)
                    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
                    plt.ylim(0, 1.1)
                    plt.grid(axis='y', linestyle='--', alpha=0.7)
                    plt.tight_layout(rect=[0, 0, 0.85, 1])
                    return fig

                gr.Plot(value=create_preference_score_chart)

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

    gr.Markdown("---")
    link_to_discussion = "https://huggingface.co/login?next=%2Fspaces%2Fnvidia%2FLOTUS-VLM-Bias%2Fdiscussions%2Fnew"
    # Link out to open a new discussion on the Space
    gr.HTML(f'''
        <a href="{link_to_discussion}" target="_blank">
            Submit Your Results / Open a New Discussion
        </a>
    ''')

demo.queue(default_concurrency_limit=40).launch()