import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
### Space initialisation
# Load leaderboard data with multi-header, do not set index initially
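# Expected layout (an assumption based on the column tuples used below): the CSV has two
# header rows, where row 1 gives the category (Alignment, Descriptiveness, Complexity,
# Side effects, ...) and row 2 gives the metric name (e.g. N-avg↑).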
LEADERBOARD_DF_ORIGINAL = pd.read_csv("leaderboard_data.csv", header=[0, 1])
# Calculate Average N-avg and Rank
# Identify N-avg columns (adjust if names are different in CSV header row 2)
n_avg_cols_to_average = [
    ('Alignment', 'N-avg↑'),
    ('Descriptiveness', 'N-avg↑'),
    ('Complexity', 'N-avg↑'),
    ('Side effects', 'N-avg↑')
]
# Ensure these columns are numeric, coercing any non-numeric entries to NaN
for col_tuple in n_avg_cols_to_average:
    if col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
        LEADERBOARD_DF_ORIGINAL[col_tuple] = pd.to_numeric(LEADERBOARD_DF_ORIGINAL[col_tuple], errors='coerce')
    else:
        print(f"Warning: N-avg column {col_tuple} not found for averaging.")
# Calculate average, handling cases where some N-avg columns might be missing
existing_n_avg_cols = [col for col in n_avg_cols_to_average if col in LEADERBOARD_DF_ORIGINAL.columns]
if existing_n_avg_cols:
    LEADERBOARD_DF_ORIGINAL[('Avg-', ' N-avg')] = LEADERBOARD_DF_ORIGINAL[existing_n_avg_cols].mean(axis=1)
    LEADERBOARD_DF_ORIGINAL[('Avg-', 'Rank')] = LEADERBOARD_DF_ORIGINAL[('Avg-', ' N-avg')].rank(method='min', ascending=False).astype(int)
else:
    LEADERBOARD_DF_ORIGINAL[('Avg-', ' N-avg')] = np.nan
    LEADERBOARD_DF_ORIGINAL[('Avg-', 'Rank')] = np.nan
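# Rank 1 corresponds to the highest average N-avg (the ↑ metrics are higher-is-better).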
# Reorder columns to put Rank and Average N-avg first, then Model, then the rest
model_col_tuple = ('Model', 'Model') # Original name of the model column
rank_col_tuple = ('Avg-', 'Rank')
avg_navg_col_tuple = ('Avg-', ' N-avg')
new_col_order = []
if rank_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
    new_col_order.append(rank_col_tuple)
if avg_navg_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
    new_col_order.append(avg_navg_col_tuple)
if model_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
    new_col_order.append(model_col_tuple)
for col in LEADERBOARD_DF_ORIGINAL.columns:
    if col not in new_col_order:
        new_col_order.append(col)
LEADERBOARD_DF_ORIGINAL = LEADERBOARD_DF_ORIGINAL[new_col_order]
# Sort by Rank ascending
if rank_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
    LEADERBOARD_DF_ORIGINAL = LEADERBOARD_DF_ORIGINAL.sort_values(by=rank_col_tuple, ascending=True)
# Function to prepare DataFrame for display (format headers, ensure Model column)
def format_leaderboard_df_for_display(df_orig):
    df_display = df_orig.copy()
    new_columns = []
    for col_tuple in df_display.columns:
        if col_tuple == ('Avg-', 'Rank'):
            new_columns.append('Overall Rank')
        elif col_tuple == ('Avg-', ' N-avg'):
            new_columns.append('Average N-avg')
        elif col_tuple == ('Model', 'Model'):
            new_columns.append('Model')
        else:
            new_columns.append(f"{col_tuple[0]}\n{col_tuple[1]}")
    df_display.columns = new_columns
    # Create a new DataFrame with the formatted column names for display
    # and apply formatting to the 'Average N-avg' data if it exists
    temp_formatted_df = pd.DataFrame(df_display.values, columns=new_columns, index=df_display.index)
    if 'Average N-avg' in temp_formatted_df.columns:
        # Ensure the column is numeric before formatting, in case it became object type
        temp_formatted_df['Average N-avg'] = pd.to_numeric(temp_formatted_df['Average N-avg'], errors='coerce')
        temp_formatted_df['Average N-avg'] = temp_formatted_df['Average N-avg'].map(lambda x: f"{x:.4f}" if pd.notnull(x) else '-')
    # Convert the 'Overall Rank' to an integer string to avoid '.0'
    if 'Overall Rank' in temp_formatted_df.columns:
        def format_rank_with_emoji(rank_val):
            if pd.isnull(rank_val):
                return '-'
            try:
                rank_int = int(float(rank_val))  # Ensure conversion from potential float string
                if rank_int == 1:
                    return f"{rank_int} 🥇"
                elif rank_int == 2:
                    return f"{rank_int} 🥈"
                elif rank_int == 3:
                    return f"{rank_int} 🥉"
                else:
                    return f"{rank_int}"
            except ValueError:
                return str(rank_val)  # Return original if not convertible to int
        temp_formatted_df['Overall Rank'] = temp_formatted_df['Overall Rank'].map(format_rank_with_emoji)
    return temp_formatted_df
LEADERBOARD_DF_DISPLAY_INIT = format_leaderboard_df_for_display(LEADERBOARD_DF_ORIGINAL)
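# Load the bias-aware evaluation results (single-level header CSV)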
BIAS_DF = pd.read_csv("bias_evaluation_data.csv")
BIAS_DF = BIAS_DF.fillna("-").astype(str)  # fill missing values before casting, so NaN shows as '-' rather than 'nan'
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🧠 Unified perf eval VLM captioners", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Column():
                table_output = gr.DataFrame(value=LEADERBOARD_DF_DISPLAY_INIT, label="Leaderboard Results", interactive=True, wrap=True)
                gr.Markdown("---")
                gr.Markdown("### Display Options")
                model_filter_choices = LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].unique().tolist()
                model_selector = gr.CheckboxGroup(
                    choices=model_filter_choices,
                    value=model_filter_choices,
                    label="Filter by Model types:"
                )
                def update_table(selected_models_from_filter):
                    # Keep only rows whose model name is among the valid selections;
                    # an empty selection yields an empty table.
                    valid_selected_models = [model for model in (selected_models_from_filter or []) if model in model_filter_choices]
                    if valid_selected_models:
                        filtered_df_orig = LEADERBOARD_DF_ORIGINAL[LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].isin(valid_selected_models)]
                    else:
                        filtered_df_orig = LEADERBOARD_DF_ORIGINAL.iloc[0:0]
                    df_to_display = format_leaderboard_df_for_display(filtered_df_orig)
                    # Return the DataFrame directly; Gradio applies it to the bound output component
                    return df_to_display
                model_selector.change(
                    fn=update_table,
                    inputs=[model_selector],
                    outputs=[table_output]
                )
with gr.TabItem("📝 Bias-aware eval VLM ", elem_id="llm-benchmark-tab-table", id=2):
with gr.Column():
gr.Markdown("### Bias-Aware Evaluation Results")
bias_table_output = gr.DataFrame(value=BIAS_DF, label="Bias Evaluation Results", interactive=True, wrap=True)
gr.Markdown("---")
gr.Markdown("### Display Options for Bias Table")
bias_all_columns_list = BIAS_DF.columns.tolist()
bias_column_selector = gr.CheckboxGroup(
choices=bias_all_columns_list,
value=bias_all_columns_list,
label="Select Columns to Display:"
)
bias_type_filter_choices = BIAS_DF["Bias_Type"].unique().tolist() if "Bias_Type" in BIAS_DF.columns else []
bias_type_selector = gr.CheckboxGroup(
choices=bias_type_filter_choices,
value=bias_type_filter_choices,
label="Filter by Bias Type:"
)
bias_model_filter_choices = BIAS_DF["Model"].unique().tolist() if "Model" in BIAS_DF.columns else []
bias_model_selector_for_bias_tab = gr.CheckboxGroup(
choices=bias_model_filter_choices,
value=bias_model_filter_choices,
label="Filter by Model:"
)
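                # Rebuild the bias table from the three selectors: rows are filtered by the
                # chosen bias types and models, then restricted to the selected columns.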
                def update_bias_table(selected_cols, selected_bias_types, selected_models):
                    temp_df = BIAS_DF.copy()
                    # Filter rows by bias type; an empty selection yields an empty table
                    if selected_bias_types and "Bias_Type" in temp_df.columns:
                        temp_df = temp_df[temp_df["Bias_Type"].isin(selected_bias_types)]
                    elif not selected_bias_types and "Bias_Type" in temp_df.columns:
                        temp_df = pd.DataFrame(columns=BIAS_DF.columns)
                    # Filter rows by model (clearing the model selection likewise empties the table)
                    if selected_models and "Model" in temp_df.columns:
                        temp_df = temp_df[temp_df["Model"].isin(selected_models)]
                    elif not selected_models and "Model" in temp_df.columns:
                        if not selected_bias_types:
                            temp_df = pd.DataFrame(columns=BIAS_DF.columns)
                        elif "Bias_Type" in temp_df.columns and temp_df["Bias_Type"].isin(selected_bias_types).any():
                            temp_df = temp_df[~temp_df["Model"].isin(BIAS_DF["Model"].unique())]
                    # Restrict to the selected columns, falling back to all columns if none remain valid
                    valid_selected_cols = [col for col in selected_cols if col in temp_df.columns]
                    if not valid_selected_cols and not temp_df.empty:
                        final_df = temp_df
                    elif not valid_selected_cols and temp_df.empty:
                        final_df = pd.DataFrame(columns=selected_cols)
                    else:
                        final_df = temp_df[valid_selected_cols]
                    # Return the DataFrame directly; Gradio applies it to the bound output component
                    return final_df
                bias_column_selector.change(fn=update_bias_table, inputs=[bias_column_selector, bias_type_selector, bias_model_selector_for_bias_tab], outputs=[bias_table_output])
                bias_type_selector.change(fn=update_bias_table, inputs=[bias_column_selector, bias_type_selector, bias_model_selector_for_bias_tab], outputs=[bias_table_output])
                bias_model_selector_for_bias_tab.change(fn=update_bias_table, inputs=[bias_column_selector, bias_type_selector, bias_model_selector_for_bias_tab], outputs=[bias_table_output])
with gr.TabItem("🧑‍🍳 User Type & Preference-Oriented Scores ", elem_id="llm-benchmark-tab-table", id=3):
with gr.Column():
gr.Markdown("### Preference-Oriented Scores by User Type and Model")
                def create_preference_score_chart():
                    user_types = ['Detail-oriented', 'Risk-conscious', 'Accuracy-focused']
                    models = ['MiniGPT-4', 'InstructBLIP', 'LLaVA-1.5', 'mPLUG-Owl2', 'Qwen2-VL']
                    scores = np.array([
                        [0.20, 0.35, 0.45, 0.50, 0.85],  # Detail-oriented
                        [0.40, 0.55, 0.67, 0.53, 0.58],  # Risk-conscious
                        [0.20, 0.60, 0.72, 0.69, 0.75]   # Accuracy-focused
                    ])
                    x = np.arange(len(user_types))
                    width = 0.15
                    fig, ax = plt.subplots(figsize=(12, 7))
                    for i, model in enumerate(models):
                        ax.bar(x + i * width - (width * (len(models) - 1) / 2), scores[:, i], width, label=model)
                    ax.set_xlabel('User type', fontsize=12)
                    ax.set_ylabel('Preference-oriented score', fontsize=12)
                    ax.set_title('Preference-oriented scores by User Type and Model', fontsize=14)
                    ax.set_xticks(x)
                    ax.set_xticklabels(user_types, fontsize=10)
                    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
                    plt.ylim(0, 1.1)
                    plt.grid(axis='y', linestyle='--', alpha=0.7)
                    plt.tight_layout(rect=[0, 0, 0.85, 1])
                    return fig
                gr.Plot(value=create_preference_score_chart)
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
gr.Markdown("---")
link_to_discussion = "https://huggingface.co/login?next=%2Fspaces%2Fnvidia%2FLOTUS-VLM-Bias%2Fdiscussions%2Fnew"
gr.HTML(f'''
<div style="text-align: center; margin-top: 20px; margin-bottom: 20px;">
<a href="{link_to_discussion}" target="_blank" rel="noopener noreferrer"
style="background-color: #007bff; color: white; padding: 10px 20px; text-decoration: none; border-radius: 5px; font-size: 16px;">
Submit Your Results / Open a New Discussion
</a>
</div>
''')
demo.queue(default_concurrency_limit=40).launch()