import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
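
# The leaderboard CSV ships a two-row header, so pandas reads it into a
# MultiIndex of (category, metric) column tuples such as ('Alignment', 'N-avg↑').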
LEADERBOARD_DF_ORIGINAL = pd.read_csv("leaderboard_data.csv", header=[0, 1])
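
# Per-category N-avg columns that feed the overall average.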
n_avg_cols_to_average = [
    ('Alignment', 'N-avg↑'),
    ('Descriptiveness', 'N-avg↑'),
    ('Complexity', 'N-avg↑'),
    ('Side effects', 'N-avg↑'),
]
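
# Coerce each N-avg column to numeric; non-numeric placeholders (e.g. "-")
# become NaN so they drop out of the mean below.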
for col_tuple in n_avg_cols_to_average:
    if col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
        LEADERBOARD_DF_ORIGINAL[col_tuple] = pd.to_numeric(LEADERBOARD_DF_ORIGINAL[col_tuple], errors='coerce')
    else:
        print(f"Warning: N-avg column {col_tuple} not found for averaging.")
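
# Average the available N-avg columns into ('Avg-', ' N-avg') and rank it,
# best score first; method='min' gives tied models the same (best) rank.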
existing_n_avg_cols = [col for col in n_avg_cols_to_average if col in LEADERBOARD_DF_ORIGINAL.columns]
if existing_n_avg_cols:
    LEADERBOARD_DF_ORIGINAL[('Avg-', ' N-avg')] = LEADERBOARD_DF_ORIGINAL[existing_n_avg_cols].mean(axis=1)
    # Nullable Int64 instead of int: a plain astype(int) raises if any model
    # is missing every N-avg score and its average (hence rank) is NaN.
    LEADERBOARD_DF_ORIGINAL[('Avg-', 'Rank')] = (
        LEADERBOARD_DF_ORIGINAL[('Avg-', ' N-avg')].rank(method='min', ascending=False).astype('Int64')
    )
else:
    LEADERBOARD_DF_ORIGINAL[('Avg-', ' N-avg')] = np.nan
    LEADERBOARD_DF_ORIGINAL[('Avg-', 'Rank')] = np.nan
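
# Move Rank, Average N-avg, and Model to the front, keeping the remaining
# columns in their original order.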
model_col_tuple = ('Model', 'Model')
rank_col_tuple = ('Avg-', 'Rank')
avg_navg_col_tuple = ('Avg-', ' N-avg')

new_col_order = []
if rank_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
    new_col_order.append(rank_col_tuple)
if avg_navg_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
    new_col_order.append(avg_navg_col_tuple)
if model_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
    new_col_order.append(model_col_tuple)

for col in LEADERBOARD_DF_ORIGINAL.columns:
    if col not in new_col_order:
        new_col_order.append(col)
LEADERBOARD_DF_ORIGINAL = LEADERBOARD_DF_ORIGINAL[new_col_order]
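
# Sort so the best-ranked models come first; NaN ranks sink to the bottom.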
if rank_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
    LEADERBOARD_DF_ORIGINAL = LEADERBOARD_DF_ORIGINAL.sort_values(by=rank_col_tuple, ascending=True)
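
# Flatten the MultiIndex headers into display names and pretty-print the
# numeric columns for the Gradio table.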
def format_leaderboard_df_for_display(df_orig):
    df_display = df_orig.copy()
    new_columns = []
    for col_tuple in df_display.columns:
        if col_tuple == ('Avg-', 'Rank'):
            new_columns.append('Overall Rank')
        elif col_tuple == ('Avg-', ' N-avg'):
            new_columns.append('Average N-avg')
        elif col_tuple == ('Model', 'Model'):
            new_columns.append('Model')
        else:
            new_columns.append(f"{col_tuple[0]}\n{col_tuple[1]}")
    df_display.columns = new_columns

    # Format the overall average to four decimals, with '-' for missing values.
    if 'Average N-avg' in df_display.columns:
        df_display['Average N-avg'] = pd.to_numeric(df_display['Average N-avg'], errors='coerce')
        df_display['Average N-avg'] = df_display['Average N-avg'].map(
            lambda x: f"{x:.4f}" if pd.notnull(x) else '-'
        )

    # Decorate the top three ranks with medal emoji.
    if 'Overall Rank' in df_display.columns:
        def format_rank_with_emoji(rank_val):
            if pd.isnull(rank_val):
                return '-'
            try:
                rank_int = int(float(rank_val))
            except (ValueError, TypeError):
                return str(rank_val)
            medals = {1: '🥇', 2: '🥈', 3: '🥉'}
            return f"{rank_int} {medals[rank_int]}" if rank_int in medals else f"{rank_int}"
        df_display['Overall Rank'] = df_display['Overall Rank'].map(format_rank_with_emoji)

    return df_display

LEADERBOARD_DF_DISPLAY_INIT = format_leaderboard_df_for_display(LEADERBOARD_DF_ORIGINAL)
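
# Bias-evaluation results for the second tab. Fill missing values *before*
# casting to str: astype(str) turns NaN into the literal string "nan", which
# a later fillna can no longer catch.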
BIAS_DF = pd.read_csv("bias_evaluation_data.csv")
BIAS_DF = BIAS_DF.fillna("-").astype(str)

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🧠 Unified Performance Eval: VLM Captioners", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Column():
                table_output = gr.DataFrame(value=LEADERBOARD_DF_DISPLAY_INIT, label="Leaderboard Results", interactive=True, wrap=True)

                gr.Markdown("---")
                gr.Markdown("### Display Options")

                model_filter_choices = LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].unique().tolist()
                model_selector = gr.CheckboxGroup(
                    choices=model_filter_choices,
                    value=model_filter_choices,
                    label="Filter by Model:"
                )
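
                # Return the freshly formatted frame directly: the old
                # gr.DataFrame.update(...) helper was removed in Gradio 4,
                # while returning the value works on both 3.x and 4.x.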
                def update_table(selected_models_from_filter):
                    # Guard against stale selections; an empty selection
                    # yields an empty table.
                    valid_selected_models = [
                        model for model in (selected_models_from_filter or [])
                        if model in model_filter_choices
                    ]
                    filtered_df_orig = LEADERBOARD_DF_ORIGINAL[
                        LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].isin(valid_selected_models)
                    ]
                    return format_leaderboard_df_for_display(filtered_df_orig)

                model_selector.change(
                    fn=update_table,
                    inputs=[model_selector],
                    outputs=[table_output]
                )

        with gr.TabItem("📝 Bias-Aware Eval: VLM", elem_id="llm-benchmark-tab-table", id=2):
            with gr.Column():
                gr.Markdown("### Bias-Aware Evaluation Results")
                bias_table_output = gr.DataFrame(value=BIAS_DF, label="Bias Evaluation Results", interactive=True, wrap=True)
                gr.Markdown("---")
                gr.Markdown("### Display Options for Bias Table")
                bias_all_columns_list = BIAS_DF.columns.tolist()
                bias_column_selector = gr.CheckboxGroup(
                    choices=bias_all_columns_list,
                    value=bias_all_columns_list,
                    label="Select Columns to Display:"
                )
                bias_type_filter_choices = BIAS_DF["Bias_Type"].unique().tolist() if "Bias_Type" in BIAS_DF.columns else []
                bias_type_selector = gr.CheckboxGroup(
                    choices=bias_type_filter_choices,
                    value=bias_type_filter_choices,
                    label="Filter by Bias Type:"
                )
                bias_model_filter_choices = BIAS_DF["Model"].unique().tolist() if "Model" in BIAS_DF.columns else []
                bias_model_selector_for_bias_tab = gr.CheckboxGroup(
                    choices=bias_model_filter_choices,
                    value=bias_model_filter_choices,
                    label="Filter by Model:"
                )
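
                # Apply the row filters first, then project onto the selected
                # columns. Deselecting everything in a row filter empties the
                # table; deselecting every column shows all columns of the
                # surviving rows.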
                def update_bias_table(selected_cols, selected_bias_types, selected_models):
                    temp_df = BIAS_DF.copy()
                    if "Bias_Type" in temp_df.columns:
                        temp_df = temp_df[temp_df["Bias_Type"].isin(selected_bias_types or [])]
                    if "Model" in temp_df.columns:
                        temp_df = temp_df[temp_df["Model"].isin(selected_models or [])]
                    valid_selected_cols = [col for col in (selected_cols or []) if col in temp_df.columns]
                    if valid_selected_cols:
                        return temp_df[valid_selected_cols]
                    if temp_df.empty:
                        return pd.DataFrame(columns=selected_cols or [])
                    return temp_df

                bias_filter_inputs = [bias_column_selector, bias_type_selector, bias_model_selector_for_bias_tab]
                for selector in bias_filter_inputs:
                    selector.change(fn=update_bias_table, inputs=bias_filter_inputs, outputs=[bias_table_output])

        with gr.TabItem("🧑‍🍳 User Type & Preference-Oriented Scores", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                gr.Markdown("### Preference-Oriented Scores by User Type and Model")
                def create_preference_score_chart():
                    user_types = ['Detail-oriented', 'Risk-conscious', 'Accuracy-focused']
                    models = ['MiniGPT-4', 'InstructBLIP', 'LLaVA-1.5', 'mPLUG-Owl2', 'Qwen2-VL']
                    # scores[user_type, model]
                    scores = np.array([
                        [0.20, 0.35, 0.45, 0.50, 0.85],
                        [0.40, 0.55, 0.67, 0.53, 0.58],
                        [0.20, 0.60, 0.72, 0.69, 0.75]
                    ])
                    x = np.arange(len(user_types))
                    width = 0.15
                    fig, ax = plt.subplots(figsize=(12, 7))
                    for i, model in enumerate(models):
                        ax.bar(x + i * width - (width * (len(models) - 1) / 2), scores[:, i], width, label=model)
                    ax.set_xlabel('User type', fontsize=12)
                    ax.set_ylabel('Preference-oriented score', fontsize=12)
                    ax.set_title('Preference-Oriented Scores by User Type and Model', fontsize=14)
                    ax.set_xticks(x)
                    ax.set_xticklabels(user_types, fontsize=10)
                    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
                    # Cap the y-axis just above the max score and add a light
                    # horizontal grid for readability.
                    ax.set_ylim(0, 1.1)
                    ax.grid(axis='y', linestyle='--', alpha=0.7)
                    fig.tight_layout(rect=[0, 0, 0.85, 1])
                    return fig
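
                # gr.Plot accepts a callable as its value; Gradio invokes it
                # when the app loads to render the initial figure.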
                gr.Plot(value=create_preference_score_chart)

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

    gr.Markdown("---")
    link_to_discussion = "https://huggingface.co/login?next=%2Fspaces%2Fnvidia%2FLOTUS-VLM-Bias%2Fdiscussions%2Fnew"
    gr.HTML(f'''
    <div style="text-align: center; margin-top: 20px; margin-bottom: 20px;">
        <a href="{link_to_discussion}" target="_blank" rel="noopener noreferrer"
           style="background-color: #007bff; color: white; padding: 10px 20px; text-decoration: none; border-radius: 5px; font-size: 16px;">
            Submit Your Results / Open a New Discussion
        </a>
    </div>
    ''')
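
# Queue requests so many users can hit the Space concurrently;
# default_concurrency_limit caps simultaneous event executions.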
demo.queue(default_concurrency_limit=40).launch()