Spaces:
Runtime error
Runtime error
File size: 6,623 Bytes
359f755 536d515 359f755 77c0f20 359f755 77c0f20 359f755 536d515 21bc425 536d515 ce8066d 536d515 ce8066d 21bc425 536d515 21bc425 536d515 21bc425 63076cf 21bc425 63076cf 21bc425 536d515 21bc425 536d515 21bc425 536d515 21bc425 536d515 ce8066d 21bc425 536d515 21bc425 ce8066d 536d515 63076cf 536d515 21bc425 63076cf 536d515 24c8512 536d515 21bc425 536d515 21bc425 536d515 21bc425 536d515 63076cf 536d515 21bc425 536d515 21bc425 536d515 21bc425 63076cf 24c8512 21bc425 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
import pandas as pd
import sys
from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals import get_raw_eval_results
def _log_stderr(*messages: str) -> None:
    """Write each message to stderr on its own line, then flush once."""
    for message in messages:
        sys.stderr.write(f"{message}\n")
    sys.stderr.flush()


def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Create the leaderboard DataFrame from all individual experiment results.

    Raw results are loaded with ``get_raw_eval_results``, converted to dicts via
    ``to_dict()``, padded with defaults for any column in ``cols`` they lack,
    and assembled into a DataFrame restricted to ``cols`` with numeric values
    rounded to two decimals. Progress and errors are reported on stderr.

    Args:
        results_path: Location passed through to ``get_raw_eval_results``.
        cols: Column names the returned DataFrame must contain, in order.
        benchmark_cols: Column names whose missing values default to ``0.0``
            instead of an empty string.

    Returns:
        The populated DataFrame, or the result of ``create_empty_dataframe``
        when there is nothing to show or any step fails. Never ``None``.
    """
    import traceback

    try:
        _log_stderr(
            "\n=== GET_LEADERBOARD_DF DEBUG ===",
            "Starting leaderboard creation...",
            f"Looking for results in: {results_path}",
            f"Expected columns: {cols}",
            f"Benchmark columns: {benchmark_cols}",
        )

        raw_data = get_raw_eval_results(results_path)
        total = len(raw_data)
        _log_stderr(f"\nFound {total} raw results")

        if not raw_data:
            _log_stderr("No raw data found, creating empty DataFrame")
            return create_empty_dataframe(cols, benchmark_cols)

        all_data_json = []
        for position, result in enumerate(raw_data, start=1):
            # Pre-set a placeholder so the error handler below never has to
            # touch the result object again: if reading ``full_model`` is what
            # failed, re-reading it while handling the error would raise a
            # second time and discard every other result.
            model_name = "<unknown model>"
            try:
                model_name = result.full_model
                _log_stderr(f"Processing result {position}/{total}: {model_name}")

                data_dict = result.to_dict()

                # Pad the record so every expected column is present.
                missing_cols = [col for col in cols if col not in data_dict]
                if missing_cols:
                    _log_stderr(f"WARNING: Result for {model_name} missing columns: {missing_cols}")
                    for col in missing_cols:
                        if col in benchmark_cols:
                            data_dict[col] = 0.0
                        elif col == AutoEvalColumn.model_type_symbol.name:
                            data_dict[col] = "?"
                        else:
                            data_dict[col] = ""

                all_data_json.append(data_dict)
                _log_stderr(f"Successfully processed result {position}/{total}: {model_name}")
            except Exception as exc:
                # Best effort: one broken record is logged and skipped so the
                # remaining results still make it onto the leaderboard.
                _log_stderr(
                    f"Error processing result {position}/{total} ({model_name}): {exc}",
                    f"Traceback: {traceback.format_exc()}",
                )
                continue

        _log_stderr(f"\nConverted to {len(all_data_json)} JSON records")

        if not all_data_json:
            _log_stderr("No valid JSON records, creating empty DataFrame")
            return create_empty_dataframe(cols, benchmark_cols)

        _log_stderr("Sample record keys: " + str(list(all_data_json[0].keys())))

        try:
            df = pd.DataFrame.from_records(all_data_json)
        except Exception as exc:
            _log_stderr(f"Error creating DataFrame from records: {exc}")
            return create_empty_dataframe(cols, benchmark_cols)
        _log_stderr(
            "\nCreated DataFrame with columns: " + str(df.columns.tolist()),
            "DataFrame shape: " + str(df.shape),
        )

        # No sorting needed - we only have p-values
        _log_stderr("\nNo sorting applied - only p-values")

        try:
            # Every record was padded with all of ``cols`` above, so the
            # selection only needs the fallback for unexpected failures.
            selected = df[cols].round(decimals=2)
        except Exception as exc:
            _log_stderr(
                f"\nError selecting columns: {exc}",
                "Requested columns: " + str(cols),
                "Available columns: " + str(df.columns.tolist()),
            )
            return create_empty_dataframe(cols, benchmark_cols)
        df = selected
        _log_stderr("\nSelected and rounded columns")

        # No filtering needed - we only have p-values
        _log_stderr(
            "\nFinal DataFrame shape (no filtering): " + str(df.shape),
            "Final columns: " + str(df.columns.tolist()),
        )

        # Final validation: an empty selection (e.g. ``cols`` is empty) is
        # replaced by the fallback frame.
        if df.empty:
            _log_stderr("Final DataFrame is None or empty, returning fallback")
            return create_empty_dataframe(cols, benchmark_cols)

        _log_stderr(f"=== FINAL RESULT: DataFrame with {len(df)} rows and {len(df.columns)} columns ===")
        return df

    except Exception as exc:
        _log_stderr(
            f"\nCRITICAL ERROR in get_leaderboard_df: {exc}",
            f"Traceback: {traceback.format_exc()}",
        )
        # Always return a valid DataFrame, never None
        return create_empty_dataframe(cols, benchmark_cols)
def create_empty_dataframe(cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Build a zero-row fallback DataFrame that carries every requested column.

    Columns listed in ``benchmark_cols`` are backed by an empty float Series;
    all other columns are backed by an empty Series created with ``dtype=str``.
    Creation is reported on stderr.
    """
    sys.stderr.write("Creating empty fallback DataFrame...\n")
    sys.stderr.flush()

    fallback = pd.DataFrame(columns=cols)
    # Re-assign each column so it gets an explicit dtype.
    for name in cols:
        column_dtype = float if name in benchmark_cols else str
        fallback[name] = pd.Series(dtype=column_dtype)

    sys.stderr.write(f"Empty DataFrame created with columns: {fallback.columns.tolist()}\n")
    sys.stderr.flush()
    return fallback
|