File size: 6,623 Bytes
359f755
536d515
359f755
77c0f20
359f755
 
77c0f20
359f755
536d515
21bc425
 
536d515
 
 
 
ce8066d
536d515
 
 
ce8066d
21bc425
 
 
 
 
536d515
 
 
21bc425
 
 
536d515
21bc425
 
 
 
 
 
 
63076cf
21bc425
 
 
 
 
63076cf
21bc425
536d515
 
 
21bc425
536d515
 
21bc425
 
536d515
 
 
 
 
 
21bc425
 
 
 
 
536d515
 
 
ce8066d
21bc425
 
 
 
536d515
21bc425
 
 
 
ce8066d
536d515
63076cf
 
536d515
21bc425
63076cf
536d515
 
24c8512
536d515
21bc425
 
 
 
 
 
 
 
 
 
536d515
 
 
21bc425
536d515
 
 
 
21bc425
536d515
63076cf
 
 
 
536d515
21bc425
 
 
 
 
 
 
 
536d515
 
 
21bc425
536d515
 
 
21bc425
 
 
 
 
 
 
 
 
 
 
 
 
63076cf
24c8512
21bc425
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import pandas as pd
import sys
from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals import get_raw_eval_results

def _stderr_log(message: str) -> None:
    """Write ``message`` to stderr and flush so it shows up immediately."""
    sys.stderr.write(message)
    sys.stderr.flush()


def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Create the leaderboard dataframe from the individual experiment results.

    Each raw result returned by ``get_raw_eval_results`` is converted with its
    ``to_dict()`` method. Columns from ``cols`` that a record lacks are filled
    with a default, and the records are assembled into a dataframe restricted
    to ``cols``. Progress and errors are reported on stderr.

    Args:
        results_path: Location handed to ``get_raw_eval_results``.
        cols: Column names the returned dataframe must contain, in this order.
        benchmark_cols: Column names whose missing values default to ``0.0``.
            Other missing columns default to ``""``, except the model-type
            symbol column, which defaults to ``"?"``.

    Returns:
        A dataframe with exactly ``cols`` and numeric values rounded to two
        decimals. If there are no usable results, or anything fails, the
        empty fallback from ``create_empty_dataframe`` is returned instead.
        This function never returns ``None`` and never raises for a failure
        inside its own processing.
    """
    import traceback

    try:
        _stderr_log("\n=== GET_LEADERBOARD_DF DEBUG ===\n")
        _stderr_log("Starting leaderboard creation...\n")
        _stderr_log(f"Looking for results in: {results_path}\n")
        _stderr_log(f"Expected columns: {cols}\n")
        _stderr_log(f"Benchmark columns: {benchmark_cols}\n")

        raw_data = get_raw_eval_results(results_path)
        _stderr_log(f"\nFound {len(raw_data)} raw results\n")

        if not raw_data:
            _stderr_log("No raw data found, creating empty DataFrame\n")
            return create_empty_dataframe(cols, benchmark_cols)

        total = len(raw_data)
        all_data_json = []
        for position, v in enumerate(raw_data, start=1):
            # Resolve the display name once, with a default, so the error
            # handler below cannot itself fail on a malformed result and take
            # down every other (valid) record with it.
            model_name = getattr(v, "full_model", "<unknown>")
            try:
                _stderr_log(f"Processing result {position}/{total}: {model_name}\n")

                data_dict = v.to_dict()

                # Make sure every requested column is present in the record.
                missing_cols = [col for col in cols if col not in data_dict]
                if missing_cols:
                    _stderr_log(
                        f"WARNING: Result for {model_name} missing columns: {missing_cols}\n"
                    )
                    for col in missing_cols:
                        if col in benchmark_cols:
                            data_dict[col] = 0.0
                        elif col == AutoEvalColumn.model_type_symbol.name:
                            data_dict[col] = "?"
                        else:
                            data_dict[col] = ""

                all_data_json.append(data_dict)
                _stderr_log(
                    f"Successfully processed result {position}/{total}: {model_name}\n"
                )
            except Exception as e:
                # Best effort: skip the broken record and keep the rest.
                _stderr_log(
                    f"Error processing result {position}/{total} ({model_name}): {e}\n"
                )
                _stderr_log(f"Traceback: {traceback.format_exc()}\n")

        _stderr_log(f"\nConverted to {len(all_data_json)} JSON records\n")

        if not all_data_json:
            _stderr_log("No valid JSON records, creating empty DataFrame\n")
            return create_empty_dataframe(cols, benchmark_cols)

        _stderr_log("Sample record keys: " + str(list(all_data_json[0].keys())) + "\n")

        try:
            df = pd.DataFrame.from_records(all_data_json)
        except Exception as e:
            _stderr_log(f"Error creating DataFrame from records: {e}\n")
            return create_empty_dataframe(cols, benchmark_cols)
        _stderr_log("\nCreated DataFrame with columns: " + str(df.columns.tolist()) + "\n")
        _stderr_log("DataFrame shape: " + str(df.shape) + "\n")

        # No sorting needed - we only have p-values
        _stderr_log("\nNo sorting applied - only p-values\n")

        try:
            # Safety net: the per-record pass above normally guarantees that
            # every requested column already exists.
            for col in cols:
                if col not in df.columns:
                    _stderr_log(f"Adding missing column during selection: {col}\n")
                    if col in benchmark_cols or col == AutoEvalColumn.average.name:
                        df[col] = 0.0
                    else:
                        df[col] = ""

            df = df[cols].round(decimals=2)
            _stderr_log("\nSelected and rounded columns\n")
        except Exception as e:
            _stderr_log(f"\nError selecting columns: {e}\n")
            _stderr_log("Requested columns: " + str(cols) + "\n")
            _stderr_log("Available columns: " + str(df.columns.tolist()) + "\n")
            return create_empty_dataframe(cols, benchmark_cols)

        # No filtering needed - we only have p-values
        _stderr_log("\nFinal DataFrame shape (no filtering): " + str(df.shape) + "\n")
        _stderr_log("Final columns: " + str(df.columns.tolist()) + "\n")

        # Final validation: hand back the typed fallback rather than an
        # untyped empty frame (e.g. when ``cols`` is empty).
        if df.empty:
            _stderr_log("Final DataFrame is None or empty, returning fallback\n")
            return create_empty_dataframe(cols, benchmark_cols)

        _stderr_log(
            f"=== FINAL RESULT: DataFrame with {len(df)} rows and {len(df.columns)} columns ===\n"
        )
        return df

    except Exception as e:
        _stderr_log(f"\nCRITICAL ERROR in get_leaderboard_df: {e}\n")
        _stderr_log(f"Traceback: {traceback.format_exc()}\n")
        # Always return a valid DataFrame, never None
        return create_empty_dataframe(cols, benchmark_cols)

def create_empty_dataframe(cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Build a zero-row fallback DataFrame that carries every requested column.

    Args:
        cols: Column names, in the order they should appear.
        benchmark_cols: Column names to be typed as ``float``; every other
            column is created from an empty ``str``-typed Series.

    Returns:
        An empty DataFrame whose columns are ``cols``.
    """
    sys.stderr.write("Creating empty fallback DataFrame...\n")
    sys.stderr.flush()

    fallback = pd.DataFrame(columns=cols)
    # Re-assign each column from an empty typed Series so the dtypes are set.
    for name in cols:
        column_dtype = float if name in benchmark_cols else str
        fallback[name] = pd.Series(dtype=column_dtype)

    sys.stderr.write(f"Empty DataFrame created with columns: {fallback.columns.tolist()}\n")
    sys.stderr.flush()
    return fallback