Titova Ksenia committed
Commit 758c9c5 · Parent(s): 1077ec2
remove average_pb
Files changed:
- src/display/utils.py (+1 -1)
- src/leaderboard/read_evals.py (+1 -8)
- src/populate.py (+3 -5)
src/display/utils.py    CHANGED

@@ -27,7 +27,7 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg. Correlation ⬆️", "number", True)])
-auto_eval_column_dict.append(["average_pb", ColumnContent, ColumnContent("Positional Bias Impact", "number", True)])
+# auto_eval_column_dict.append(["average_pb", ColumnContent, ColumnContent("Positional Bias Impact", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
    	
src/leaderboard/read_evals.py    CHANGED

@@ -114,11 +114,7 @@ class EvalResult:
     def to_dict(self, mina=0, maxa=1):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([self.results["apcc"], self.results["mpcc"]]) / 2
-        print("self.results mpcc_delta", self.results["mpcc_delta"])

-        norm_mpcc_delta = (float(self.results["mpcc_delta"]) - mina) / (maxa - mina)
-        print("norm_mpcc_delta", norm_mpcc_delta)
-        average_pb = sum([norm_mpcc_delta, self.results["mpcc_cons"], self.results["pcon_ab"]]) / 3
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -129,7 +125,6 @@ class EvalResult:
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
-            AutoEvalColumn.average_pb.name: average_pb,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
@@ -201,11 +196,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             eval_results[eval_name] = eval_result

     results = []
-    mina = min([a.results["mpcc_delta"] for a in eval_results.values()])
-    maxa = max([a.results["mpcc_delta"] for a in eval_results.values()])
     for v in eval_results.values():
         try:
-            v.to_dict(
+            v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError as e:  # not all eval values present
             print("e", e)
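
For context, the removed average_pb computation min-max normalized mpcc_delta across all evaluated models (which is what the now-unused mina/maxa arguments carried) and averaged it with mpcc_cons and pcon_ab. A self-contained sketch of that removed logic, using made-up result values purely for illustration:

# Hypothetical per-model results, shaped like EvalResult.results in this repo.
results_per_model = {
    "model-a": {"mpcc_delta": 0.10, "mpcc_cons": 0.80, "pcon_ab": 0.70},
    "model-b": {"mpcc_delta": 0.30, "mpcc_cons": 0.60, "pcon_ab": 0.65},
}

# Bounds over all models, as the removed get_raw_eval_results code computed them.
mina = min(r["mpcc_delta"] for r in results_per_model.values())
maxa = max(r["mpcc_delta"] for r in results_per_model.values())

for name, r in results_per_model.items():
    # Rescale mpcc_delta to [0, 1] relative to the other models ...
    norm_mpcc_delta = (float(r["mpcc_delta"]) - mina) / (maxa - mina)
    # ... then average it with the two consistency scores, as the removed to_dict code did.
    average_pb = (norm_mpcc_delta + r["mpcc_cons"] + r["pcon_ab"]) / 3
    print(name, round(average_pb, 3))  # model-a -> 0.5, model-b -> 0.75

Note that with a single model (or identical mpcc_delta values) maxa - mina is zero and the normalization divides by zero; the removed code would have hit the same edge case whenever it was given anything other than the default mina=0, maxa=1.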
    	
src/populate.py    CHANGED

@@ -11,9 +11,7 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
-    mina = min([a.results["mpcc_delta"] for a in raw_data.values()])
-    maxa = max([a.results["mpcc_delta"] for a in raw_data.values()])
-    all_data_json = [v.to_dict(mina, maxa) for v in raw_data]
+    all_data_json = [v.to_dict() for v in raw_data]

     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
@@ -30,10 +28,10 @@ def set_style_for_leaderboard_df(df: pd.DataFrame) -> pd.DataFrame:
     # Adding CSS to style the specific column header
     styled_df.set_table_styles({
         AutoEvalColumn.average.name: [{'selector': 'th.col_heading.level0', 'props': 'color: green;'}],
-        AutoEvalColumn.average_pb.name: [{'selector': 'th.col_heading.level0', 'props': 'color: green;'}]
+        # AutoEvalColumn.average_pb.name: [{'selector': 'th.col_heading.level0', 'props': 'color: green;'}]
     }, overwrite=False)

-    styled_df.format(na_rep="").bar(align=0,  subset=[AutoEvalColumn.average.name
+    styled_df.format(na_rep="").bar(align=0,  subset=[AutoEvalColumn.average.name], cmap="PiYG")
     return styled_df

 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
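
The styling calls kept on the new side are the standard pandas Styler API: set_table_styles attaches a CSS rule to the average column's header, and format(na_rep="").bar(...) blanks out NaNs and draws in-cell bars for that column (numeric align and cmap for Styler.bar require pandas >= 1.4). A minimal standalone sketch of the same pattern on a toy frame, with a placeholder column name instead of AutoEvalColumn.average.name:

import pandas as pd

# Toy leaderboard-like frame; "Avg. Correlation ⬆️" stands in for AutoEvalColumn.average.name.
df = pd.DataFrame({
    "Model": ["model-a", "model-b", "model-c"],
    "Avg. Correlation ⬆️": [0.71, 0.64, 0.58],
})

styled_df = df.style

# Color the score column's header green via a CSS rule on the column heading.
styled_df.set_table_styles({
    "Avg. Correlation ⬆️": [{"selector": "th.col_heading.level0", "props": "color: green;"}],
}, overwrite=False)

# Blank out missing values and render diverging in-cell bars for the score column.
styled_df.format(na_rep="").bar(align=0, subset=["Avg. Correlation ⬆️"], cmap="PiYG")

# In a notebook the Styler renders itself; outside one, export the HTML explicitly.
html = styled_df.to_html()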
