Include weighted mode. ORIGINAL FAIREVAL SCRIPT IS MODIFIED
- FairEval.py +23 -7
- FairEvalUtils.py +2 -1
FairEval.py
CHANGED
@@ -119,6 +119,7 @@ class FairEvaluation(evaluate.Metric):
         suffix: bool = False,
         scheme: Optional[str] = None,
         mode: Optional[str] = 'fair',
+        weights: dict = None,
         error_format: Optional[str] = 'count',
         zero_division: Union[str, int] = "warn",
     ):
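The only signature change is the new weights keyword, defaulting to None. A minimal usage sketch of how a caller might exercise it; the Hub id "hpi-dhc/FairEval" and the IOB2-style inputs are illustrative assumptions, and keyword arguments not consumed by compute() are forwarded to _compute(), so mode, weights and error_format can be supplied there:

import evaluate

# Illustrative Hub id; substitute the actual repository id of this metric.
fair_eval = evaluate.load("hpi-dhc/FairEval")

predictions = [["B-PER", "I-PER", "O", "B-LOC"]]
references  = [["B-PER", "O",     "O", "B-LOC"]]

# weights=None together with mode='weighted' falls back to the default
# weight matrix introduced in the next hunk.
weighted = fair_eval.compute(predictions=predictions, references=references,
                             mode="weighted")

# A custom matrix, e.g. counting every labeling error (LE) as a full FP and FN:
custom = fair_eval.compute(predictions=predictions, references=references,
                           mode="weighted",
                           weights={"TP": {"TP": 1}, "FP": {"FP": 1}, "FN": {"FN": 1},
                                    "LE": {"TP": 0, "FP": 1, "FN": 1},
                                    "BE": {"TP": 0.5, "FP": 0.25, "FN": 0.25},
                                    "LBE": {"TP": 0, "FP": 0.5, "FN": 0.5}})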
@@ -147,25 +148,38 @@ class FairEvaluation(evaluate.Metric):
         pred_spans = seq_to_fair(pred_spans)
 
         # (3) COUNT ERRORS AND CALCULATE SCORES
-        total_errors = compare_spans([], [])
-
+        total_errors = compare_spans([], [])
         for i in range(len(true_spans)):
             sentence_errors = compare_spans(true_spans[i], pred_spans[i])
             total_errors = add_dict(total_errors, sentence_errors)
 
-        results = calculate_results(total_errors)
+        if weights is None and mode == 'weighted':
+            print("The chosen mode is \'weighted\', but no weights are given. Setting weights to:\n")
+            weights = {"TP": {"TP": 1},
+                       "FP": {"FP": 1},
+                       "FN": {"FN": 1},
+                       "LE": {"TP": 0, "FP": 0.5, "FN": 0.5},
+                       "BE": {"TP": 0.5, "FP": 0.25, "FN": 0.25},
+                       "LBE": {"TP": 0, "FP": 0.5, "FN": 0.5}}
+            print(weights)
+
+        config = {"labels": "all", "eval_method": [mode], "weights": weights,}
+        results = calculate_results(total_errors, config)
         del results['conf']
 
-        # (4) SELECT OUTPUT MODE AND REFORMAT AS SEQEVAL
+        # (4) SELECT OUTPUT MODE AND REFORMAT AS SEQEVAL-HUGGINGFACE OUTPUT
+        # initialize empty dictionary and count errors
         output = {}
         total_trad_errors = results['overall']['traditional']['FP'] + results['overall']['traditional']['FN']
         total_fair_errors = results['overall']['fair']['FP'] + results['overall']['fair']['FN'] + \
                             results['overall']['fair']['LE'] + results['overall']['fair']['BE'] + \
                             results['overall']['fair']['LBE']
 
-        assert mode in ['traditional', 'fair'], 'mode must be \'traditional\' or \'fair\''
+        # assert valid options
+        assert mode in ['traditional', 'fair', 'weighted'], 'mode must be \'traditional\', \'fair\' or \'weighted\''
         assert error_format in ['count', 'proportion'], 'error_format must be \'count\' or \'proportion\''
 
+        # append entity-level errors and scores
        if mode == 'traditional':
             for k, v in results['per_label'][mode].items():
                 if error_format == 'count':
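The default weight matrix encodes how each fair error type is split into fractional true positives, false positives and false negatives before the scores are computed: a boundary error (BE) still earns half a TP, while labeling errors (LE) and label-boundary errors (LBE) are split evenly between FP and FN. A self-contained sketch of that redistribution; the helper name redistribute is illustrative, and the real bookkeeping happens inside calculate_results in FairEvalUtils.py:

def redistribute(counts, weights):
    """Fold fair error counts (TP, FP, FN, LE, BE, LBE) into weighted TP/FP/FN."""
    out = {"TP": 0.0, "FP": 0.0, "FN": 0.0}
    for err_type, n in counts.items():
        for target, w in weights.get(err_type, {}).items():
            out[target] += w * n
    return out

default_weights = {"TP": {"TP": 1}, "FP": {"FP": 1}, "FN": {"FN": 1},
                   "LE": {"TP": 0, "FP": 0.5, "FN": 0.5},
                   "BE": {"TP": 0.5, "FP": 0.25, "FN": 0.25},
                   "LBE": {"TP": 0, "FP": 0.5, "FN": 0.5}}

# 10 exact matches, 1 spurious span, 1 missed span, 1 labeling error, 2 boundary errors:
print(redistribute({"TP": 10, "FP": 1, "FN": 1, "LE": 1, "BE": 2, "LBE": 0},
                   default_weights))
# {'TP': 11.0, 'FP': 2.0, 'FN': 2.0}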
@@ -174,7 +188,7 @@ class FairEvaluation(evaluate.Metric):
             elif error_format == 'proportion':
                 output[k] = {'precision': v['Prec'], 'recall': v['Rec'], 'f1': v['F1'], 'TP': v['TP'],
                              'FP': v['FP'] / total_trad_errors, 'FN': v['FN'] / total_trad_errors}
-        elif mode == 'fair':
+        elif mode == 'fair' or mode == 'weighted':
             for k, v in results['per_label'][mode].items():
                 if error_format == 'count':
                     output[k] = {'precision': v['Prec'], 'recall': v['Rec'], 'f1': v['F1'], 'TP': v['TP'],
@@ -185,10 +199,12 @@ class FairEvaluation(evaluate.Metric):
                              'LE': v['LE'] / total_fair_errors, 'BE': v['BE'] / total_fair_errors,
                              'LBE': v['LBE'] / total_fair_errors}
 
+        # append overall scores
         output['overall_precision'] = results['overall'][mode]['Prec']
         output['overall_recall'] = results['overall'][mode]['Rec']
         output['overall_f1'] = results['overall'][mode]['F1']
 
+        # append overall error counts
         if mode == 'traditional':
             output['TP'] = results['overall'][mode]['TP']
             output['FP'] = results['overall'][mode]['FP']
@@ -196,7 +212,7 @@ class FairEvaluation(evaluate.Metric):
             if error_format == 'proportion':
                 output['FP'] = output['FP'] / total_trad_errors
                 output['FN'] = output['FN'] / total_trad_errors
-        elif mode == 'fair':
+        elif mode == 'fair' or 'weighted':
             output['TP'] = results['overall'][mode]['TP']
             output['FP'] = results['overall'][mode]['FP']
             output['FN'] = results['overall'][mode]['FN']
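Taken together, these branches keep the seqeval-style result layout: one dictionary per label (precision, recall, f1 plus error counts, with LE, BE and LBE included in fair and weighted mode) and flat overall_* keys with overall error counts. Note that the last condition is written mode == 'fair' or 'weighted'; since the non-empty string 'weighted' is truthy, that elif catches every mode not handled by the traditional branch, which after the assert can only be 'fair' or 'weighted'. A reading sketch, continuing the usage example above; the 'PER' label key is an illustrative assumption:

result = fair_eval.compute(predictions=predictions, references=references,
                           mode="weighted", error_format="count")

print(result["overall_precision"], result["overall_recall"], result["overall_f1"])
print(result["TP"], result["FP"], result["FN"])   # overall (weighted) error counts
print(result["PER"])                              # per-label scores and error counts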
FairEvalUtils.py
CHANGED
@@ -1149,7 +1149,7 @@ def add_dict(base_dict, dict_to_add):
 
 #############################
 
-def calculate_results(eval_dict, **config):
+def calculate_results(eval_dict, config):
     """
     Calculate overall precision, recall, and F-scores.
 
@@ -1173,6 +1173,7 @@ def calculate_results(eval_dict, **config):
     eval_dict["overall"]["weighted"] = {}
     for err_type in eval_dict["overall"]["fair"]:
         eval_dict["overall"]["weighted"][err_type] = eval_dict["overall"]["fair"][err_type]
+    eval_dict["per_label"]["weighted"] = {}
     for label in eval_dict["per_label"]["fair"]:
         eval_dict["per_label"]["weighted"][label] = {}
         for err_type in eval_dict["per_label"]["fair"][label]:
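Two small changes on the utility side: calculate_results now receives the configuration as a single positional dict rather than collected keyword arguments, matching the call added in FairEval.py, and eval_dict["per_label"]["weighted"] is initialized before the loop that fills it (previously only the "overall" bucket was created in this hunk). A call sketch; the keyword-style "before" call is illustrative only, since the old call site is not part of this diff:

# Before: options arrived as **config keyword arguments, e.g. (hypothetical call):
#   results = calculate_results(total_errors, labels="all", eval_method=["fair"])

# After: one explicit dict, exactly as FairEval.py now builds it:
config = {"labels": "all", "eval_method": ["weighted"], "weights": weights}
results = calculate_results(total_errors, config)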