Titova Ksenia committed
Commit 21865d4 · 1 Parent(s): 8d664ba

add sample data
Browse files
- data/generations/gpt-3.5-turbo_arena_hard_ru_responses.jsonl +0 -0
- data/generations/gpt-4o-2024-11-20_arena_hard_ru_responses.jsonl +0 -0
- src/about.py +4 -6
- src/evaluate/calculate_metrics.py +98 -0
- src/evaluate/evaluate_answers.py +95 -118
- src/evaluate/util.py +14 -12
- src/llm_benchmarks_text.md +28 -0
data/generations/gpt-3.5-turbo_arena_hard_ru_responses.jsonl
ADDED
The diff for this file is too large to render. See raw diff

data/generations/gpt-4o-2024-11-20_arena_hard_ru_responses.jsonl
ADDED
The diff for this file is too large to render. See raw diff
src/about.py
CHANGED
@@ -32,13 +32,11 @@ MTSAIR draft leaderboard
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
-
-
+
+system_prompt_filename = "./llm_benchmarks_text.md"
+with open(system_prompt_filename, 'r') as file:
+    LLM_BENCHMARKS_TEXT = file.read()
 
-## Reproducibility
-To reproduce our results, here is the commands you can run:
-
-"""
 
 EVALUATION_QUEUE_TEXT = """
 ## Some good practices before submitting a model
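One thing worth noting about the snippet above: `"./llm_benchmarks_text.md"` is resolved relative to the process's working directory rather than to `src/`, so the read only succeeds if the Space happens to be launched from that directory. A minimal, hypothetical variant (not part of this commit) that anchors the path to the module's own location:

```python
# Hypothetical variant: resolve the prompt file relative to about.py itself,
# so the read works regardless of the current working directory.
import os

system_prompt_filename = os.path.join(os.path.dirname(__file__), "llm_benchmarks_text.md")
with open(system_prompt_filename, "r") as file:
    LLM_BENCHMARKS_TEXT = file.read()
```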
src/evaluate/calculate_metrics.py
ADDED
@@ -0,0 +1,98 @@
from typing import List
import pandas as pd
from tabulate import tabulate
from scipy import stats


def draw_dataframe(
    baseline_model: str,
    candidate_model: str,
    four_numbers: List,
    unidentified: float,
    input_filename: str,
    pconab: float = -1,
) -> None:

    df = pd.DataFrame(
        {
            "baseline_model": baseline_model,
            "candidate_model": candidate_model,
            "Four numbers": " ".join([str(i) for i in four_numbers]),
            "PCon@AB (for candidate)": pconab,
            "Unidentified": unidentified,
        },
        index=[input_filename],
    )
    df.index.name = "filename"
    print(tabulate(df, headers='keys', tablefmt='psql'))


def calculate_pconab(
    answers_swap_False: List,
    answers_swap_True: List,
) -> float:

    corr_1_2, corr, total, without_tie_total = 0, 0, 0, 0
    for val_0, val_1 in zip(answers_swap_False, answers_swap_True):
        total += 1
        val_1 = {"A": "B", "B": "A"}.get(val_1, val_1)  # swap A <-> B for swap=True
        if val_0 == val_1:
            corr += 1
        if val_0 in ["A", "B"] or val_1 in ["A", "B"]:
            without_tie_total += 1
            if val_0 == val_1:
                corr_1_2 += 1

    if without_tie_total > 0:
        return corr_1_2 / without_tie_total
    return 0.0


def calculate_medians(results, window_len=10, stride=5):
    results_len = len(results)
    medians = []
    for i in range(0, results_len, stride):
        if i + window_len < results_len:
            cur_batch = results[i: i + window_len]
            cur_dict = {"A": 0, "B": 0, "C": 0}
            for v in cur_batch:
                if v == 1:
                    cur_dict["A"] += 1
                if v == 2:
                    cur_dict["B"] += 1
                if v == 3:
                    cur_dict["C"] += 1
            try:
                cur_med = (cur_dict["A"]+cur_dict["B"])/(cur_dict["A"]+cur_dict["B"]+2*cur_dict["C"])
            except:
                cur_med = 0
            medians.append(cur_med)
    return medians


def calculate_correlations(
    answers: List,
    manual_answers: List,
) -> float:

    result_mapping = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5}
    results, manual_results = [], []
    five = [0, 0, 0, 0]
    manual_five = [0, 0, 0, 0]

    for answer, manual_answer in zip(answers, manual_answers):
        if answer in result_mapping and manual_answer in result_mapping:
            results.append(result_mapping[answer])
            manual_results.append(result_mapping[manual_answer])
            five[results[-1] - 1] += 1
            manual_five[manual_results[-1] - 1] += 1

    medians = calculate_medians(results)
    manual_medians = calculate_medians(manual_results)

    correlations = {}
    correlations["FSCC"] = stats.spearmanr(manual_results, results).statistic
    correlations["APCC"] = stats.pearsonr(manual_five, five).statistic
    correlations["MPCC"] = stats.pearsonr(manual_medians, medians).statistic

    return 0.0
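For orientation, here is a minimal sketch of how these helpers fit together, mirroring the call sites added at the bottom of evaluate_answers.py below; the verdict lists and numbers are illustrative, not real benchmark output:

```python
# Illustrative only: fake judge verdicts for the same questions,
# first in the original order, then with the responses swapped.
from src.evaluate.calculate_metrics import calculate_pconab, draw_dataframe

judgements_swap_false = ["A", "B", "C", "A", "D", "B"]
judgements_swap_true = ["B", "A", "C", "B", "D", "A"]

# Share of consistent A/B verdicts across the two orderings (ties excluded).
pconab = calculate_pconab(judgements_swap_false, judgements_swap_true)

draw_dataframe(
    baseline_model="gpt-4o-2024-11-20",
    candidate_model="gpt-3.5-turbo",
    four_numbers=[0.42, 0.38, 0.12, 0.08],  # [better, worse, both_good, both_bad], illustrative
    unidentified=1.5,                        # % of unparsed verdicts, illustrative
    input_filename="data/generations/gpt-3.5-turbo_arena_hard_ru_responses.jsonl",
    pconab=pconab,
)
```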
src/evaluate/evaluate_answers.py
CHANGED
@@ -4,17 +4,11 @@ import json
 import os
 import re
 from collections import defaultdict
-from
-from pathlib import PurePath
-from typing import Any, Dict, List, Optional, Union
-from warnings import warn
-
-import openai
-import pandas as pd
-from openai import AsyncOpenAI
-from tqdm import tqdm
-from tabulate import tabulate
+from typing import Any, Dict, List
 
+from tqdm import tqdm
+from src.evaluate.util import APIModelBase
+from src.evaluate.calculate_metrics import draw_dataframe, calculate_pconab
 
 SYSTEM_PROMPT = """
 Please act as an objective and strict judge, evaluating the responses of two AI assistants to the user's question based on the provided factual information and strict quality standards. Assess each response against the following criteria to determine which assistant provides the best overall answer.
@@ -43,44 +37,27 @@ IMPORTANT:
 - Focus purely on content quality based on the given factual information and evaluation criteria.
 - Provide the final decision enclosed in double brackets to ensure proper parsing, for example: [[A]], [[B]], [[C]] or [[D]].
 """
-
+TEMPERATURE = 0.0
+TOP_P = 0.1
+FREQUENCY_PENALTY = 1.2
 STOP_THINKING = "</think>"
 
 
-async def
-    async with openai.AsyncOpenAI(api_key=os.getenv("MTSAI_API_KEY"), base_url=host) as client:
-        completion = await client.chat.completions.create(
-            model=eval_model,
-            n=1,
-            messages=[
-                {"role": "system", "content": system},
-                {"role": "user", "content": prompt},
-            ],
-            max_tokens=1024,
-            timeout=10000,
-            temperature=0.0,
-            top_p=1.0,
-            logprobs=True,
-            top_logprobs=20,
-        )
-        return completion
-
-
-async def generate(df_cand: str, df_base: str, eval_model: str, host: str) -> tuple:
+async def generate(model: APIModelBase, df_cand: str, df_base: str) -> tuple:
     tasks = []
     models_answers = []
-
+    cand, base = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
 
     for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
         # extract only the responses from the input dict
         responses = {base: None, cand: None}
         if instance_cand["replies"]["model_name"] == cand:
-            responses[
+            responses[cand] = instance_cand["replies"]["text"]
         if instance_base["replies"]["model_name"] == base:
-            responses[
+            responses[base] = instance_base["replies"]["text"]
 
         if responses[base] is None or responses[cand] is None:
-            raise ValueError("There
+            raise ValueError("There is no cand or base model answer")
 
         prompt = "Question: {}\n\nFirst Response: {}\nSecond Response: {}\nAnswer: ".format(
             instance_cand["turns"]["content"],
@@ -88,25 +65,25 @@ async def generate(df_cand: str, df_base: str, eval_model: str, host: str) -> tu
             responses[base],
         )
         models_answers.append(responses)
-        task = asyncio.create_task(
+        task = asyncio.create_task(model.generate_answers_async(prompt))
         tasks.append(task)
 
     eval_results = await asyncio.gather(*tasks)
    return models_answers, eval_results
 
 
-def generate_sync(
+def generate_sync(model: APIModelBase, df_cand: str, df_base: str) -> tuple:
     models_answers = []
     eval_results = []
-
+    cand, base = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
 
     for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
         # extract only the responses from the input dict
         responses = {base: None, cand: None}
         if instance_cand["replies"]["model_name"] == cand:
-            responses[
+            responses[cand] = instance_cand["replies"]["text"]
         if instance_base["replies"]["model_name"] == base:
-            responses[
+            responses[base] = instance_base["replies"]["text"]
 
         if responses[base] is None or responses[cand] is None:
             raise ValueError("There are no cand or base model answer")
@@ -117,33 +94,11 @@ def generate_sync(df_cand: str, df_base: str, eval_model: str, host: str) -> tup
             responses[base],
         )
         models_answers.append(responses)
-        result =
+        result = model.generate_answers(prompt)
         eval_results.append(result)
     return models_answers, eval_results
 
 
-def deepseek_eval_sync(system: str, prompt: str, eval_model: str, host: str) -> Optional[str]:
-    with openai.OpenAI(api_key=os.getenv("MTSAI_API_KEY"), base_url=host) as client:
-        try:
-            completion = client.chat.completions.create(
-                model=eval_model,
-                n=1,
-                messages=[
-                    {"role": "system", "content": system},
-                    {"role": "user", "content": prompt},
-                ],
-                max_tokens=1024,
-                temperature=0.0,
-                top_p=1.0,
-                logprobs=True,
-                top_logprobs=20,
-            )
-            return completion
-        except BaseException as e:
-            print(f"pausing {e}")
-
-            return None
-
 
 def construct_output(
     question: str, res: str, responses: Dict, baseline_model: str, candidate_model: str, category: str, swap: bool
@@ -175,7 +130,7 @@ def validate_answer(answer: str) -> str:
     for ans in matches:
         if ans == "A" or ans == "B" or ans == "C" or ans == "D":
             return ans
-    print(answer)
+    print(f"Wasn't able to validate answer:\n{answer}")
     return "E"
 
 
@@ -187,26 +142,6 @@ def model_wins_with_tie(row: Dict[str, Any], model_name: str):
     return 1 if row["judge_result"] == model_name or row["judge_result"] == "both_good" else 0
 
 
-def draw_dataframe(
-    baseline_model: str,
-    candidate_model: str,
-    four_numbers: List,
-    input_filename: str,
-) -> None:
-
-    df = pd.DataFrame(
-        {
-            "System prompt": SYSTEM_PROMPT,
-            "baseline_model": baseline_model,
-            "candidate_model": candidate_model,
-            "Four numbers": " ".join([str(i) for i in four_numbers]),
-        },
-        index=[input_filename],
-    )
-    df.index.name = "filename"
-    print(tabulate(df, headers='keys', tablefmt='psql'))
-
-
 def get_df(
     input_filename: str,
     output_filename: str = None,
@@ -223,21 +158,20 @@ def get_df(
 
 
 def main(
+    model: APIModelBase,
     data_root_dir: str,
     dataset_name: str,
-    output_filename: str,
     baseline_model: str,
     candidate_model: str,
     swap: bool,
-    eval_model: str,
     chunk_size: int,
-    host: str,
     sync: bool = False,
+    with_reasoning: bool = True,
 ):
     dataset_name_cut = dataset_name.split("/")[-1]
     input_cand_filename = os.path.join(data_root_dir, "generations", f"{candidate_model}_{dataset_name_cut}_responses.jsonl")
     input_base_filename = os.path.join(data_root_dir, "generations", f"{baseline_model}_{dataset_name_cut}_responses.jsonl")
-    output_filename = os.path.join(data_root_dir, "judgements", f"{candidate_model}_vs_{baseline_model}_{dataset_name_cut}.jsonl")
+    output_filename = os.path.join(data_root_dir, "judgements", f"swap_{int(swap)}_{candidate_model}_vs_{baseline_model}_{dataset_name_cut}.jsonl")
     df_cand = get_df(
         input_filename=input_cand_filename,
         output_filename=output_filename,
@@ -250,7 +184,7 @@ def main(
         df_cand, df_base = df_base, df_cand
     if sync:
         models_answers, eval_results = generate_sync(
-            df_cand=df_cand, df_base=df_base,
+            model=model, df_cand=df_cand, df_base=df_base,
         )
     else:
         models_answers, eval_results = [], []
@@ -258,10 +192,9 @@
         for i in range(0, len(df_cand), chunk_size):
             model_answer, eval_result = asyncio.run(
                 generate(
+                    model=model,
                    df_cand=df_cand[i:i + chunk_size],
                    df_base=df_base[i:i + chunk_size],
-                    eval_model=eval_model,
-                    host=host,
                )
            )
            models_answers.extend(model_answer)
@@ -269,6 +202,7 @@
            bar.update(chunk_size)
 
    better, both_good, both_bad, worse = 0, 0, 0, 0
+    result_judgements = []
    not_defined = 0
    total = len(eval_results)
 
@@ -285,11 +219,13 @@
                continue
            cat_total[instance["cluster"]] += 1
 
-
-
-
+            if with_reasoning:
+                possible_judgement = eval_result[eval_result.find(STOP_THINKING) + len(STOP_THINKING):]
+            else:
+                possible_judgement = eval_result
 
-
+            possible_judgement = possible_judgement.strip()
+            judge = validate_answer(possible_judgement)
 
            # count how many times the judge preffered candidate model
            better += judge == "A"
@@ -297,6 +233,7 @@
            both_good += judge == "C"
            both_bad += judge == "D"
            not_defined += judge == "E"
+            result_judgements.append(judge)
            if swap:
                cat_only_better[instance["cluster"]] += judge == "B"
                cat_tie_better[instance["cluster"]] += (judge == "B") or (judge == "C")
@@ -325,7 +262,7 @@
        cat_only_accuracy[k] = cat_only_better[k] / cat_total[k]
        cat_tie_accuracy[k] = cat_tie_better[k] / cat_total[k]
 
-
+    unidentified_accuracy = (not_defined / total) * 100
 
    if swap:
        accuracy, other_accuracy = other_accuracy, accuracy
@@ -340,7 +277,7 @@
        f"Our Model preferred answers numbers with swap {swap}": better,
        "Total number of questions": total,
        "Accuracy": accuracy,
-        "Accuracy for not defined cases":
+        "Accuracy for not defined cases": unidentified_accuracy,
        "Four numbers": [accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
        "Two numbers": pao_to_2,
    }
@@ -348,22 +285,15 @@
 
    results = {
        "mean": accuracy,
-        "
+        "unidentified": unidentified_accuracy,
        "four_numbers": [accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
-        "
+        "two_numbers": pao_to_2,
        "cat_only_accuracy": cat_only_accuracy,
        "cat_tie_accuracy": cat_tie_accuracy,
+        "result_judgements": result_judgements,
+        "input_filename": input_cand_filename,
    }
 
-    draw_dataframe(
-        baseline_model=baseline_model,
-        candidate_model=candidate_model,
-        four_numbers=[accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
-        two_numbers=pao_to_2,
-        input_filename=input_cand_filename,
-        swap=swap,
-    )
-
    return results
 
 
@@ -372,16 +302,18 @@ if __name__ == "__main__":
    parser.add_argument(
        "--hostname",
        type=str,
-        default="https://
+        default="https://demo-eval-fundres.dev.mts.ai/v1",
        help="Хостнейм, на котором крутится модель",
    )
    parser.add_argument(
        "--data-root-dir",
+        default="./data/",
        type=str,
        help="Файл который надо оценить",
    )
    parser.add_argument(
        "--dataset-name",
+        default="Vikhrmodels/arena_hard_ru",
        type=str,
        help="Название бенчмарка",
    )
@@ -396,7 +328,7 @@
    parser.add_argument(
        "--candidate-model",
        type=str,
+        default="gpt-3.5-turbo",
        help="Модель, чьи ответы надо оценить против baseline-model",
    )
    parser.add_argument(
@@ -406,11 +338,17 @@
        help="Модель, чьи ответы надо оценить против candidate-model",
    )
    parser.add_argument(
-        "--
+        "--model-openai",
        default="deepseek-r1-distill-llama-70b-awq",
        type=str,
        help="Название для модели, которая будет оценивать",
    )
+    parser.add_argument(
+        "--with-reasoning",
+        default=True,
+        action="store_true",
+        help="Если true, генерация синхронная, иначе асинхронная",
+    )
    parser.add_argument(
        "--chunk-size",
        default=256,
@@ -422,29 +360,68 @@
        action="store_true",
        help="Если true, генерация синхронная, иначе асинхронная",
    )
+    parser.add_argument(
+        "--system-prompt",
+        default=None,
+        type=str,
+        help="Если true, генерация синхронная, иначе асинхронная",
+    )
+    parser.add_argument(
+        "--max-gen-length",
+        type=int,
+        default=4096,
+        help="Максимальная длина генерируемого текста",
+    )
+    parser.add_argument("--temperature", type=float, default=TEMPERATURE)
+    parser.add_argument("--top-p", type=float, default=TOP_P)
+    parser.add_argument(
+        "--frequency-penalty",
+        type=float,
+        default=FREQUENCY_PENALTY,
+    )
 
    args = parser.parse_args()
 
    if not os.getenv("OPENAI_API_KEY"):
        raise ValueError("OPENAI_API_KEY is not set")
 
+    from src.evaluate.util import OpenaiModel
+    # parameters for OpenaiModel
+    args.system_prompt = SYSTEM_PROMPT if not args.system_prompt else args.system_prompt
+    model = OpenaiModel(args)
+
    swaps = [False, True]
    results = dict.fromkeys(swaps)
 
    for i in range(len(swaps)):
        results[swaps[i]] = main(
+            model=model,
+            dataset_name=args.dataset_name,
            data_root_dir=args.data_root_dir,
-            output_filename=f"swap_{i}_{args.candidate_model}",
            baseline_model=args.baseline_model,
            candidate_model=args.candidate_model,
            swap=swaps[i],
-            eval_model=args.eval_model,
            chunk_size=args.chunk_size,
-            host=args.hostname,
            sync=args.sync,
+            with_reasoning=args.with_reasoning,
        )
-
-
+    mean_four = [(x + y) / 2 for x, y in zip(*[v["four_numbers"] for v in list(results.values())])]
+    mean_unidentified = (results[False]["unidentified"] + results[True]["unidentified"]) / 2
+    # better, worse, both_good, both_bad = mean_4
 
-    print("Mean results (go up for divided swaps):\nFour numbers:")
-    print(f"{better:.2f}\t{worse:.2f}\t{both_good:.2f}\t{both_bad:.2f}")
+    # print("Mean results (go up for divided swaps):\nFour numbers:")
+    # print(f"{better:.2f}\t{worse:.2f}\t{both_good:.2f}\t{both_bad:.2f}")
+
+    pconab = calculate_pconab(
+        results[False]["result_judgements"],
+        results[True]["result_judgements"]
+    )
+
+    draw_dataframe(
+        baseline_model=args.baseline_model,
+        candidate_model=args.candidate_model,
+        four_numbers=mean_four,
+        unidentified=mean_unidentified,
+        pconab=pconab,
+        input_filename=results[False]["input_filename"],
+    )
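To make the judgement-parsing path above concrete: the judge is asked to wrap its verdict in double brackets, the reasoning prefix up to `</think>` is cut off when `--with-reasoning` is set, and anything that cannot be parsed falls back to `E`. A minimal standalone sketch of that flow (the regex and sample replies are illustrative; the actual `validate_answer` implementation is only partially visible in this diff):

```python
import re

STOP_THINKING = "</think>"

def parse_verdict(reply: str, with_reasoning: bool = True) -> str:
    """Illustrative re-creation of the post-processing done in main()."""
    if with_reasoning:
        # Drop everything up to and including the reasoning terminator.
        reply = reply[reply.find(STOP_THINKING) + len(STOP_THINKING):]
    reply = reply.strip()
    # Look for a bracketed verdict such as [[A]], [[B]], [[C]] or [[D]].
    matches = re.findall(r"\[\[([A-D])\]\]", reply)
    for ans in matches:
        if ans in ("A", "B", "C", "D"):
            return ans
    return "E"  # unidentified verdict

print(parse_verdict("...chain of thought...</think> The first answer is stronger. [[A]]"))  # A
print(parse_verdict("No clear preference given.", with_reasoning=False))  # E
```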
src/evaluate/util.py
CHANGED
@@ -75,22 +75,14 @@ class OpenaiModel(APIModelBase):
     def respond(self, texts):
         answers = []
         for text in texts:
-
-
-            ) as client:
-                messages = self.create_prompt(text)
-                payload = self.get_payload(messages)
-                completion = client.chat.completions.create(**payload)
-                answer = completion.choices[0].message.content
-                # print(answer)
-                # print("\n")
-                answers.append(answer)
+            answer = self.generate_answers(text)
+            answers.append(answer)
         return answers
 
     async def respond_async(self, texts):
         tasks = []
         for text in texts:
-            task = asyncio.create_task(self.
+            task = asyncio.create_task(self.generate_answers_async(text))
             tasks.append(task)
 
         result = await asyncio.gather(*tasks)
@@ -98,7 +90,17 @@
         assert len(result) == len(texts)
         return result
 
-
+    def generate_answers(self, text):
+        with openai.OpenAI(
+            api_key=os.getenv("OPENAI_API_KEY"), base_url=self.args.hostname
+        ) as client:
+            messages = self.create_prompt(text)
+            payload = self.get_payload(messages)
+            completion = client.chat.completions.create(**payload)
+
+            return completion.choices[0].message.content
+
+    async def generate_answers_async(self, text):
         async with openai.AsyncOpenAI(
             api_key=os.getenv("OPENAI_API_KEY"), base_url=self.args.hostname
         ) as client:
src/llm_benchmarks_text.md
ADDED
@@ -0,0 +1,28 @@
This leaderboard displays the metrics indicating the adequacy of various LLM-as-a-judge systems during the side-by-side evaluation of model generations from Qwen2.5-32B-Instruct and GPT-4o. In our comparisons, we ask the judge model to determine whether:

1. The response from the candidate model is better than the baseline,
2. Vice versa,
3. Both responses are good, or
4. Both responses are bad.

Instead of randomizing the order of model responses, we conduct two runs through the dataset. In the first run, the candidate model's response is presented first, followed by the baseline model's response; in the second run, the order is reversed. The scores are averaged after both runs are completed.

After this, we calculate metrics showcasing two aspects:

1. APCC and MPCC demonstrate the correlation of LLM-as-a-judge assessments with expert evaluations.
   - Aggregated Pearson Correlation Coefficient (APCC): We count the number of verdicts in each class (A/B/C/D) and calculate the correlation between LLM-as-judge and expert assessments based on these four values. This metric sacrifices detailed verdict information but can estimate how closely the model aligns with experts in delivering a final verdict for the entire benchmark.
   - Median Pearson Correlation Coefficient (MPCC): We apply a sliding window with a size of 10 and a stride of 5 across all benchmark verdicts. For each batch, we calculate the median using the formula:
$$
\text{Median} = \frac{\sum{\textbf{A}}+\sum{\textbf{C}}}{\sum{\textbf{A}}+\sum{\textbf{B}}+2\cdot\sum{\textbf{C}}}
$$
   This provides a set of medians for expert and model verdicts, and we calculate the PCC between them. This method retains most verdict information but imposes a linear relationship between verdict classes, which may not be entirely accurate.

2. Metrics of Positional Bias: We introduce the metric PCon@AB, which indicates the presence of bias in evaluator models.
$$
\textbf{PCon@AB} = \frac{I(J_{\text{swap}=0} = J_{\text{swap}=1} \mid J=\textbf{A} \vee \textbf{B})}{I\left((J_{\text{swap}=0}=\textbf{A} \vee \textbf{B}) \vee (J_{\text{swap}=1}=\textbf{A} \vee \textbf{B})\right)}
$$
This metric shows the consistency of the model's answers without swap and with swap, indicating the proportion of matching answers among A and B given different model response orders.

The metric MPCC-Consistency is calculated as the Pearson correlation coefficient between two sets of medians obtained for verdicts with and without swap, while the metric MPCC-∆ is the difference between the MPCC calculated separately for verdicts obtained with and without swap.

PCon@AB, MPCC-Consistency, and MPCC-∆ do not rely on manual annotation, allowing us to determine the model's susceptibility to positional bias without expert involvement.