Titova Ksenia committed
Commit 21865d4 · 1 Parent(s): 8d664ba

add sample data
data/generations/gpt-3.5-turbo_arena_hard_ru_responses.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/generations/gpt-4o-2024-11-20_arena_hard_ru_responses.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
src/about.py CHANGED
@@ -32,13 +32,11 @@ MTSAIR draft leaderboard
  """
 
  # Which evaluations are you running? how can people reproduce what you have?
- LLM_BENCHMARKS_TEXT = f"""
- ## How it works
- 
- ## Reproducibility
- To reproduce our results, here is the commands you can run:
- 
- """
+ 
+ system_prompt_filename = "./llm_benchmarks_text.md"
+ with open(system_prompt_filename, 'r') as file:
+     LLM_BENCHMARKS_TEXT = file.read()
 
 
  EVALUATION_QUEUE_TEXT = """
  ## Some good practices before submitting a model
src/evaluate/calculate_metrics.py ADDED
@@ -0,0 +1,98 @@
+ from typing import Dict, List
+ import pandas as pd
+ from tabulate import tabulate
+ from scipy import stats
+
+
+ def draw_dataframe(
+     baseline_model: str,
+     candidate_model: str,
+     four_numbers: List,
+     unidentified: float,
+     input_filename: str,
+     pconab: float = -1,
+ ) -> None:
+
+     df = pd.DataFrame(
+         {
+             "baseline_model": baseline_model,
+             "candidate_model": candidate_model,
+             "Four numbers": " ".join([str(i) for i in four_numbers]),
+             "PCon@AB (for candidate)": pconab,
+             "Unidentified": unidentified,
+         },
+         index=[input_filename],
+     )
+     df.index.name = "filename"
+     print(tabulate(df, headers='keys', tablefmt='psql'))
+
+
+ def calculate_pconab(
+     answers_swap_False: List,
+     answers_swap_True: List,
+ ) -> float:
+
+     corr_1_2, corr, total, without_tie_total = 0, 0, 0, 0
+     for val_0, val_1 in zip(answers_swap_False, answers_swap_True):
+         total += 1
+         val_1 = {"A": "B", "B": "A"}.get(val_1, val_1)  # swap A <-> B for swap=True
+         if val_0 == val_1:
+             corr += 1
+         if val_0 in ["A", "B"] or val_1 in ["A", "B"]:
+             without_tie_total += 1
+             if val_0 == val_1:
+                 corr_1_2 += 1
+
+     if without_tie_total > 0:
+         return corr_1_2 / without_tie_total
+     return 0.0
+
+
+ def calculate_medians(results, window_len=10, stride=5):
+     results_len = len(results)
+     medians = []
+     for i in range(0, results_len, stride):
+         if i + window_len < results_len:
+             cur_batch = results[i: i + window_len]
+             cur_dict = {"A": 0, "B": 0, "C": 0}
+             for v in cur_batch:
+                 if v == 1:
+                     cur_dict["A"] += 1
+                 if v == 2:
+                     cur_dict["B"] += 1
+                 if v == 3:
+                     cur_dict["C"] += 1
+             try:
+                 cur_med = (cur_dict["A"] + cur_dict["B"]) / (cur_dict["A"] + cur_dict["B"] + 2 * cur_dict["C"])
+             except ZeroDivisionError:
+                 cur_med = 0
+             medians.append(cur_med)
+     return medians
+
+
+ def calculate_correlations(
+     answers: List,
+     manual_answers: List,
+ ) -> Dict[str, float]:
+
+     result_mapping = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5}
+     results, manual_results = [], []
+     five = [0, 0, 0, 0]
+     manual_five = [0, 0, 0, 0]
+
+     for answer, manual_answer in zip(answers, manual_answers):
+         if answer in result_mapping and manual_answer in result_mapping:
+             results.append(result_mapping[answer])
+             manual_results.append(result_mapping[manual_answer])
+             five[results[-1] - 1] += 1
+             manual_five[manual_results[-1] - 1] += 1
+
+     medians = calculate_medians(results)
+     manual_medians = calculate_medians(manual_results)
+
+     correlations = {}
+     correlations["FSCC"] = stats.spearmanr(manual_results, results).statistic
+     correlations["APCC"] = stats.pearsonr(manual_five, five).statistic
+     correlations["MPCC"] = stats.pearsonr(manual_medians, medians).statistic
+
+     return correlations
src/evaluate/evaluate_answers.py CHANGED
@@ -4,17 +4,11 @@ import json
4
  import os
5
  import re
6
  from collections import defaultdict
7
- from copy import copy
8
- from pathlib import PurePath
9
- from typing import Any, Dict, List, Optional, Union
10
- from warnings import warn
11
-
12
- import openai
13
- import pandas as pd
14
- from openai import AsyncOpenAI
15
- from tqdm import tqdm
16
- from tabulate import tabulate
17
 
 
 
 
18
 
19
  SYSTEM_PROMPT = """
20
  Please act as an objective and strict judge, evaluating the responses of two AI assistants to the user's question based on the provided factual information and strict quality standards. Assess each response against the following criteria to determine which assistant provides the best overall answer.
@@ -43,44 +37,27 @@ IMPORTANT:
43
  - Focus purely on content quality based on the given factual information and evaluation criteria.
44
  - Provide the final decision enclosed in double brackets to ensure proper parsing, for example: [[A]], [[B]], [[C]] or [[D]].
45
  """
46
-
 
 
47
  STOP_THINKING = "</think>"
48
 
49
 
50
- async def deepseek_eval(system: str, prompt: str, eval_model: str, host: str) -> Optional[str]:
51
- async with openai.AsyncOpenAI(api_key=os.getenv("MTSAI_API_KEY"), base_url=host) as client:
52
- completion = await client.chat.completions.create(
53
- model=eval_model,
54
- n=1,
55
- messages=[
56
- {"role": "system", "content": system},
57
- {"role": "user", "content": prompt},
58
- ],
59
- max_tokens=1024,
60
- timeout=10000,
61
- temperature=0.0,
62
- top_p=1.0,
63
- logprobs=True,
64
- top_logprobs=20,
65
- )
66
- return completion
67
-
68
-
69
- async def generate(df_cand: str, df_base: str, eval_model: str, host: str) -> tuple:
70
  tasks = []
71
  models_answers = []
72
- base, cand = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
73
 
74
  for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
75
  # extract only the responses from the input dict
76
  responses = {base: None, cand: None}
77
  if instance_cand["replies"]["model_name"] == cand:
78
- responses[instance_cand["replies"]["model_name"]] = instance_cand["replies"]["text"]
79
  if instance_base["replies"]["model_name"] == base:
80
- responses[instance_base["replies"]["model_name"]] = instance_base["replies"]["text"]
81
 
82
  if responses[base] is None or responses[cand] is None:
83
- raise ValueError("There are no cand or base model answer")
84
 
85
  prompt = "Question: {}\n\nFirst Response: {}\nSecond Response: {}\nAnswer: ".format(
86
  instance_cand["turns"]["content"],
@@ -88,25 +65,25 @@ async def generate(df_cand: str, df_base: str, eval_model: str, host: str) -> tu
88
  responses[base],
89
  )
90
  models_answers.append(responses)
91
- task = asyncio.create_task(deepseek_eval(SYSTEM_PROMPT, prompt, eval_model, host))
92
  tasks.append(task)
93
 
94
  eval_results = await asyncio.gather(*tasks)
95
  return models_answers, eval_results
96
 
97
 
98
- def generate_sync(df_cand: str, df_base: str, eval_model: str, host: str) -> tuple:
99
  models_answers = []
100
  eval_results = []
101
- base, cand = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
102
 
103
  for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
104
  # extract only the responses from the input dict
105
  responses = {base: None, cand: None}
106
  if instance_cand["replies"]["model_name"] == cand:
107
- responses[instance_cand["replies"]["model_name"]] = instance_cand["replies"]["text"]
108
  if instance_base["replies"]["model_name"] == base:
109
- responses[instance_base["replies"]["model_name"]] = instance_base["replies"]["text"]
110
 
111
  if responses[base] is None or responses[cand] is None:
112
  raise ValueError("There are no cand or base model answer")
@@ -117,33 +94,11 @@ def generate_sync(df_cand: str, df_base: str, eval_model: str, host: str) -> tup
117
  responses[base],
118
  )
119
  models_answers.append(responses)
120
- result = deepseek_eval_sync(SYSTEM_PROMPT, prompt, eval_model, host)
121
  eval_results.append(result)
122
  return models_answers, eval_results
123
 
124
 
125
- def deepseek_eval_sync(system: str, prompt: str, eval_model: str, host: str) -> Optional[str]:
126
- with openai.OpenAI(api_key=os.getenv("MTSAI_API_KEY"), base_url=host) as client:
127
- try:
128
- completion = client.chat.completions.create(
129
- model=eval_model,
130
- n=1,
131
- messages=[
132
- {"role": "system", "content": system},
133
- {"role": "user", "content": prompt},
134
- ],
135
- max_tokens=1024,
136
- temperature=0.0,
137
- top_p=1.0,
138
- logprobs=True,
139
- top_logprobs=20,
140
- )
141
- return completion
142
- except BaseException as e:
143
- print(f"pausing {e}")
144
-
145
- return None
146
-
147
 
148
  def construct_output(
149
  question: str, res: str, responses: Dict, baseline_model: str, candidate_model: str, category: str, swap: bool
@@ -175,7 +130,7 @@ def validate_answer(answer: str) -> str:
175
  for ans in matches:
176
  if ans == "A" or ans == "B" or ans == "C" or ans == "D":
177
  return ans
178
- print(answer)
179
  return "E"
180
 
181
 
@@ -187,26 +142,6 @@ def model_wins_with_tie(row: Dict[str, Any], model_name: str):
187
  return 1 if row["judge_result"] == model_name or row["judge_result"] == "both_good" else 0
188
 
189
 
190
- def draw_dataframe(
191
- baseline_model: str,
192
- candidate_model: str,
193
- four_numbers: List,
194
- input_filename: str,
195
- ) -> None:
196
-
197
- df = pd.DataFrame(
198
- {
199
- "System prompt": SYSTEM_PROMPT,
200
- "baseline_model": baseline_model,
201
- "candidate_model": candidate_model,
202
- "Four numbers": " ".join([str(i) for i in four_numbers]),
203
- },
204
- index=[input_filename],
205
- )
206
- df.index.name = "filename"
207
- print(tabulate(df, headers='keys', tablefmt='psql'))
208
-
209
-
210
  def get_df(
211
  input_filename: str,
212
  output_filename: str = None,
@@ -223,21 +158,20 @@ def get_df(
223
 
224
 
225
  def main(
 
226
  data_root_dir: str,
227
  dataset_name: str,
228
- output_filename: str,
229
  baseline_model: str,
230
  candidate_model: str,
231
  swap: bool,
232
- eval_model: str,
233
  chunk_size: int,
234
- host: str,
235
  sync: bool = False,
 
236
  ):
237
  dataset_name_cut = dataset_name.split("/")[-1]
238
  input_cand_filename = os.path.join(data_root_dir, "generations", f"{candidate_model}_{dataset_name_cut}_responses.jsonl")
239
  input_base_filename = os.path.join(data_root_dir, "generations", f"{baseline_model}_{dataset_name_cut}_responses.jsonl")
240
- output_filename = os.path.join(data_root_dir, "judgements", f"{candidate_model}_vs_{baseline_model}_{dataset_name_cut}.jsonl")
241
  df_cand = get_df(
242
  input_filename=input_cand_filename,
243
  output_filename=output_filename,
@@ -250,7 +184,7 @@ def main(
250
  df_cand, df_base = df_base, df_cand
251
  if sync:
252
  models_answers, eval_results = generate_sync(
253
- df_cand=df_cand, df_base=df_base, eval_model=eval_model, host=host
254
  )
255
  else:
256
  models_answers, eval_results = [], []
@@ -258,10 +192,9 @@ def main(
258
  for i in range(0, len(df_cand), chunk_size):
259
  model_answer, eval_result = asyncio.run(
260
  generate(
 
261
  df_cand=df_cand[i:i + chunk_size],
262
  df_base=df_base[i:i + chunk_size],
263
- eval_model=eval_model,
264
- host=host,
265
  )
266
  )
267
  models_answers.extend(model_answer)
@@ -269,6 +202,7 @@ def main(
269
  bar.update(chunk_size)
270
 
271
  better, both_good, both_bad, worse = 0, 0, 0, 0
 
272
  not_defined = 0
273
  total = len(eval_results)
274
 
@@ -285,11 +219,13 @@ def main(
285
  continue
286
  cat_total[instance["cluster"]] += 1
287
 
288
- possible_judge = eval_result.choices[0].message.content
289
- possible_judge_cut = possible_judge[possible_judge.find(STOP_THINKING) + len(STOP_THINKING) :]
290
- possible_judge_cut = possible_judge_cut.strip()
 
291
 
292
- judge = validate_answer(possible_judge_cut)
 
293
 
294
  # count how many times the judge preferred candidate model
295
  better += judge == "A"
@@ -297,6 +233,7 @@ def main(
297
  both_good += judge == "C"
298
  both_bad += judge == "D"
299
  not_defined += judge == "E"
 
300
  if swap:
301
  cat_only_better[instance["cluster"]] += judge == "B"
302
  cat_tie_better[instance["cluster"]] += (judge == "B") or (judge == "C")
@@ -325,7 +262,7 @@ def main(
325
  cat_only_accuracy[k] = cat_only_better[k] / cat_total[k]
326
  cat_tie_accuracy[k] = cat_tie_better[k] / cat_total[k]
327
 
328
- accuracy_draw = (not_defined / total) * 100
329
 
330
  if swap:
331
  accuracy, other_accuracy = other_accuracy, accuracy
@@ -340,7 +277,7 @@ def main(
340
  f"Our Model preferred answers numbers with swap {swap}": better,
341
  "Total number of questions": total,
342
  "Accuracy": accuracy,
343
- "Accuracy for not defined cases": accuracy_draw,
344
  "Four numbers": [accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
345
  "Two numbers": pao_to_2,
346
  }
@@ -348,22 +285,15 @@ def main(
348
 
349
  results = {
350
  "mean": accuracy,
351
- "not_defined": accuracy_draw,
352
  "four_numbers": [accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
353
- "pao_to_2": pao_to_2,
354
  "cat_only_accuracy": cat_only_accuracy,
355
  "cat_tie_accuracy": cat_tie_accuracy,
 
 
356
  }
357
 
358
- draw_dataframe(
359
- baseline_model=baseline_model,
360
- candidate_model=candidate_model,
361
- four_numbers=[accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
362
- two_numbers=pao_to_2,
363
- input_filename=input_cand_filename,
364
- swap=swap,
365
- )
366
-
367
  return results
368
 
369
 
@@ -372,16 +302,18 @@ if __name__ == "__main__":
372
  parser.add_argument(
373
  "--hostname",
374
  type=str,
375
- default="https://demo8-miqu-fundres.dev.mts.ai/v1",
376
  help="Хостнейм, на котором крутится модель",
377
  )
378
  parser.add_argument(
379
  "--data-root-dir",
 
380
  type=str,
381
  help="Файл который надо оценить",
382
  )
383
  parser.add_argument(
384
  "--dataset-name",
 
385
  type=str,
386
  help="Название бенчмарка",
387
  )
@@ -396,7 +328,7 @@ if __name__ == "__main__":
396
  parser.add_argument(
397
  "--candidate-model",
398
  type=str,
399
- default="gpt-4o-2024-11-20",
400
  help="Модель, чьи ответы надо оценить против baseline-model",
401
  )
402
  parser.add_argument(
@@ -406,11 +338,17 @@ if __name__ == "__main__":
406
  help="Модель, чьи ответы надо оценить против candidate-model",
407
  )
408
  parser.add_argument(
409
- "--eval-model",
410
  default="deepseek-r1-distill-llama-70b-awq",
411
  type=str,
412
  help="Название для модели, которая будет оценивать",
413
  )
414
  parser.add_argument(
415
  "--chunk-size",
416
  default=256,
@@ -422,29 +360,68 @@ if __name__ == "__main__":
422
  action="store_true",
423
  help="Если true, генерация синхронная, иначе асинхронная",
424
  )
425
 
426
  args = parser.parse_args()
427
 
428
  if not os.getenv("OPENAI_API_KEY"):
429
  raise ValueError("OPENAI_API_KEY is not set")
430
 
 
431
  swaps = [False, True]
432
  results = dict.fromkeys(swaps)
433
 
434
  for i in range(len(swaps)):
435
  results[swaps[i]] = main(
 
 
436
  data_root_dir=args.data_root_dir,
437
- output_filename=f"swap_{i}_{args.candidate_model}",
438
  baseline_model=args.baseline_model,
439
  candidate_model=args.candidate_model,
440
  swap=swaps[i],
441
- eval_model=args.eval_model,
442
  chunk_size=args.chunk_size,
443
- host=args.hostname,
444
  sync=args.sync,
 
445
  )
446
- mean_4 = [(x + y) / 2 for x, y in zip(*[v["four_numbers"] for v in list(results.values())])]
447
- better, worse, both_good, both_bad = mean_4
 
448
 
449
- print("Mean results (go up for divided swaps):\nFour numbers:")
450
- print(f"{better:.2f}\t{worse:.2f}\t{both_good:.2f}\t{both_bad:.2f}")
4
  import os
5
  import re
6
  from collections import defaultdict
7
+ from typing import Any, Dict, List
8
 
9
+ from tqdm import tqdm
10
+ from src.evaluate.util import APIModelBase
11
+ from src.evaluate.calculate_metrics import draw_dataframe, calculate_pconab
12
 
13
  SYSTEM_PROMPT = """
14
  Please act as an objective and strict judge, evaluating the responses of two AI assistants to the user's question based on the provided factual information and strict quality standards. Assess each response against the following criteria to determine which assistant provides the best overall answer.
 
37
  - Focus purely on content quality based on the given factual information and evaluation criteria.
38
  - Provide the final decision enclosed in double brackets to ensure proper parsing, for example: [[A]], [[B]], [[C]] or [[D]].
39
  """
40
+ TEMPERATURE = 0.0
41
+ TOP_P = 0.1
42
+ FREQUENCY_PENALTY = 1.2
43
  STOP_THINKING = "</think>"
44
 
45
 
46
+ async def generate(model: APIModelBase, df_cand: str, df_base: str) -> tuple:
47
  tasks = []
48
  models_answers = []
49
+ cand, base = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
50
 
51
  for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
52
  # extract only the responses from the input dict
53
  responses = {base: None, cand: None}
54
  if instance_cand["replies"]["model_name"] == cand:
55
+ responses[cand] = instance_cand["replies"]["text"]
56
  if instance_base["replies"]["model_name"] == base:
57
+ responses[base] = instance_base["replies"]["text"]
58
 
59
  if responses[base] is None or responses[cand] is None:
60
+ raise ValueError("There is no cand or base model answer")
61
 
62
  prompt = "Question: {}\n\nFirst Response: {}\nSecond Response: {}\nAnswer: ".format(
63
  instance_cand["turns"]["content"],
 
65
  responses[base],
66
  )
67
  models_answers.append(responses)
68
+ task = asyncio.create_task(model.generate_answers_async(prompt))
69
  tasks.append(task)
70
 
71
  eval_results = await asyncio.gather(*tasks)
72
  return models_answers, eval_results
73
 
74
 
75
+ def generate_sync(model: APIModelBase, df_cand: str, df_base: str) -> tuple:
76
  models_answers = []
77
  eval_results = []
78
+ cand, base = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
79
 
80
  for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
81
  # extract only the responses from the input dict
82
  responses = {base: None, cand: None}
83
  if instance_cand["replies"]["model_name"] == cand:
84
+ responses[cand] = instance_cand["replies"]["text"]
85
  if instance_base["replies"]["model_name"] == base:
86
+ responses[base] = instance_base["replies"]["text"]
87
 
88
  if responses[base] is None or responses[cand] is None:
89
  raise ValueError("There are no cand or base model answer")
 
94
  responses[base],
95
  )
96
  models_answers.append(responses)
97
+ result = model.generate_answers(prompt)
98
  eval_results.append(result)
99
  return models_answers, eval_results
100
 
101
 
 
102
 
103
  def construct_output(
104
  question: str, res: str, responses: Dict, baseline_model: str, candidate_model: str, category: str, swap: bool
 
130
  for ans in matches:
131
  if ans == "A" or ans == "B" or ans == "C" or ans == "D":
132
  return ans
133
+ print(f"Wasn't able to validate answer:\n{answer}")
134
  return "E"
135
 
136
 
 
142
  return 1 if row["judge_result"] == model_name or row["judge_result"] == "both_good" else 0
143
 
144
 
145
  def get_df(
146
  input_filename: str,
147
  output_filename: str = None,
 
158
 
159
 
160
  def main(
161
+ model: APIModelBase,
162
  data_root_dir: str,
163
  dataset_name: str,
 
164
  baseline_model: str,
165
  candidate_model: str,
166
  swap: bool,
 
167
  chunk_size: int,
 
168
  sync: bool = False,
169
+ with_reasoning: bool = True,
170
  ):
171
  dataset_name_cut = dataset_name.split("/")[-1]
172
  input_cand_filename = os.path.join(data_root_dir, "generations", f"{candidate_model}_{dataset_name_cut}_responses.jsonl")
173
  input_base_filename = os.path.join(data_root_dir, "generations", f"{baseline_model}_{dataset_name_cut}_responses.jsonl")
174
+ output_filename = os.path.join(data_root_dir, "judgements", f"swap_{int(swap)}_{candidate_model}_vs_{baseline_model}_{dataset_name_cut}.jsonl")
175
  df_cand = get_df(
176
  input_filename=input_cand_filename,
177
  output_filename=output_filename,
 
184
  df_cand, df_base = df_base, df_cand
185
  if sync:
186
  models_answers, eval_results = generate_sync(
187
+ model=model, df_cand=df_cand, df_base=df_base,
188
  )
189
  else:
190
  models_answers, eval_results = [], []
 
192
  for i in range(0, len(df_cand), chunk_size):
193
  model_answer, eval_result = asyncio.run(
194
  generate(
195
+ model=model,
196
  df_cand=df_cand[i:i + chunk_size],
197
  df_base=df_base[i:i + chunk_size],
 
 
198
  )
199
  )
200
  models_answers.extend(model_answer)
 
202
  bar.update(chunk_size)
203
 
204
  better, both_good, both_bad, worse = 0, 0, 0, 0
205
+ result_judgements = []
206
  not_defined = 0
207
  total = len(eval_results)
208
 
 
219
  continue
220
  cat_total[instance["cluster"]] += 1
221
 
222
+ if with_reasoning:
223
+ possible_judgement = eval_result[eval_result.find(STOP_THINKING) + len(STOP_THINKING):]
224
+ else:
225
+ possible_judgement = eval_result
226
 
227
+ possible_judgement = possible_judgement.strip()
228
+ judge = validate_answer(possible_judgement)
229
 
230
  # count how many times the judge preferred candidate model
231
  better += judge == "A"
 
233
  both_good += judge == "C"
234
  both_bad += judge == "D"
235
  not_defined += judge == "E"
236
+ result_judgements.append(judge)
237
  if swap:
238
  cat_only_better[instance["cluster"]] += judge == "B"
239
  cat_tie_better[instance["cluster"]] += (judge == "B") or (judge == "C")
 
262
  cat_only_accuracy[k] = cat_only_better[k] / cat_total[k]
263
  cat_tie_accuracy[k] = cat_tie_better[k] / cat_total[k]
264
 
265
+ unidentified_accuracy = (not_defined / total) * 100
266
 
267
  if swap:
268
  accuracy, other_accuracy = other_accuracy, accuracy
 
277
  f"Our Model preferred answers numbers with swap {swap}": better,
278
  "Total number of questions": total,
279
  "Accuracy": accuracy,
280
+ "Accuracy for not defined cases": unidentified_accuracy,
281
  "Four numbers": [accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
282
  "Two numbers": pao_to_2,
283
  }
 
285
 
286
  results = {
287
  "mean": accuracy,
288
+ "unidentified": unidentified_accuracy,
289
  "four_numbers": [accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
290
+ "two_numbers": pao_to_2,
291
  "cat_only_accuracy": cat_only_accuracy,
292
  "cat_tie_accuracy": cat_tie_accuracy,
293
+ "result_judgements": result_judgements,
294
+ "input_filename": input_cand_filename,
295
  }
296
 
 
297
  return results
298
 
299
 
 
302
  parser.add_argument(
303
  "--hostname",
304
  type=str,
305
+ default="https://demo-eval-fundres.dev.mts.ai/v1",
306
  help="Хостнейм, на котором крутится модель",
307
  )
308
  parser.add_argument(
309
  "--data-root-dir",
310
+ default="./data/",
311
  type=str,
312
  help="Файл который надо оценить",
313
  )
314
  parser.add_argument(
315
  "--dataset-name",
316
+ default="Vikhrmodels/arena_hard_ru",
317
  type=str,
318
  help="Название бенчмарка",
319
  )
 
328
  parser.add_argument(
329
  "--candidate-model",
330
  type=str,
331
+ default="gpt-3.5-turbo",
332
  help="Модель, чьи ответы надо оценить против baseline-model",
333
  )
334
  parser.add_argument(
 
338
  help="Модель, чьи ответы надо оценить против candidate-model",
339
  )
340
  parser.add_argument(
341
+ "--model-openai",
342
  default="deepseek-r1-distill-llama-70b-awq",
343
  type=str,
344
  help="Название для модели, которая будет оценивать",
345
  )
346
+ parser.add_argument(
347
+ "--with-reasoning",
348
+ default=True,
349
+ action="store_true",
350
+ help="Если true, генерация синхронная, иначе асинхронная",
351
+ )
352
  parser.add_argument(
353
  "--chunk-size",
354
  default=256,
 
360
  action="store_true",
361
  help="Если true, генерация синхронная, иначе асинхронная",
362
  )
363
+ parser.add_argument(
364
+ "--system-prompt",
365
+ default=None,
366
+ type=str,
367
+ help="Если true, генерация синхронная, иначе асинхронная",
368
+ )
369
+ parser.add_argument(
370
+ "--max-gen-length",
371
+ type=int,
372
+ default=4096,
373
+ help="Максимальная длина генерируемого текста",
374
+ )
375
+ parser.add_argument("--temperature", type=float, default=TEMPERATURE)
376
+ parser.add_argument("--top-p", type=float, default=TOP_P)
377
+ parser.add_argument(
378
+ "--frequency-penalty",
379
+ type=float,
380
+ default=FREQUENCY_PENALTY,
381
+ )
382
 
383
  args = parser.parse_args()
384
 
385
  if not os.getenv("OPENAI_API_KEY"):
386
  raise ValueError("OPENAI_API_KEY is not set")
387
 
388
+ from src.evaluate.util import OpenaiModel
389
+ # parameters for OpenaiModel
390
+ args.system_prompt = SYSTEM_PROMPT if not args.system_prompt else args.system_prompt
391
+ model = OpenaiModel(args)
392
+
393
  swaps = [False, True]
394
  results = dict.fromkeys(swaps)
395
 
396
  for i in range(len(swaps)):
397
  results[swaps[i]] = main(
398
+ model=model,
399
+ dataset_name=args.dataset_name,
400
  data_root_dir=args.data_root_dir,
 
401
  baseline_model=args.baseline_model,
402
  candidate_model=args.candidate_model,
403
  swap=swaps[i],
 
404
  chunk_size=args.chunk_size,
 
405
  sync=args.sync,
406
+ with_reasoning=args.with_reasoning,
407
  )
408
+ mean_four = [(x + y) / 2 for x, y in zip(*[v["four_numbers"] for v in list(results.values())])]
409
+ mean_unidentified = (results[False]["unidentified"] + results[True]["unidentified"]) / 2
410
+ # better, worse, both_good, both_bad = mean_4
411
 
412
+ # print("Mean results (go up for divided swaps):\nFour numbers:")
413
+ # print(f"{better:.2f}\t{worse:.2f}\t{both_good:.2f}\t{both_bad:.2f}")
414
+
415
+ pconab = calculate_pconab(
416
+ results[False]["result_judgements"],
417
+ results[True]["result_judgements"]
418
+ )
419
+
420
+ draw_dataframe(
421
+ baseline_model=args.baseline_model,
422
+ candidate_model=args.candidate_model,
423
+ four_numbers=mean_four,
424
+ unidentified=mean_unidentified,
425
+ pconab=pconab,
426
+ input_filename=results[False]["input_filename"],
427
+ )
src/evaluate/util.py CHANGED
@@ -75,22 +75,14 @@ class OpenaiModel(APIModelBase):
      def respond(self, texts):
          answers = []
          for text in texts:
-             with openai.OpenAI(
-                 api_key=os.getenv("OPENAI_API_KEY"), base_url=self.args.hostname
-             ) as client:
-                 messages = self.create_prompt(text)
-                 payload = self.get_payload(messages)
-                 completion = client.chat.completions.create(**payload)
-                 answer = completion.choices[0].message.content
-                 # print(answer)
-                 # print("\n")
-                 answers.append(answer)
+             answer = self.generate_answers(text)
+             answers.append(answer)
          return answers
 
      async def respond_async(self, texts):
          tasks = []
          for text in texts:
-             task = asyncio.create_task(self.generate_answers(text))
+             task = asyncio.create_task(self.generate_answers_async(text))
              tasks.append(task)
 
          result = await asyncio.gather(*tasks)
@@ -98,7 +90,17 @@ class OpenaiModel(APIModelBase):
          assert len(result) == len(texts)
          return result
 
-     async def generate_answers(self, text):
+     def generate_answers(self, text):
+         with openai.OpenAI(
+             api_key=os.getenv("OPENAI_API_KEY"), base_url=self.args.hostname
+         ) as client:
+             messages = self.create_prompt(text)
+             payload = self.get_payload(messages)
+             completion = client.chat.completions.create(**payload)
+ 
+         return completion.choices[0].message.content
+ 
+     async def generate_answers_async(self, text):
          async with openai.AsyncOpenAI(
              api_key=os.getenv("OPENAI_API_KEY"), base_url=self.args.hostname
          ) as client:
src/llm_benchmarks_text.md ADDED
@@ -0,0 +1,28 @@
+ This leaderboard displays the metrics indicating the adequacy of various LLM-as-a-judge systems during the side-by-side evaluation of model generations from Qwen2.5-32B-Instruct and GPT-4o. In our comparisons, we ask the judge model to determine whether:
+
+ 1. The response from the candidate model is better than the baseline,
+ 2. Vice versa,
+ 3. Both responses are good, or
+ 4. Both responses are bad.
+
+ Instead of randomizing the order of model responses, we conduct two runs through the dataset. In the first run, the candidate model's response is presented first, followed by the baseline model's response; in the second run, the order is reversed. The scores are averaged after both runs are completed.
+
+ After this, we calculate metrics showcasing two aspects:
+
+ 1. APCC and MPCC demonstrate the correlation of LLM-as-a-judge assessments with expert evaluations.
+    - Aggregated Pearson Correlation Coefficient (APCC): We count the number of verdicts in each class (A/B/C/D) and calculate the correlation between LLM-as-judge and expert assessments based on these four values. This metric sacrifices detailed verdict information but can estimate how closely the model aligns with experts in delivering a final verdict for the entire benchmark.
+    - Median Pearson Correlation Coefficient (MPCC): We apply a sliding window with a size of 10 and a stride of 5 across all benchmark verdicts. For each batch, we calculate the median using the formula:
+    $$
+    \text{Median} = \frac{\sum{\textbf{A}}+\sum{\textbf{C}}}{\sum{\textbf{A}}+\sum{\textbf{B}}+2\cdot\sum{\textbf{C}}}
+    $$
+    This provides a set of medians for expert and model verdicts, and we calculate the PCC between them. This method retains most verdict information but imposes a linear relationship between verdict classes, which may not be entirely accurate.
+
+ 2. Metrics of Positional Bias: We introduce the metric PCon@AB, which indicates the presence of bias in evaluator models.
+ $$
+ \textbf{PCon@AB} = \frac{I\left(J_{\text{swap}=0} = J_{\text{swap}=1} \mid J=\textbf{A} \vee \textbf{B}\right)}{I\left((J_{\text{swap}=0}=\textbf{A} \vee \textbf{B}) \vee (J_{\text{swap}=1}=\textbf{A} \vee \textbf{B})\right)}
+ $$
+ This metric shows the consistency of the model's answers without swap and with swap, indicating the proportion of matching answers among A and B given different model response orders.
+
+ The metric MPCC-Consistency is calculated as the Pearson correlation coefficient between two sets of medians obtained for verdicts with and without swap, while the metric MPCC-∆ is the difference between the MPCC calculated separately for verdicts obtained with and without swap.
+
+ PCon@AB, MPCC-Consistency, and MPCC-∆ do not rely on manual annotation, allowing us to determine the model's susceptibility to positional bias without expert involvement.
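To make the metrics above concrete, here is a minimal usage sketch of the helpers added in this commit (`calculate_pconab` and `calculate_medians` from `src/evaluate/calculate_metrics.py`). It assumes the repository layout from this commit is importable; the verdict lists are invented toy data, not leaderboard results:

```python
# Toy illustration of the consistency and sliding-window computations described above.
# The verdicts below are made up for demonstration purposes.
from src.evaluate.calculate_metrics import calculate_medians, calculate_pconab

# Judge verdicts for the same questions, without and with swapped response order.
verdicts_no_swap = ["A", "A", "C", "B", "A", "D", "C", "A", "B", "A", "C", "A"]
verdicts_swap = ["B", "B", "C", "A", "B", "D", "A", "B", "A", "B", "C", "B"]

# PCon@AB: share of matching verdicts among pairs where at least one run chose A or B
# (calculate_pconab undoes the swap internally before comparing).
print(calculate_pconab(verdicts_no_swap, verdicts_swap))

# MPCC operates on sliding-window medians over numerically coded verdicts (1 = A, 2 = B, 3 = C).
coded_verdicts = [1, 1, 3, 2, 1, 3, 1, 2, 1, 3, 1, 2, 1, 1, 3, 2]
print(calculate_medians(coded_verdicts, window_len=10, stride=5))
```

Note that pairs where neither run picks A or B are excluded from the PCon@AB denominator, so the metric isolates positional flips between A and B rather than agreement on ties.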