Titova Ksenia committed
Commit 21865d4 · 1 Parent(s): 8d664ba

add sample data
data/generations/gpt-3.5-turbo_arena_hard_ru_responses.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/generations/gpt-4o-2024-11-20_arena_hard_ru_responses.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
src/about.py CHANGED
@@ -32,13 +32,11 @@ MTSAIR draft leaderboard
  """
 
  # Which evaluations are you running? how can people reproduce what you have?
- LLM_BENCHMARKS_TEXT = f"""
- ## How it works
- 
- ## Reproducibility
- To reproduce our results, here is the commands you can run:
- 
- """
+ 
+ system_prompt_filename = "./llm_benchmarks_text.md"
+ with open(system_prompt_filename, 'r') as file:
+     LLM_BENCHMARKS_TEXT = file.read()
 
 
  EVALUATION_QUEUE_TEXT = """
  ## Some good practices before submitting a model
src/evaluate/calculate_metrics.py ADDED
@@ -0,0 +1,98 @@
+ from typing import Dict, List
+ import pandas as pd
+ from tabulate import tabulate
+ from scipy import stats
+
+
+ def draw_dataframe(
+     baseline_model: str,
+     candidate_model: str,
+     four_numbers: List,
+     unidentified: float,
+     input_filename: str,
+     pconab: float = -1,
+ ) -> None:
+
+     df = pd.DataFrame(
+         {
+             "baseline_model": baseline_model,
+             "candidate_model": candidate_model,
+             "Four numbers": " ".join([str(i) for i in four_numbers]),
+             "PCon@AB (for candidate)": pconab,
+             "Unidentified": unidentified,
+         },
+         index=[input_filename],
+     )
+     df.index.name = "filename"
+     print(tabulate(df, headers='keys', tablefmt='psql'))
+
+
+ def calculate_pconab(
+     answers_swap_False: List,
+     answers_swap_True: List,
+ ) -> float:
+
+     corr_1_2, corr, total, without_tie_total = 0, 0, 0, 0
+     for val_0, val_1 in zip(answers_swap_False, answers_swap_True):
+         total += 1
+         val_1 = {"A": "B", "B": "A"}.get(val_1, val_1)  # swap A <-> B for swap=True
+         if val_0 == val_1:
+             corr += 1
+         if val_0 in ["A", "B"] or val_1 in ["A", "B"]:
+             without_tie_total += 1
+             if val_0 == val_1:
+                 corr_1_2 += 1
+
+     if without_tie_total > 0:
+         return corr_1_2 / without_tie_total
+     return 0.0
+
+
+ def calculate_medians(results, window_len=10, stride=5):
+     results_len = len(results)
+     medians = []
+     for i in range(0, results_len, stride):
+         if i + window_len < results_len:
+             cur_batch = results[i: i + window_len]
+             cur_dict = {"A": 0, "B": 0, "C": 0}
+             for v in cur_batch:
+                 if v == 1:
+                     cur_dict["A"] += 1
+                 if v == 2:
+                     cur_dict["B"] += 1
+                 if v == 3:
+                     cur_dict["C"] += 1
+             try:
+                 cur_med = (cur_dict["A"] + cur_dict["B"]) / (cur_dict["A"] + cur_dict["B"] + 2 * cur_dict["C"])
+             except ZeroDivisionError:
+                 cur_med = 0
+             medians.append(cur_med)
+     return medians
+
+
+ def calculate_correlations(
+     answers: List,
+     manual_answers: List,
+ ) -> Dict[str, float]:
+
+     result_mapping = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5}
+     results, manual_results = [], []
+     five = [0, 0, 0, 0]
+     manual_five = [0, 0, 0, 0]
+
+     for answer, manual_answer in zip(answers, manual_answers):
+         if answer in result_mapping and manual_answer in result_mapping:
+             results.append(result_mapping[answer])
+             manual_results.append(result_mapping[manual_answer])
+             five[results[-1] - 1] += 1
+             manual_five[manual_results[-1] - 1] += 1
+
+     medians = calculate_medians(results)
+     manual_medians = calculate_medians(manual_results)
+
+     correlations = {}
+     correlations["FSCC"] = stats.spearmanr(manual_results, results).statistic
+     correlations["APCC"] = stats.pearsonr(manual_five, five).statistic
+     correlations["MPCC"] = stats.pearsonr(manual_medians, medians).statistic
+
+     return correlations
src/evaluate/evaluate_answers.py CHANGED
@@ -4,17 +4,11 @@ import json
4
  import os
5
  import re
6
  from collections import defaultdict
7
- from copy import copy
8
- from pathlib import PurePath
9
- from typing import Any, Dict, List, Optional, Union
10
- from warnings import warn
11
-
12
- import openai
13
- import pandas as pd
14
- from openai import AsyncOpenAI
15
- from tqdm import tqdm
16
- from tabulate import tabulate
17
 
 
 
 
18
 
19
  SYSTEM_PROMPT = """
20
  Please act as an objective and strict judge, evaluating the responses of two AI assistants to the user's question based on the provided factual information and strict quality standards. Assess each response against the following criteria to determine which assistant provides the best overall answer.
@@ -43,44 +37,27 @@ IMPORTANT:
43
  - Focus purely on content quality based on the given factual information and evaluation criteria.
44
  - Provide the final decision enclosed in double brackets to ensure proper parsing, for example: [[A]], [[B]], [[C]] or [[D]].
45
  """
46
-
 
 
47
  STOP_THINKING = "</think>"
48
 
49
 
50
- async def deepseek_eval(system: str, prompt: str, eval_model: str, host: str) -> Optional[str]:
51
- async with openai.AsyncOpenAI(api_key=os.getenv("MTSAI_API_KEY"), base_url=host) as client:
52
- completion = await client.chat.completions.create(
53
- model=eval_model,
54
- n=1,
55
- messages=[
56
- {"role": "system", "content": system},
57
- {"role": "user", "content": prompt},
58
- ],
59
- max_tokens=1024,
60
- timeout=10000,
61
- temperature=0.0,
62
- top_p=1.0,
63
- logprobs=True,
64
- top_logprobs=20,
65
- )
66
- return completion
67
-
68
-
69
- async def generate(df_cand: str, df_base: str, eval_model: str, host: str) -> tuple:
70
  tasks = []
71
  models_answers = []
72
- base, cand = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
73
 
74
  for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
75
  # extract only the responses from the input dict
76
  responses = {base: None, cand: None}
77
  if instance_cand["replies"]["model_name"] == cand:
78
- responses[instance_cand["replies"]["model_name"]] = instance_cand["replies"]["text"]
79
  if instance_base["replies"]["model_name"] == base:
80
- responses[instance_base["replies"]["model_name"]] = instance_base["replies"]["text"]
81
 
82
  if responses[base] is None or responses[cand] is None:
83
- raise ValueError("There are no cand or base model answer")
84
 
85
  prompt = "Question: {}\n\nFirst Response: {}\nSecond Response: {}\nAnswer: ".format(
86
  instance_cand["turns"]["content"],
@@ -88,25 +65,25 @@ async def generate(df_cand: str, df_base: str, eval_model: str, host: str) -> tu
88
  responses[base],
89
  )
90
  models_answers.append(responses)
91
- task = asyncio.create_task(deepseek_eval(SYSTEM_PROMPT, prompt, eval_model, host))
92
  tasks.append(task)
93
 
94
  eval_results = await asyncio.gather(*tasks)
95
  return models_answers, eval_results
96
 
97
 
98
- def generate_sync(df_cand: str, df_base: str, eval_model: str, host: str) -> tuple:
99
  models_answers = []
100
  eval_results = []
101
- base, cand = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
102
 
103
  for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
104
  # extract only the responses from the input dict
105
  responses = {base: None, cand: None}
106
  if instance_cand["replies"]["model_name"] == cand:
107
- responses[instance_cand["replies"]["model_name"]] = instance_cand["replies"]["text"]
108
  if instance_base["replies"]["model_name"] == base:
109
- responses[instance_base["replies"]["model_name"]] = instance_base["replies"]["text"]
110
 
111
  if responses[base] is None or responses[cand] is None:
112
  raise ValueError("There are no cand or base model answer")
@@ -117,33 +94,11 @@ def generate_sync(df_cand: str, df_base: str, eval_model: str, host: str) -> tup
117
  responses[base],
118
  )
119
  models_answers.append(responses)
120
- result = deepseek_eval_sync(SYSTEM_PROMPT, prompt, eval_model, host)
121
  eval_results.append(result)
122
  return models_answers, eval_results
123
 
124
 
125
- def deepseek_eval_sync(system: str, prompt: str, eval_model: str, host: str) -> Optional[str]:
126
- with openai.OpenAI(api_key=os.getenv("MTSAI_API_KEY"), base_url=host) as client:
127
- try:
128
- completion = client.chat.completions.create(
129
- model=eval_model,
130
- n=1,
131
- messages=[
132
- {"role": "system", "content": system},
133
- {"role": "user", "content": prompt},
134
- ],
135
- max_tokens=1024,
136
- temperature=0.0,
137
- top_p=1.0,
138
- logprobs=True,
139
- top_logprobs=20,
140
- )
141
- return completion
142
- except BaseException as e:
143
- print(f"pausing {e}")
144
-
145
- return None
146
-
147
 
148
  def construct_output(
149
  question: str, res: str, responses: Dict, baseline_model: str, candidate_model: str, category: str, swap: bool
@@ -175,7 +130,7 @@ def validate_answer(answer: str) -> str:
175
  for ans in matches:
176
  if ans == "A" or ans == "B" or ans == "C" or ans == "D":
177
  return ans
178
- print(answer)
179
  return "E"
180
 
181
 
@@ -187,26 +142,6 @@ def model_wins_with_tie(row: Dict[str, Any], model_name: str):
187
  return 1 if row["judge_result"] == model_name or row["judge_result"] == "both_good" else 0
188
 
189
 
190
- def draw_dataframe(
191
- baseline_model: str,
192
- candidate_model: str,
193
- four_numbers: List,
194
- input_filename: str,
195
- ) -> None:
196
-
197
- df = pd.DataFrame(
198
- {
199
- "System prompt": SYSTEM_PROMPT,
200
- "baseline_model": baseline_model,
201
- "candidate_model": candidate_model,
202
- "Four numbers": " ".join([str(i) for i in four_numbers]),
203
- },
204
- index=[input_filename],
205
- )
206
- df.index.name = "filename"
207
- print(tabulate(df, headers='keys', tablefmt='psql'))
208
-
209
-
210
  def get_df(
211
  input_filename: str,
212
  output_filename: str = None,
@@ -223,21 +158,20 @@ def get_df(
223
 
224
 
225
  def main(
 
226
  data_root_dir: str,
227
  dataset_name: str,
228
- output_filename: str,
229
  baseline_model: str,
230
  candidate_model: str,
231
  swap: bool,
232
- eval_model: str,
233
  chunk_size: int,
234
- host: str,
235
  sync: bool = False,
 
236
  ):
237
  dataset_name_cut = dataset_name.split("/")[-1]
238
  input_cand_filename = os.path.join(data_root_dir, "generations", f"{candidate_model}_{dataset_name_cut}_responses.jsonl")
239
  input_base_filename = os.path.join(data_root_dir, "generations", f"{baseline_model}_{dataset_name_cut}_responses.jsonl")
240
- output_filename = os.path.join(data_root_dir, "judgements", f"{candidate_model}_vs_{baseline_model}_{dataset_name_cut}.jsonl")
241
  df_cand = get_df(
242
  input_filename=input_cand_filename,
243
  output_filename=output_filename,
@@ -250,7 +184,7 @@ def main(
250
  df_cand, df_base = df_base, df_cand
251
  if sync:
252
  models_answers, eval_results = generate_sync(
253
- df_cand=df_cand, df_base=df_base, eval_model=eval_model, host=host
254
  )
255
  else:
256
  models_answers, eval_results = [], []
@@ -258,10 +192,9 @@ def main(
258
  for i in range(0, len(df_cand), chunk_size):
259
  model_answer, eval_result = asyncio.run(
260
  generate(
 
261
  df_cand=df_cand[i:i + chunk_size],
262
  df_base=df_base[i:i + chunk_size],
263
- eval_model=eval_model,
264
- host=host,
265
  )
266
  )
267
  models_answers.extend(model_answer)
@@ -269,6 +202,7 @@ def main(
269
  bar.update(chunk_size)
270
 
271
  better, both_good, both_bad, worse = 0, 0, 0, 0
 
272
  not_defined = 0
273
  total = len(eval_results)
274
 
@@ -285,11 +219,13 @@ def main(
285
  continue
286
  cat_total[instance["cluster"]] += 1
287
 
288
- possible_judge = eval_result.choices[0].message.content
289
- possible_judge_cut = possible_judge[possible_judge.find(STOP_THINKING) + len(STOP_THINKING) :]
290
- possible_judge_cut = possible_judge_cut.strip()
 
291
 
292
- judge = validate_answer(possible_judge_cut)
 
293
 
294
  # count how many times the judge preferred candidate model
295
  better += judge == "A"
@@ -297,6 +233,7 @@ def main(
297
  both_good += judge == "C"
298
  both_bad += judge == "D"
299
  not_defined += judge == "E"
 
300
  if swap:
301
  cat_only_better[instance["cluster"]] += judge == "B"
302
  cat_tie_better[instance["cluster"]] += (judge == "B") or (judge == "C")
@@ -325,7 +262,7 @@ def main(
325
  cat_only_accuracy[k] = cat_only_better[k] / cat_total[k]
326
  cat_tie_accuracy[k] = cat_tie_better[k] / cat_total[k]
327
 
328
- accuracy_draw = (not_defined / total) * 100
329
 
330
  if swap:
331
  accuracy, other_accuracy = other_accuracy, accuracy
@@ -340,7 +277,7 @@ def main(
340
  f"Our Model preferred answers numbers with swap {swap}": better,
341
  "Total number of questions": total,
342
  "Accuracy": accuracy,
343
- "Accuracy for not defined cases": accuracy_draw,
344
  "Four numbers": [accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
345
  "Two numbers": pao_to_2,
346
  }
@@ -348,22 +285,15 @@ def main(
348
 
349
  results = {
350
  "mean": accuracy,
351
- "not_defined": accuracy_draw,
352
  "four_numbers": [accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
353
- "pao_to_2": pao_to_2,
354
  "cat_only_accuracy": cat_only_accuracy,
355
  "cat_tie_accuracy": cat_tie_accuracy,
 
 
356
  }
357
 
358
- draw_dataframe(
359
- baseline_model=baseline_model,
360
- candidate_model=candidate_model,
361
- four_numbers=[accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
362
- two_numbers=pao_to_2,
363
- input_filename=input_cand_filename,
364
- swap=swap,
365
- )
366
-
367
  return results
368
 
369
 
@@ -372,16 +302,18 @@ if __name__ == "__main__":
372
  parser.add_argument(
373
  "--hostname",
374
  type=str,
375
- default="https://demo8-miqu-fundres.dev.mts.ai/v1",
376
  help="Хостнейм, на котором крутится модель",
377
  )
378
  parser.add_argument(
379
  "--data-root-dir",
 
380
  type=str,
381
  help="Файл который надо оценить",
382
  )
383
  parser.add_argument(
384
  "--dataset-name",
 
385
  type=str,
386
  help="Название бенчмарка",
387
  )
@@ -396,7 +328,7 @@ if __name__ == "__main__":
396
  parser.add_argument(
397
  "--candidate-model",
398
  type=str,
399
- default="gpt-4o-2024-11-20",
400
  help="Модель, чьи ответы надо оценить против baseline-model",
401
  )
402
  parser.add_argument(
@@ -406,11 +338,17 @@ if __name__ == "__main__":
406
  help="Модель, чьи ответы надо оценить против candidate-model",
407
  )
408
  parser.add_argument(
409
- "--eval-model",
410
  default="deepseek-r1-distill-llama-70b-awq",
411
  type=str,
412
  help="Название для модели, которая будет оценивать",
413
  )
414
  parser.add_argument(
415
  "--chunk-size",
416
  default=256,
@@ -422,29 +360,68 @@ if __name__ == "__main__":
422
  action="store_true",
423
  help="Если true, генерация синхронная, иначе асинхронная",
424
  )
425
 
426
  args = parser.parse_args()
427
 
428
  if not os.getenv("OPENAI_API_KEY"):
429
  raise ValueError("OPENAI_API_KEY is not set")
430
 
 
431
  swaps = [False, True]
432
  results = dict.fromkeys(swaps)
433
 
434
  for i in range(len(swaps)):
435
  results[swaps[i]] = main(
 
 
436
  data_root_dir=args.data_root_dir,
437
- output_filename=f"swap_{i}_{args.candidate_model}",
438
  baseline_model=args.baseline_model,
439
  candidate_model=args.candidate_model,
440
  swap=swaps[i],
441
- eval_model=args.eval_model,
442
  chunk_size=args.chunk_size,
443
- host=args.hostname,
444
  sync=args.sync,
 
445
  )
446
- mean_4 = [(x + y) / 2 for x, y in zip(*[v["four_numbers"] for v in list(results.values())])]
447
- better, worse, both_good, both_bad = mean_4
 
448
 
449
- print("Mean results (go up for divided swaps):\nFour numbers:")
450
- print(f"{better:.2f}\t{worse:.2f}\t{both_good:.2f}\t{both_bad:.2f}")
4
  import os
5
  import re
6
  from collections import defaultdict
7
+ from typing import Any, Dict, List
8
 
9
+ from tqdm import tqdm
10
+ from src.evaluate.util import APIModelBase
11
+ from src.evaluate.calculate_metrics import draw_dataframe, calculate_pconab
12
 
13
  SYSTEM_PROMPT = """
14
  Please act as an objective and strict judge, evaluating the responses of two AI assistants to the user's question based on the provided factual information and strict quality standards. Assess each response against the following criteria to determine which assistant provides the best overall answer.
 
37
  - Focus purely on content quality based on the given factual information and evaluation criteria.
38
  - Provide the final decision enclosed in double brackets to ensure proper parsing, for example: [[A]], [[B]], [[C]] or [[D]].
39
  """
40
+ TEMPERATURE = 0.0
41
+ TOP_P = 0.1
42
+ FREQUENCY_PENALTY = 1.2
43
  STOP_THINKING = "</think>"
44
 
45
 
46
+ async def generate(model: APIModelBase, df_cand: str, df_base: str) -> tuple:
47
  tasks = []
48
  models_answers = []
49
+ cand, base = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
50
 
51
  for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
52
  # extract only the responses from the input dict
53
  responses = {base: None, cand: None}
54
  if instance_cand["replies"]["model_name"] == cand:
55
+ responses[cand] = instance_cand["replies"]["text"]
56
  if instance_base["replies"]["model_name"] == base:
57
+ responses[base] = instance_base["replies"]["text"]
58
 
59
  if responses[base] is None or responses[cand] is None:
60
+ raise ValueError("There is no cand or base model answer")
61
 
62
  prompt = "Question: {}\n\nFirst Response: {}\nSecond Response: {}\nAnswer: ".format(
63
  instance_cand["turns"]["content"],
 
65
  responses[base],
66
  )
67
  models_answers.append(responses)
68
+ task = asyncio.create_task(model.generate_answers_async(prompt))
69
  tasks.append(task)
70
 
71
  eval_results = await asyncio.gather(*tasks)
72
  return models_answers, eval_results
73
 
74
 
75
+ def generate_sync(model: APIModelBase, df_cand: str, df_base: str) -> tuple:
76
  models_answers = []
77
  eval_results = []
78
+ cand, base = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
79
 
80
  for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
81
  # extract only the responses from the input dict
82
  responses = {base: None, cand: None}
83
  if instance_cand["replies"]["model_name"] == cand:
84
+ responses[cand] = instance_cand["replies"]["text"]
85
  if instance_base["replies"]["model_name"] == base:
86
+ responses[base] = instance_base["replies"]["text"]
87
 
88
  if responses[base] is None or responses[cand] is None:
89
  raise ValueError("There are no cand or base model answer")
 
94
  responses[base],
95
  )
96
  models_answers.append(responses)
97
+ result = model.generate_answers(prompt)
98
  eval_results.append(result)
99
  return models_answers, eval_results
100
 
101
 
 
102
 
103
  def construct_output(
104
  question: str, res: str, responses: Dict, baseline_model: str, candidate_model: str, category: str, swap: bool
 
130
  for ans in matches:
131
  if ans == "A" or ans == "B" or ans == "C" or ans == "D":
132
  return ans
133
+ print(f"Wasn't able to validate answer:\n{answer}")
134
  return "E"
135
 
136
 
 
142
  return 1 if row["judge_result"] == model_name or row["judge_result"] == "both_good" else 0
143
 
144
 
145
  def get_df(
146
  input_filename: str,
147
  output_filename: str = None,
 
158
 
159
 
160
  def main(
161
+ model: APIModelBase,
162
  data_root_dir: str,
163
  dataset_name: str,
 
164
  baseline_model: str,
165
  candidate_model: str,
166
  swap: bool,
 
167
  chunk_size: int,
 
168
  sync: bool = False,
169
+ with_reasoning: bool = True,
170
  ):
171
  dataset_name_cut = dataset_name.split("/")[-1]
172
  input_cand_filename = os.path.join(data_root_dir, "generations", f"{candidate_model}_{dataset_name_cut}_responses.jsonl")
173
  input_base_filename = os.path.join(data_root_dir, "generations", f"{baseline_model}_{dataset_name_cut}_responses.jsonl")
174
+ output_filename = os.path.join(data_root_dir, "judgements", f"swap_{int(swap)}_{candidate_model}_vs_{baseline_model}_{dataset_name_cut}.jsonl")
175
  df_cand = get_df(
176
  input_filename=input_cand_filename,
177
  output_filename=output_filename,
 
184
  df_cand, df_base = df_base, df_cand
185
  if sync:
186
  models_answers, eval_results = generate_sync(
187
+ model=model, df_cand=df_cand, df_base=df_base,
188
  )
189
  else:
190
  models_answers, eval_results = [], []
 
192
  for i in range(0, len(df_cand), chunk_size):
193
  model_answer, eval_result = asyncio.run(
194
  generate(
195
+ model=model,
196
  df_cand=df_cand[i:i + chunk_size],
197
  df_base=df_base[i:i + chunk_size],
 
 
198
  )
199
  )
200
  models_answers.extend(model_answer)
 
202
  bar.update(chunk_size)
203
 
204
  better, both_good, both_bad, worse = 0, 0, 0, 0
205
+ result_judgements = []
206
  not_defined = 0
207
  total = len(eval_results)
208
 
 
219
  continue
220
  cat_total[instance["cluster"]] += 1
221
 
222
+ if with_reasoning:
223
+ possible_judgement = eval_result[eval_result.find(STOP_THINKING) + len(STOP_THINKING):]
224
+ else:
225
+ possible_judgement = eval_result
226
 
227
+ possible_judgement = possible_judgement.strip()
228
+ judge = validate_answer(possible_judgement)
229
 
230
  # count how many times the judge preferred candidate model
231
  better += judge == "A"
 
233
  both_good += judge == "C"
234
  both_bad += judge == "D"
235
  not_defined += judge == "E"
236
+ result_judgements.append(judge)
237
  if swap:
238
  cat_only_better[instance["cluster"]] += judge == "B"
239
  cat_tie_better[instance["cluster"]] += (judge == "B") or (judge == "C")
 
262
  cat_only_accuracy[k] = cat_only_better[k] / cat_total[k]
263
  cat_tie_accuracy[k] = cat_tie_better[k] / cat_total[k]
264
 
265
+ unidentified_accuracy = (not_defined / total) * 100
266
 
267
  if swap:
268
  accuracy, other_accuracy = other_accuracy, accuracy
 
277
  f"Our Model preferred answers numbers with swap {swap}": better,
278
  "Total number of questions": total,
279
  "Accuracy": accuracy,
280
+ "Accuracy for not defined cases": unidentified_accuracy,
281
  "Four numbers": [accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
282
  "Two numbers": pao_to_2,
283
  }
 
285
 
286
  results = {
287
  "mean": accuracy,
288
+ "unidentified": unidentified_accuracy,
289
  "four_numbers": [accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
290
+ "two_numbers": pao_to_2,
291
  "cat_only_accuracy": cat_only_accuracy,
292
  "cat_tie_accuracy": cat_tie_accuracy,
293
+ "result_judgements": result_judgements,
294
+ "input_filename": input_cand_filename,
295
  }
296
 
 
297
  return results
298
 
299
 
 
302
  parser.add_argument(
303
  "--hostname",
304
  type=str,
305
+ default="https://demo-eval-fundres.dev.mts.ai/v1",
306
  help="Хостнейм, на котором крутится модель",
307
  )
308
  parser.add_argument(
309
  "--data-root-dir",
310
+ default="./data/",
311
  type=str,
312
  help="Файл который надо оценить",
313
  )
314
  parser.add_argument(
315
  "--dataset-name",
316
+ default="Vikhrmodels/arena_hard_ru",
317
  type=str,
318
  help="Название бенчмарка",
319
  )
 
328
  parser.add_argument(
329
  "--candidate-model",
330
  type=str,
331
+ default="gpt-3.5-turbo",
332
  help="Модель, чьи ответы надо оценить против baseline-model",
333
  )
334
  parser.add_argument(
 
338
  help="Модель, чьи ответы надо оценить против candidate-model",
339
  )
340
  parser.add_argument(
341
+ "--model-openai",
342
  default="deepseek-r1-distill-llama-70b-awq",
343
  type=str,
344
  help="Название для модели, которая будет оценивать",
345
  )
346
+ parser.add_argument(
347
+ "--with-reasoning",
348
+ default=True,
349
+ action="store_true",
350
+ help="Если true, генерация синхронная, иначе асинхронная",
351
+ )
352
  parser.add_argument(
353
  "--chunk-size",
354
  default=256,
 
360
  action="store_true",
361
  help="Если true, генерация синхронная, иначе асинхронная",
362
  )
363
+ parser.add_argument(
364
+ "--system-prompt",
365
+ default=None,
366
+ type=str,
367
+ help="Если true, генерация синхронная, иначе асинхронная",
368
+ )
369
+ parser.add_argument(
370
+ "--max-gen-length",
371
+ type=int,
372
+ default=4096,
373
+ help="Максимальная длина генерируемого текста",
374
+ )
375
+ parser.add_argument("--temperature", type=float, default=TEMPERATURE)
376
+ parser.add_argument("--top-p", type=float, default=TOP_P)
377
+ parser.add_argument(
378
+ "--frequency-penalty",
379
+ type=float,
380
+ default=FREQUENCY_PENALTY,
381
+ )
382
 
383
  args = parser.parse_args()
384
 
385
  if not os.getenv("OPENAI_API_KEY"):
386
  raise ValueError("OPENAI_API_KEY is not set")
387
 
388
+ from src.evaluate.util import OpenaiModel
389
+ # parameters for OpenaiModel
390
+ args.system_prompt = SYSTEM_PROMPT if not args.system_prompt else args.system_prompt
391
+ model = OpenaiModel(args)
392
+
393
  swaps = [False, True]
394
  results = dict.fromkeys(swaps)
395
 
396
  for i in range(len(swaps)):
397
  results[swaps[i]] = main(
398
+ model=model,
399
+ dataset_name=args.dataset_name,
400
  data_root_dir=args.data_root_dir,
 
401
  baseline_model=args.baseline_model,
402
  candidate_model=args.candidate_model,
403
  swap=swaps[i],
 
404
  chunk_size=args.chunk_size,
 
405
  sync=args.sync,
406
+ with_reasoning=args.with_reasoning,
407
  )
408
+ mean_four = [(x + y) / 2 for x, y in zip(*[v["four_numbers"] for v in list(results.values())])]
409
+ mean_unidentified = (results[False]["unidentified"] + results[True]["unidentified"]) / 2
410
+ # better, worse, both_good, both_bad = mean_4
411
 
412
+ # print("Mean results (go up for divided swaps):\nFour numbers:")
413
+ # print(f"{better:.2f}\t{worse:.2f}\t{both_good:.2f}\t{both_bad:.2f}")
414
+
415
+ pconab = calculate_pconab(
416
+ results[False]["result_judgements"],
417
+ results[True]["result_judgements"]
418
+ )
419
+
420
+ draw_dataframe(
421
+ baseline_model=args.baseline_model,
422
+ candidate_model=args.candidate_model,
423
+ four_numbers=mean_four,
424
+ unidentified=mean_unidentified,
425
+ pconab=pconab,
426
+ input_filename=results[False]["input_filename"],
427
+ )
src/evaluate/util.py CHANGED
@@ -75,22 +75,14 @@ class OpenaiModel(APIModelBase):
      def respond(self, texts):
          answers = []
          for text in texts:
-             with openai.OpenAI(
-                 api_key=os.getenv("OPENAI_API_KEY"), base_url=self.args.hostname
-             ) as client:
-                 messages = self.create_prompt(text)
-                 payload = self.get_payload(messages)
-                 completion = client.chat.completions.create(**payload)
-                 answer = completion.choices[0].message.content
-                 # print(answer)
-                 # print("\n")
-                 answers.append(answer)
+             answer = self.generate_answers(text)
+             answers.append(answer)
          return answers
 
      async def respond_async(self, texts):
          tasks = []
          for text in texts:
-             task = asyncio.create_task(self.generate_answers(text))
+             task = asyncio.create_task(self.generate_answers_async(text))
              tasks.append(task)
 
          result = await asyncio.gather(*tasks)
@@ -98,7 +90,17 @@ class OpenaiModel(APIModelBase):
          assert len(result) == len(texts)
          return result
 
-     async def generate_answers(self, text):
+     def generate_answers(self, text):
+         with openai.OpenAI(
+             api_key=os.getenv("OPENAI_API_KEY"), base_url=self.args.hostname
+         ) as client:
+             messages = self.create_prompt(text)
+             payload = self.get_payload(messages)
+             completion = client.chat.completions.create(**payload)
+ 
+         return completion.choices[0].message.content
+ 
+     async def generate_answers_async(self, text):
          async with openai.AsyncOpenAI(
              api_key=os.getenv("OPENAI_API_KEY"), base_url=self.args.hostname
          ) as client:
src/llm_benchmarks_text.md ADDED
@@ -0,0 +1,28 @@
+ This leaderboard displays the metrics indicating the adequacy of various LLM-as-a-judge systems during the side-by-side evaluation of model generations from Qwen2.5-32B-Instruct and GPT-4o. In our comparisons, we ask the judge model to determine whether:
+
+ 1. The response from the candidate model is better than the baseline,
+ 2. Vice versa,
+ 3. Both responses are good, or
+ 4. Both responses are bad.
+
+ Instead of randomizing the order of model responses, we conduct two runs through the dataset. In the first run, the candidate model's response is presented first, followed by the baseline model's response; in the second run, the order is reversed. The scores are averaged after both runs are completed.
+
+ After this, we calculate metrics showcasing two aspects:
+
+ 1. APCC and MPCC demonstrate the correlation of LLM-as-a-judge assessments with expert evaluations.
+    - Aggregated Pearson Correlation Coefficient (APCC): We count the number of verdicts in each class (A/B/C/D) and calculate the correlation between LLM-as-judge and expert assessments based on these four values. This metric sacrifices detailed verdict information but can estimate how closely the model aligns with experts in delivering a final verdict for the entire benchmark.
+    - Median Pearson Correlation Coefficient (MPCC): We apply a sliding window with a size of 10 and a stride of 5 across all benchmark verdicts. For each batch, we calculate the median using the formula:
+    $$
+    \text{Median} = \frac{\sum{\textbf{A}}+\sum{\textbf{C}}}{\sum{\textbf{A}}+\sum{\textbf{B}}+2\cdot\sum{\textbf{C}}}
+    $$
+    This provides a set of medians for expert and model verdicts, and we calculate the PCC between them. This method retains most verdict information but imposes a linear relationship between verdict classes, which may not be entirely accurate.
+
+ 2. Metrics of Positional Bias: We introduce the metric PCon@AB, which indicates the presence of bias in evaluator models.
+ $$
+ \textbf{PCon@AB} = \frac{I\left(J_{\text{swap}=0} = J_{\text{swap}=1} \mid J=\textbf{A} \vee \textbf{B}\right)}{I\left((J_{\text{swap}=0}=\textbf{A} \vee \textbf{B}) \vee (J_{\text{swap}=1}=\textbf{A} \vee \textbf{B})\right)}
+ $$
+ This metric shows the consistency of the model's answers without swap and with swap, indicating the proportion of matching answers among A and B given different model response orders.
+
+ The metric MPCC-Consistency is calculated as the Pearson correlation coefficient between two sets of medians obtained for verdicts with and without swap, while the metric MPCC-∆ is the difference between the MPCC calculated separately for verdicts obtained with and without swap.
+
+ PCon@AB, MPCC-Consistency, and MPCC-∆ do not rely on manual annotation, allowing us to determine the model's susceptibility to positional bias without expert involvement.
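To make the metrics above concrete, here is a minimal usage sketch of the helpers added in this commit (`calculate_pconab` and `calculate_medians` from `src/evaluate/calculate_metrics.py`). It assumes the repository layout from this commit is importable; the verdict lists are invented toy data, not leaderboard results:

```python
# Toy illustration of the consistency and sliding-window computations described above.
# The verdicts below are made up for demonstration purposes.
from src.evaluate.calculate_metrics import calculate_medians, calculate_pconab

# Judge verdicts for the same questions, without and with swapped response order.
verdicts_no_swap = ["A", "A", "C", "B", "A", "D", "C", "A", "B", "A", "C", "A"]
verdicts_swap = ["B", "B", "C", "A", "B", "D", "A", "B", "A", "B", "C", "B"]

# PCon@AB: share of matching verdicts among pairs where at least one run chose A or B
# (calculate_pconab undoes the swap internally before comparing).
print(calculate_pconab(verdicts_no_swap, verdicts_swap))

# MPCC operates on sliding-window medians over numerically coded verdicts (1 = A, 2 = B, 3 = C).
coded_verdicts = [1, 1, 3, 2, 1, 3, 1, 2, 1, 3, 1, 2, 1, 1, 3, 2]
print(calculate_medians(coded_verdicts, window_len=10, stride=5))
```

Note that pairs where neither run picks A or B are excluded from the PCon@AB denominator, so the metric isolates positional flips between A and B rather than agreement on ties.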