Titova Ksenia committed on
Commit
8d664ba
·
1 Parent(s): 72a7e68

divide results

Browse files
src/evaluate/evaluate_answers.py CHANGED
@@ -66,22 +66,24 @@ async def deepseek_eval(system: str, prompt: str, eval_model: str, host: str) ->
66
  return completion
67
 
68
 
69
- async def generate(df: List, cand: str, base: str, eval_model: str, host: str) -> tuple:
70
  tasks = []
71
  models_answers = []
 
72
 
73
- for instance in df:
74
  # extract only the responses from the input dict
75
  responses = {base: None, cand: None}
76
- for resp in instance["replies"]:
77
- if resp["model_name"] == base or resp["model_name"] == cand:
78
- responses[resp["model_name"]] = resp["text"]
 
79
 
80
  if responses[base] is None or responses[cand] is None:
81
  raise ValueError("There are no cand or base model answer")
82
 
83
  prompt = "Question: {}\n\nFirst Response: {}\nSecond Response: {}\nAnswer: ".format(
84
- instance["turns"]["content"],
85
  responses[cand],
86
  responses[base],
87
  )
@@ -93,22 +95,24 @@ async def generate(df: List, cand: str, base: str, eval_model: str, host: str) -
93
  return models_answers, eval_results
94
 
95
 
96
- def generate_sync(df: List, cand: str, base: str, eval_model: str, host: str) -> tuple:
97
  models_answers = []
98
  eval_results = []
 
99
 
100
- for instance in tqdm(df):
101
  # extract only the responses from the input dict
102
  responses = {base: None, cand: None}
103
- for resp in instance["replies"]:
104
- if resp["model_name"] == base or resp["model_name"] == cand:
105
- responses[resp["model_name"]] = resp["text"]
 
106
 
107
  if responses[base] is None or responses[cand] is None:
108
  raise ValueError("There are no cand or base model answer")
109
 
110
  prompt = "Question: {}\n\nFirst Response: {}\nSecond Response: {}\nAnswer: ".format(
111
- instance["turns"]["content"],
112
  responses[cand],
113
  responses[base],
114
  )
@@ -205,20 +209,22 @@ def draw_dataframe(
205
 
206
  def get_df(
207
  input_filename: str,
208
- output_filename: str,
209
  ):
210
 
211
  with open(input_filename, "r") as f:
212
  df = [json.loads(line) for line in f]
213
 
214
- if os.path.exists(output_filename):
215
- os.remove(output_filename)
 
216
 
217
  return df
218
 
219
 
220
  def main(
221
- input_filename: str,
 
222
  output_filename: str,
223
  baseline_model: str,
224
  candidate_model: str,
@@ -228,26 +234,32 @@ def main(
228
  host: str,
229
  sync: bool = False,
230
  ):
231
- df = get_df(
232
- input_filename=input_filename,
 
 
 
 
233
  output_filename=output_filename,
234
  )
 
 
 
235
 
236
  if swap:
237
- baseline_model, candidate_model = candidate_model, baseline_model
238
  if sync:
239
  models_answers, eval_results = generate_sync(
240
- df=df, cand=candidate_model, base=baseline_model, eval_model=eval_model, host=host
241
  )
242
  else:
243
  models_answers, eval_results = [], []
244
- bar = tqdm(total=len(df))
245
- for i in range(0, len(df), chunk_size):
246
  model_answer, eval_result = asyncio.run(
247
  generate(
248
- df=df[i:i + chunk_size],
249
- cand=candidate_model,
250
- base=baseline_model,
251
  eval_model=eval_model,
252
  host=host,
253
  )
@@ -267,7 +279,7 @@ def main(
267
  cat_only_accuracy = defaultdict(int)
268
  cat_tie_accuracy = defaultdict(int)
269
 
270
- for instance, eval_result, answers in zip(df, eval_results, models_answers):
271
  if eval_result is None:
272
  total -= 1
273
  continue
@@ -348,7 +360,7 @@ def main(
348
  candidate_model=candidate_model,
349
  four_numbers=[accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
350
  two_numbers=pao_to_2,
351
- input_filename=input_filename,
352
  swap=swap,
353
  )
354
 
@@ -364,10 +376,15 @@ if __name__ == "__main__":
364
  help="Хостнейм, на котором крутится модель",
365
  )
366
  parser.add_argument(
367
- "--input-filename",
368
  type=str,
369
  help="Файл который надо оценить",
370
  )
 
 
 
 
 
371
  parser.add_argument("--output-filename", type=str, default="judge_results_with_probs.jsonl")
372
  parser.add_argument(
373
  "--sleep-time",
@@ -416,7 +433,7 @@ if __name__ == "__main__":
416
 
417
  for i in range(len(swaps)):
418
  results[swaps[i]] = main(
419
- input_filename=args.input_filename,
420
  output_filename=f"swap_{i}_{args.candidate_model}",
421
  baseline_model=args.baseline_model,
422
  candidate_model=args.candidate_model,
 
66
  return completion
67
 
68
 
69
+ async def generate(df_cand: str, df_base: str, eval_model: str, host: str) -> tuple:
70
  tasks = []
71
  models_answers = []
72
+ base, cand = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
73
 
74
+ for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
75
  # extract only the responses from the input dict
76
  responses = {base: None, cand: None}
77
+ if instance_cand["replies"]["model_name"] == cand:
78
+ responses[instance_cand["replies"]["model_name"]] = instance_cand["replies"]["text"]
79
+ if instance_base["replies"]["model_name"] == base:
80
+ responses[instance_base["replies"]["model_name"]] = instance_base["replies"]["text"]
81
 
82
  if responses[base] is None or responses[cand] is None:
83
  raise ValueError("There are no cand or base model answer")
84
 
85
  prompt = "Question: {}\n\nFirst Response: {}\nSecond Response: {}\nAnswer: ".format(
86
+ instance_cand["turns"]["content"],
87
  responses[cand],
88
  responses[base],
89
  )
 
95
  return models_answers, eval_results
96
 
97
 
98
+ def generate_sync(df_cand: str, df_base: str, eval_model: str, host: str) -> tuple:
99
  models_answers = []
100
  eval_results = []
101
+ base, cand = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
102
 
103
+ for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
104
  # extract only the responses from the input dict
105
  responses = {base: None, cand: None}
106
+ if instance_cand["replies"]["model_name"] == cand:
107
+ responses[instance_cand["replies"]["model_name"]] = instance_cand["replies"]["text"]
108
+ if instance_base["replies"]["model_name"] == base:
109
+ responses[instance_base["replies"]["model_name"]] = instance_base["replies"]["text"]
110
 
111
  if responses[base] is None or responses[cand] is None:
112
  raise ValueError("There are no cand or base model answer")
113
 
114
  prompt = "Question: {}\n\nFirst Response: {}\nSecond Response: {}\nAnswer: ".format(
115
+ instance_cand["turns"]["content"],
116
  responses[cand],
117
  responses[base],
118
  )
 
209
 
def get_df(
    input_filename: str,
    output_filename: str = None,
):
    """Load a JSONL file into a list of parsed records.

    Parameters:
        input_filename: Path to a JSONL file; each line is parsed with
            ``json.loads``.
        output_filename: Optional path of a results file. If it already
            exists it is removed, so that later append-mode writes start
            from a clean file.

    Returns:
        list: One dict per input line, in file order.
    """
    with open(input_filename, "r") as f:
        df = [json.loads(line) for line in f]

    # Delete a stale output file up front so downstream appends do not
    # mix old and new judgement results.
    if output_filename and os.path.exists(output_filename):
        os.remove(output_filename)

    return df
223
 
224
 
225
  def main(
226
+ data_root_dir: str,
227
+ dataset_name: str,
228
  output_filename: str,
229
  baseline_model: str,
230
  candidate_model: str,
 
234
  host: str,
235
  sync: bool = False,
236
  ):
237
+ dataset_name_cut = dataset_name.split("/")[-1]
238
+ input_cand_filename = os.path.join(data_root_dir, "generations", f"{candidate_model}_{dataset_name_cut}_responses.jsonl")
239
+ input_base_filename = os.path.join(data_root_dir, "generations", f"{baseline_model}_{dataset_name_cut}_responses.jsonl")
240
+ output_filename = os.path.join(data_root_dir, "judgements", f"{candidate_model}_vs_{baseline_model}_{dataset_name_cut}.jsonl")
241
+ df_cand = get_df(
242
+ input_filename=input_cand_filename,
243
  output_filename=output_filename,
244
  )
245
+ df_base = get_df(
246
+ input_filename=input_base_filename,
247
+ )
248
 
249
  if swap:
250
+ df_cand, df_base = df_base, df_cand
251
  if sync:
252
  models_answers, eval_results = generate_sync(
253
+ df_cand=df_cand, df_base=df_base, eval_model=eval_model, host=host
254
  )
255
  else:
256
  models_answers, eval_results = [], []
257
+ bar = tqdm(total=len(df_cand))
258
+ for i in range(0, len(df_cand), chunk_size):
259
  model_answer, eval_result = asyncio.run(
260
  generate(
261
+ df_cand=df_cand[i:i + chunk_size],
262
+ df_base=df_base[i:i + chunk_size],
 
263
  eval_model=eval_model,
264
  host=host,
265
  )
 
279
  cat_only_accuracy = defaultdict(int)
280
  cat_tie_accuracy = defaultdict(int)
281
 
282
+ for instance, eval_result, answers in zip(df_cand, eval_results, models_answers):
283
  if eval_result is None:
284
  total -= 1
285
  continue
 
360
  candidate_model=candidate_model,
361
  four_numbers=[accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
362
  two_numbers=pao_to_2,
363
+ input_filename=input_cand_filename,
364
  swap=swap,
365
  )
366
 
 
376
  help="Хостнейм, на котором крутится модель",
377
  )
378
  parser.add_argument(
379
+ "--data-root-dir",
380
  type=str,
381
  help="Файл который надо оценить",
382
  )
383
+ parser.add_argument(
384
+ "--dataset-name",
385
+ type=str,
386
+ help="Название бенчмарка",
387
+ )
388
  parser.add_argument("--output-filename", type=str, default="judge_results_with_probs.jsonl")
389
  parser.add_argument(
390
  "--sleep-time",
 
433
 
434
  for i in range(len(swaps)):
435
  results[swaps[i]] = main(
436
+ data_root_dir=args.data_root_dir,
437
  output_filename=f"swap_{i}_{args.candidate_model}",
438
  baseline_model=args.baseline_model,
439
  candidate_model=args.candidate_model,
src/evaluate/generate_answers.py CHANGED
@@ -30,16 +30,13 @@ def write_response_jsonl(response_text, counter, question, model_name, output_fi
30
  "question_id": question["question_id"][counter],
31
  "cluster": question["cluster"][counter],
32
  "turns": question["turns"][counter],
33
- "replies": question.get("replies", [])
34
  }
35
 
36
- cur_dict["replies"].append(
37
- {
38
  "message_id": message_id,
39
  "text": response_text,
40
  "model_name": model_name,
41
- }
42
- )
43
 
44
  with open(output_filename, "a") as f:
45
  json.dump(cur_dict, f, ensure_ascii=False)
 
30
  "question_id": question["question_id"][counter],
31
  "cluster": question["cluster"][counter],
32
  "turns": question["turns"][counter],
 
33
  }
34
 
35
+ cur_dict["replies"] = {
 
36
  "message_id": message_id,
37
  "text": response_text,
38
  "model_name": model_name,
39
+ }
 
40
 
41
  with open(output_filename, "a") as f:
42
  json.dump(cur_dict, f, ensure_ascii=False)