Spaces:
Running
Running
Titova Ksenia
committed on
Commit
·
8d664ba
1
Parent(s):
72a7e68
divide results
Browse files
src/evaluate/evaluate_answers.py
CHANGED
|
@@ -66,22 +66,24 @@ async def deepseek_eval(system: str, prompt: str, eval_model: str, host: str) ->
|
|
| 66 |
return completion
|
| 67 |
|
| 68 |
|
| 69 |
-
async def generate(
|
| 70 |
tasks = []
|
| 71 |
models_answers = []
|
|
|
|
| 72 |
|
| 73 |
-
for
|
| 74 |
# extract only the responses from the input dict
|
| 75 |
responses = {base: None, cand: None}
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
| 79 |
|
| 80 |
if responses[base] is None or responses[cand] is None:
|
| 81 |
raise ValueError("There are no cand or base model answer")
|
| 82 |
|
| 83 |
prompt = "Question: {}\n\nFirst Response: {}\nSecond Response: {}\nAnswer: ".format(
|
| 84 |
-
|
| 85 |
responses[cand],
|
| 86 |
responses[base],
|
| 87 |
)
|
|
@@ -93,22 +95,24 @@ async def generate(df: List, cand: str, base: str, eval_model: str, host: str) -
|
|
| 93 |
return models_answers, eval_results
|
| 94 |
|
| 95 |
|
| 96 |
-
def generate_sync(
|
| 97 |
models_answers = []
|
| 98 |
eval_results = []
|
|
|
|
| 99 |
|
| 100 |
-
for
|
| 101 |
# extract only the responses from the input dict
|
| 102 |
responses = {base: None, cand: None}
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
| 106 |
|
| 107 |
if responses[base] is None or responses[cand] is None:
|
| 108 |
raise ValueError("There are no cand or base model answer")
|
| 109 |
|
| 110 |
prompt = "Question: {}\n\nFirst Response: {}\nSecond Response: {}\nAnswer: ".format(
|
| 111 |
-
|
| 112 |
responses[cand],
|
| 113 |
responses[base],
|
| 114 |
)
|
|
@@ -205,20 +209,22 @@ def draw_dataframe(
|
|
| 205 |
|
| 206 |
def get_df(
|
| 207 |
input_filename: str,
|
| 208 |
-
output_filename: str,
|
| 209 |
):
|
| 210 |
|
| 211 |
with open(input_filename, "r") as f:
|
| 212 |
df = [json.loads(line) for line in f]
|
| 213 |
|
| 214 |
-
if
|
| 215 |
-
os.
|
|
|
|
| 216 |
|
| 217 |
return df
|
| 218 |
|
| 219 |
|
| 220 |
def main(
|
| 221 |
-
|
|
|
|
| 222 |
output_filename: str,
|
| 223 |
baseline_model: str,
|
| 224 |
candidate_model: str,
|
|
@@ -228,26 +234,32 @@ def main(
|
|
| 228 |
host: str,
|
| 229 |
sync: bool = False,
|
| 230 |
):
|
| 231 |
-
|
| 232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
output_filename=output_filename,
|
| 234 |
)
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
if swap:
|
| 237 |
-
|
| 238 |
if sync:
|
| 239 |
models_answers, eval_results = generate_sync(
|
| 240 |
-
|
| 241 |
)
|
| 242 |
else:
|
| 243 |
models_answers, eval_results = [], []
|
| 244 |
-
bar = tqdm(total=len(
|
| 245 |
-
for i in range(0, len(
|
| 246 |
model_answer, eval_result = asyncio.run(
|
| 247 |
generate(
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
base=baseline_model,
|
| 251 |
eval_model=eval_model,
|
| 252 |
host=host,
|
| 253 |
)
|
|
@@ -267,7 +279,7 @@ def main(
|
|
| 267 |
cat_only_accuracy = defaultdict(int)
|
| 268 |
cat_tie_accuracy = defaultdict(int)
|
| 269 |
|
| 270 |
-
for instance, eval_result, answers in zip(
|
| 271 |
if eval_result is None:
|
| 272 |
total -= 1
|
| 273 |
continue
|
|
@@ -348,7 +360,7 @@ def main(
|
|
| 348 |
candidate_model=candidate_model,
|
| 349 |
four_numbers=[accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
|
| 350 |
two_numbers=pao_to_2,
|
| 351 |
-
input_filename=
|
| 352 |
swap=swap,
|
| 353 |
)
|
| 354 |
|
|
@@ -364,10 +376,15 @@ if __name__ == "__main__":
|
|
| 364 |
help="Хостнейм, на котором крутится модель",
|
| 365 |
)
|
| 366 |
parser.add_argument(
|
| 367 |
-
"--
|
| 368 |
type=str,
|
| 369 |
help="Файл который надо оценить",
|
| 370 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
parser.add_argument("--output-filename", type=str, default="judge_results_with_probs.jsonl")
|
| 372 |
parser.add_argument(
|
| 373 |
"--sleep-time",
|
|
@@ -416,7 +433,7 @@ if __name__ == "__main__":
|
|
| 416 |
|
| 417 |
for i in range(len(swaps)):
|
| 418 |
results[swaps[i]] = main(
|
| 419 |
-
|
| 420 |
output_filename=f"swap_{i}_{args.candidate_model}",
|
| 421 |
baseline_model=args.baseline_model,
|
| 422 |
candidate_model=args.candidate_model,
|
|
|
|
| 66 |
return completion
|
| 67 |
|
| 68 |
|
| 69 |
+
async def generate(df_cand: str, df_base: str, eval_model: str, host: str) -> tuple:
|
| 70 |
tasks = []
|
| 71 |
models_answers = []
|
| 72 |
+
base, cand = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
|
| 73 |
|
| 74 |
+
for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
|
| 75 |
# extract only the responses from the input dict
|
| 76 |
responses = {base: None, cand: None}
|
| 77 |
+
if instance_cand["replies"]["model_name"] == cand:
|
| 78 |
+
responses[instance_cand["replies"]["model_name"]] = instance_cand["replies"]["text"]
|
| 79 |
+
if instance_base["replies"]["model_name"] == base:
|
| 80 |
+
responses[instance_base["replies"]["model_name"]] = instance_base["replies"]["text"]
|
| 81 |
|
| 82 |
if responses[base] is None or responses[cand] is None:
|
| 83 |
raise ValueError("There are no cand or base model answer")
|
| 84 |
|
| 85 |
prompt = "Question: {}\n\nFirst Response: {}\nSecond Response: {}\nAnswer: ".format(
|
| 86 |
+
instance_cand["turns"]["content"],
|
| 87 |
responses[cand],
|
| 88 |
responses[base],
|
| 89 |
)
|
|
|
|
| 95 |
return models_answers, eval_results
|
| 96 |
|
| 97 |
|
| 98 |
+
def generate_sync(df_cand: str, df_base: str, eval_model: str, host: str) -> tuple:
|
| 99 |
models_answers = []
|
| 100 |
eval_results = []
|
| 101 |
+
base, cand = df_cand[0]["replies"]["model_name"], df_base[0]["replies"]["model_name"]
|
| 102 |
|
| 103 |
+
for instance_cand, instance_base in tqdm(zip(df_cand, df_base)):
|
| 104 |
# extract only the responses from the input dict
|
| 105 |
responses = {base: None, cand: None}
|
| 106 |
+
if instance_cand["replies"]["model_name"] == cand:
|
| 107 |
+
responses[instance_cand["replies"]["model_name"]] = instance_cand["replies"]["text"]
|
| 108 |
+
if instance_base["replies"]["model_name"] == base:
|
| 109 |
+
responses[instance_base["replies"]["model_name"]] = instance_base["replies"]["text"]
|
| 110 |
|
| 111 |
if responses[base] is None or responses[cand] is None:
|
| 112 |
raise ValueError("There are no cand or base model answer")
|
| 113 |
|
| 114 |
prompt = "Question: {}\n\nFirst Response: {}\nSecond Response: {}\nAnswer: ".format(
|
| 115 |
+
instance_cand["turns"]["content"],
|
| 116 |
responses[cand],
|
| 117 |
responses[base],
|
| 118 |
)
|
|
|
|
| 209 |
|
| 210 |
def get_df(
    input_filename: str,
    output_filename: str = None,
):
    """Load a JSON Lines file and optionally delete a stale output file.

    Args:
        input_filename: Path to a text file holding one JSON document per line.
        output_filename: Optional path. When given (and non-empty), a file
            already present at that path is removed before returning.

    Returns:
        A list with one parsed JSON value per non-blank input line, in file
        order.

    Raises:
        OSError: If the input file cannot be opened, or the output path
            exists but cannot be removed.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Encoding is left at the platform default, exactly as before, so files
    # written and read on the same machine keep round-tripping.
    with open(input_filename, "r") as f:
        # Skip blank / whitespace-only lines (e.g. a trailing newline at the
        # end of the file) instead of crashing in json.loads on them.
        df = [json.loads(line) for line in f if line.strip()]

    if output_filename:
        # NOTE(review): presumably the old output is dropped so that later
        # appends start from an empty file - confirm against the writer.
        # EAFP: remove directly rather than exists()-then-remove(), which
        # could race with another process deleting the file in between.
        try:
            os.remove(output_filename)
        except FileNotFoundError:
            pass

    return df
|
| 223 |
|
| 224 |
|
| 225 |
def main(
|
| 226 |
+
data_root_dir: str,
|
| 227 |
+
dataset_name: str,
|
| 228 |
output_filename: str,
|
| 229 |
baseline_model: str,
|
| 230 |
candidate_model: str,
|
|
|
|
| 234 |
host: str,
|
| 235 |
sync: bool = False,
|
| 236 |
):
|
| 237 |
+
dataset_name_cut = dataset_name.split("/")[-1]
|
| 238 |
+
input_cand_filename = os.path.join(data_root_dir, "generations", f"{candidate_model}_{dataset_name_cut}_responses.jsonl")
|
| 239 |
+
input_base_filename = os.path.join(data_root_dir, "generations", f"{baseline_model}_{dataset_name_cut}_responses.jsonl")
|
| 240 |
+
output_filename = os.path.join(data_root_dir, "judgements", f"{candidate_model}_vs_{baseline_model}_{dataset_name_cut}.jsonl")
|
| 241 |
+
df_cand = get_df(
|
| 242 |
+
input_filename=input_cand_filename,
|
| 243 |
output_filename=output_filename,
|
| 244 |
)
|
| 245 |
+
df_base = get_df(
|
| 246 |
+
input_filename=input_base_filename,
|
| 247 |
+
)
|
| 248 |
|
| 249 |
if swap:
|
| 250 |
+
df_cand, df_base = df_base, df_cand
|
| 251 |
if sync:
|
| 252 |
models_answers, eval_results = generate_sync(
|
| 253 |
+
df_cand=df_cand, df_base=df_base, eval_model=eval_model, host=host
|
| 254 |
)
|
| 255 |
else:
|
| 256 |
models_answers, eval_results = [], []
|
| 257 |
+
bar = tqdm(total=len(df_cand))
|
| 258 |
+
for i in range(0, len(df_cand), chunk_size):
|
| 259 |
model_answer, eval_result = asyncio.run(
|
| 260 |
generate(
|
| 261 |
+
df_cand=df_cand[i:i + chunk_size],
|
| 262 |
+
df_base=df_base[i:i + chunk_size],
|
|
|
|
| 263 |
eval_model=eval_model,
|
| 264 |
host=host,
|
| 265 |
)
|
|
|
|
| 279 |
cat_only_accuracy = defaultdict(int)
|
| 280 |
cat_tie_accuracy = defaultdict(int)
|
| 281 |
|
| 282 |
+
for instance, eval_result, answers in zip(df_cand, eval_results, models_answers):
|
| 283 |
if eval_result is None:
|
| 284 |
total -= 1
|
| 285 |
continue
|
|
|
|
| 360 |
candidate_model=candidate_model,
|
| 361 |
four_numbers=[accuracy, other_accuracy, both_good_accuracy, both_bad_accuracy],
|
| 362 |
two_numbers=pao_to_2,
|
| 363 |
+
input_filename=input_cand_filename,
|
| 364 |
swap=swap,
|
| 365 |
)
|
| 366 |
|
|
|
|
| 376 |
help="Хостнейм, на котором крутится модель",
|
| 377 |
)
|
| 378 |
parser.add_argument(
|
| 379 |
+
"--data-root-dir",
|
| 380 |
type=str,
|
| 381 |
help="Файл который надо оценить",
|
| 382 |
)
|
| 383 |
+
parser.add_argument(
|
| 384 |
+
"--dataset-name",
|
| 385 |
+
type=str,
|
| 386 |
+
help="Название бенчмарка",
|
| 387 |
+
)
|
| 388 |
parser.add_argument("--output-filename", type=str, default="judge_results_with_probs.jsonl")
|
| 389 |
parser.add_argument(
|
| 390 |
"--sleep-time",
|
|
|
|
| 433 |
|
| 434 |
for i in range(len(swaps)):
|
| 435 |
results[swaps[i]] = main(
|
| 436 |
+
data_root_dir=args.data_root_dir,
|
| 437 |
output_filename=f"swap_{i}_{args.candidate_model}",
|
| 438 |
baseline_model=args.baseline_model,
|
| 439 |
candidate_model=args.candidate_model,
|
src/evaluate/generate_answers.py
CHANGED
|
@@ -30,16 +30,13 @@ def write_response_jsonl(response_text, counter, question, model_name, output_fi
|
|
| 30 |
"question_id": question["question_id"][counter],
|
| 31 |
"cluster": question["cluster"][counter],
|
| 32 |
"turns": question["turns"][counter],
|
| 33 |
-
"replies": question.get("replies", [])
|
| 34 |
}
|
| 35 |
|
| 36 |
-
cur_dict["replies"]
|
| 37 |
-
{
|
| 38 |
"message_id": message_id,
|
| 39 |
"text": response_text,
|
| 40 |
"model_name": model_name,
|
| 41 |
-
|
| 42 |
-
)
|
| 43 |
|
| 44 |
with open(output_filename, "a") as f:
|
| 45 |
json.dump(cur_dict, f, ensure_ascii=False)
|
|
|
|
| 30 |
"question_id": question["question_id"][counter],
|
| 31 |
"cluster": question["cluster"][counter],
|
| 32 |
"turns": question["turns"][counter],
|
|
|
|
| 33 |
}
|
| 34 |
|
| 35 |
+
cur_dict["replies"] = {
|
|
|
|
| 36 |
"message_id": message_id,
|
| 37 |
"text": response_text,
|
| 38 |
"model_name": model_name,
|
| 39 |
+
}
|
|
|
|
| 40 |
|
| 41 |
with open(output_filename, "a") as f:
|
| 42 |
json.dump(cur_dict, f, ensure_ascii=False)
|