import os

# Pin the GPU and the Hugging Face cache locations before any HF library is imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["HF_HOME"] = "../../cache/hgCache"
os.environ["TRANSFORMERS_CACHE"] = "../../cache/transformersCache/"

from collections import defaultdict

import pandas as pd
import pytrec_eval
import tqdm
from FlagEmbedding import BGEM3FlagModel

datasetnames = [
    "fiqa2018",
    "climatefever",
    "dbpedia",
    "fever",
    "hotpotqa",
    "nfcorpus",
    "nq",
    "quoraretrieval",
    "scidocs",
    "arguana",
    "scifact",
    "touche2020",
]
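
# Each name must correspond to a directory under datasets/<name>/ containing
# corpus/, queries/, and qrels/ parquet files, as read below.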

# BGE-M3 produces dense, sparse (lexical), and multi-vector (ColBERT) representations.
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

for datasetname in datasetnames:
    evalResultsDf = None

    # Load the corpus and queries for this dataset, dropping rows with missing values.
    dfDocs = pd.read_parquet(
        f"datasets/{datasetname}/corpus/train-00000-of-00001.parquet"
    ).dropna()
    dfQueries = pd.read_parquet(
        f"datasets/{datasetname}/queries/train-00000-of-00001.parquet"
    ).dropna()

    queries = []
    documents = []
    # qrels structure for pytrec_eval: query_id -> {doc_id: relevance}, defaulting to 0.
    relevant_docs = defaultdict(lambda: defaultdict(int))

    # In the parquet files, column 0 holds the original string id and column 2 the text.
    newId2oldId_Docs = {}
    for i, row in enumerate(dfDocs.values):
        documents.append(row[2])
        newId2oldId_Docs[i] = str(row[0])

    newId2oldId_Queries = {}
    for i, row in enumerate(dfQueries.values):
        queries.append(row[2])
        newId2oldId_Queries[i] = str(row[0])

        # Initialise every (query, document) pair as non-relevant; judged pairs
        # are overwritten from the qrels below.
        for rowDoc in dfDocs.values:
            relevant_docs[str(row[0])][str(rowDoc[0])] = 0

    # Mark judged pairs as relevant (qrels column 0: query id, column 1: doc id).
    dfQrels = pd.read_parquet(
        f"datasets/{datasetname}/qrels/train-00000-of-00001.parquet"
    )
    for row in dfQrels.values:
        relevant_docs[str(row[0])][str(row[1])] = 1

    # TREC-style run to be filled with retrieval scores: query_id -> {doc_id: score}.
    run = {}

    # Encode the full corpus once per dataset.
    document_embeddings = model.encode(
        documents,
        batch_size=4,
        max_length=512,
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=True,
    )
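    # With these flags, encode() returns a dict containing "dense_vecs",
    # "lexical_weights", and "colbert_vecs"; only the ColBERT vectors are
    # used for scoring below.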

    for i, query in enumerate(tqdm.tqdm(queries)):
        # Encode one query at a time (queries are short, hence max_length=32).
        queries_embeddings = model.encode(
            [query],
            max_length=32,
            return_dense=True,
            return_sparse=True,
            return_colbert_vecs=True,
        )

        # Late-interaction (MaxSim) score between this query and every document.
        similarities = []
        for j in range(len(documents)):
            similarities.append(
                model.colbert_score(
                    queries_embeddings["colbert_vecs"][0],
                    document_embeddings["colbert_vecs"][j],
                )
            )
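        # The dense and sparse outputs are computed but unused here. A hybrid
        # score could be sketched (assuming FlagEmbedding's documented API) as:
        #   dense = float(queries_embeddings["dense_vecs"][0] @ document_embeddings["dense_vecs"][j])
        #   sparse = model.compute_lexical_matching_score(
        #       queries_embeddings["lexical_weights"][0],
        #       document_embeddings["lexical_weights"][j],
        #   )
        #   hybrid = (dense + sparse + float(similarities[j])) / 3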

        # Record scores under the original string ids expected by pytrec_eval.
        run[newId2oldId_Queries[i]] = {}
        for j, score in enumerate(similarities):
            run[newId2oldId_Queries[i]][newId2oldId_Docs[j]] = float(score)

    # Evaluate the run against the qrels with every measure pytrec_eval supports.
    evaluator = pytrec_eval.RelevanceEvaluator(
        relevant_docs, pytrec_eval.supported_measures
    )
    scores = evaluator.evaluate(run)
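    # scores maps each query id to a dict of measure name -> value,
    # e.g. scores[qid]["map"] or scores[qid]["ndcg"].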

    def print_line(measure, scope, value):
        print("{:25s}{:8s}{:.4f}".format(measure, scope, value))

    # Take the first query's measures (the loop exits immediately) both as a
    # sanity-check printout and, below, as the list of measure names.
    for query_id, query_measures in sorted(scores.items()):
        break
    for measure, value in sorted(query_measures.items()):
        print_line(measure, query_id, value)

    # Aggregate every measure over all queries of this dataset.
    resultsColumns = ["model name"]
    resultsRow = ["bgem3"]
    for measure in sorted(query_measures.keys()):
        resultsColumns.append(measure)
        resultsRow.append(
            pytrec_eval.compute_aggregated_measure(
                measure, [qm[measure] for qm in scores.values()]
            )
        )

    # Store the aggregated row and write one CSV per dataset.
    if evalResultsDf is None:
        evalResultsDf = pd.DataFrame(columns=resultsColumns)
    evalResultsDf.loc[-1] = resultsRow
    evalResultsDf.index = evalResultsDf.index + 1

    evalResultsDf.to_csv(f"results/{datasetname}_bgem3.csv", encoding="utf-8")
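# Each results/<dataset>_bgem3.csv ends up with a single row: the model name
# followed by one column per aggregated pytrec_eval measure.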