# TrColBERT/assets/evalPytrecBGEm3.py
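# Brute-force BGE-M3 ColBERT evaluation: encode each corpus and query set, score every
# query-document pair with colbert_score, and report pytrec_eval measures per dataset.
# The GPU / cache environment variables below are set before any HF-backed import runs.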
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["HF_HOME"] = "../../cache/hgCache"
os.environ["TRANSFORMERS_CACHE"] = "../../cache/transformersCache/"
import gzip
import logging
import sys
from collections import defaultdict
import numpy as np
import pytrec_eval
import tqdm
import pandas as pd
from pylate import models, rank
from FlagEmbedding import BGEM3FlagModel
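# Retrieval datasets evaluated in this run; each is expected as local parquet files
# with corpus/, queries/, and qrels/ subfolders (see the read_parquet calls below).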
datasetnames = [
    "fiqa2018",
    "climatefever",
    "dbpedia",
    "fever",
    "hotpotqa",
    # "msmarco",
    "nfcorpus",
    "nq",
    "quoraretrieval",
    "scidocs",
    "arguana",
    "scifact",
    "touche2020",
]
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)
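# Create the output directory for the per-dataset CSVs if it does not already exist.
os.makedirs("results", exist_ok=True)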
for datasetname in datasetnames:
    evalResultsDf = None
    dfDocs = pd.read_parquet(
        f"datasets/{datasetname}/corpus/train-00000-of-00001.parquet"
    ).dropna()
    dfQueries = pd.read_parquet(
        f"datasets/{datasetname}/queries/train-00000-of-00001.parquet"
    ).dropna()
    # Containers for queries, corpus, and relevance judgments
    queries = []
    documents = []
    passage_cand = {}
    relevant_qid = []
    relevant_docs = defaultdict(lambda: defaultdict(int))
    # Read corpus: row[0] is the original document id, row[2] the document text
    newId2oldId_Docs = {}
    for i, row in enumerate(dfDocs.values):
        documents.append(row[2])
        newId2oldId_Docs[i] = str(row[0])
        relevant_qid.append(str(row[0]))
    # Read queries and pre-fill the qrels with 0 (non-relevant) for every query-doc pair
    newId2oldId_Queries = {}
    for i, row in enumerate(dfQueries.values):
        queries.append(row[2])
        newId2oldId_Queries[i] = str(row[0])
        for j, rowDoc in enumerate(dfDocs.values):
            relevant_docs[str(row[0])][str(rowDoc[0])] = 0
    # Read qrels and mark the judged pairs as relevant
    dfQrels = pd.read_parquet(
        f"datasets/{datasetname}/qrels/train-00000-of-00001.parquet"
    )
    for i, row in enumerate(dfQrels.values):
        relevant_docs[str(row[0])][str(row[1])] = 1
    candidateIds = [[i for i in range(len(documents))]]  # not used below
    queries_result_list = []  # not used below
    run = {}  # TREC-style run: {query_id: {doc_id: score}}
    # Encode the whole corpus once; only the ColBERT (multi-vector) outputs are used below
    document_embeddings = model.encode(
        documents,
        batch_size=4,
        max_length=512,
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=True,
    )
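    # document_embeddings["colbert_vecs"] is a list holding one per-token vector matrix
    # (num_tokens x dim) per document; each is compared against the query matrix below.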
    for i, query in enumerate(tqdm.tqdm(queries)):
        queries_embeddings = model.encode(
            [query],
            max_length=32,
            return_dense=True,
            return_sparse=True,
            return_colbert_vecs=True,
        )
        # Late-interaction (MaxSim) score between the query and every document
        similarities = []
        for j in range(len(documents)):
            similarities.append(
                model.colbert_score(
                    queries_embeddings["colbert_vecs"][0],
                    document_embeddings["colbert_vecs"][j],
                )
            )
        run[newId2oldId_Queries[i]] = {}
        for j, score in enumerate(similarities):
            run[newId2oldId_Queries[i]][newId2oldId_Docs[j]] = float(score)
    evaluator = pytrec_eval.RelevanceEvaluator(
        relevant_docs, pytrec_eval.supported_measures
    )
    scores = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print("{:25s}{:8s}{:.4f}".format(measure, scope, value))

    for query_id, query_measures in sorted(scores.items()):
        break  # only the first query's measure dict is needed below
    for measure, value in sorted(query_measures.items()):
        print_line(measure, query_id, value)
    # Scope hack: use query_measures from the previous loop to
    # figure out all unique measure names.
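    # Aggregate each measure over all queries (a mean for most measures) into one row.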
    resultsColumns = ["model name"]
    resultsRow = ["bgem3"]
    for measure in sorted(query_measures.keys()):
        resultsColumns.append(measure)
        resultsRow.append(
            pytrec_eval.compute_aggregated_measure(
                measure, [query_measures[measure] for query_measures in scores.values()]
            )
        )
    # Add the aggregated row and write one CSV per dataset.
    if evalResultsDf is None:
        evalResultsDf = pd.DataFrame(columns=resultsColumns)
    evalResultsDf.loc[-1] = resultsRow
    evalResultsDf.index = evalResultsDf.index + 1
    evalResultsDf.to_csv(f"results/{datasetname}_bgem3.csv", encoding="utf-8")