# TrColBERT / assets/evalPytrec.py
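#
# Evaluates ColBERT-style late-interaction retrievers with pytrec_eval on
# locally stored parquet datasets (corpus / queries / qrels per dataset),
# writing one CSV of aggregated measures per dataset.
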
import os

# Set GPU visibility and cache locations before importing the ML libraries.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["HF_HOME"] = "../../cache/hgCache"
os.environ["TRANSFORMERS_CACHE"] = "../../cache/transformersCache/"

from collections import defaultdict

import pandas as pd
import pytrec_eval
import torch
import tqdm
from pylate import models, rank
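
# Maximum document length, in tokens, passed to the ColBERT encoders
# (longer documents are truncated).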
document_length = 512
model_name_or_paths = [
    "99eren99/TrColBERT",
    "jinaai/jina-colbert-v2",
    "antoinelouis/colbert-xm",
]
datasetnames = [
    "fiqa2018",
    "climatefever",
    "dbpedia",
    "fever",
    "hotpotqa",
    # "msmarco",
    "nfcorpus",
    "nq",
    "quoraretrieval",
    "scidocs",
    "arguana",
    "scifact",
    "touche2020",
]
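
# Each dataset name must match a folder under datasets/ holding corpus/,
# queries/, and qrels/ parquet splits (see the read_parquet calls below).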
for datasetname in datasetnames:
    print("#############", datasetname, "##############")
    evalResultsDf = None

    for model_name_or_path in model_name_or_paths:
        torch.cuda.empty_cache()

        if model_name_or_path == "jinaai/jina-colbert-v2":
            model = models.ColBERT(
                model_name_or_path=model_name_or_path,
                query_prefix="[QueryMarker]",
                document_prefix="[DocumentMarker]",
                attend_to_expansion_tokens=True,
                trust_remote_code=True,
                document_length=document_length,
            )
        elif model_name_or_path == "antoinelouis/colbert-xm":
            model = models.ColBERT(model_name_or_path="antoinelouis/colbert-xm")
            # Use a code from https://huggingface.co/facebook/xmod-base#languages
            language = "tr_TR"
            backbone = model[0].auto_model
            if backbone.__class__.__name__.lower().startswith("xmod"):
                backbone.set_default_language(language)
        else:
            model = models.ColBERT(
                model_name_or_path=model_name_or_path,
                document_length=document_length,
                attend_to_expansion_tokens="attend" in model_name_or_path,
            )

        model.eval()
        model.to("cuda")
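
        # Load the corpus and query texts for this dataset, dropping rows
        # with missing values.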
        dfDocs = pd.read_parquet(
            f"datasets/{datasetname}/corpus/train-00000-of-00001.parquet"
        ).dropna()
        dfQueries = pd.read_parquet(
            f"datasets/{datasetname}/queries/train-00000-of-00001.parquet"
        ).dropna()
        if model_name_or_path == "99eren99/TrColBERT":
            # The tokenizer does not use token_type_ids; drop them if present.
            try:
                model.tokenizer.model_input_names.remove("token_type_ids")
            except (AttributeError, ValueError):
                print(model_name_or_path)
            # Turkish-aware lowercasing: map dotted/dotless capital I
            # correctly before the locale-unaware str.lower().
            dfDocs.TurkishText = dfDocs.TurkishText.apply(
                lambda x: x.replace("İ", "i").replace("I", "ı").lower()
            )
            dfQueries.TurkishText = dfQueries.TurkishText.apply(
                lambda x: x.replace("İ", "i").replace("I", "ı").lower()
            )
        # Build the evaluation inputs: texts, old<->new id mappings, and qrels.
        queries = []
        documents = []
        relevant_docs = defaultdict(lambda: defaultdict(int))

        # Read the corpus; row[0] is the original document id, row[2] its text.
        newId2oldId_Docs = {}
        for i, row in enumerate(dfDocs.values):
            documents.append(row[2])
            newId2oldId_Docs[i] = str(row[0])

        # Read the queries, marking every (query, document) pair as
        # non-relevant; the qrels below overwrite the relevant pairs with 1.
        newId2oldId_Queries = {}
        for i, row in enumerate(dfQueries.values):
            queries.append(row[2])
            newId2oldId_Queries[i] = str(row[0])
            for rowDoc in dfDocs.values:
                relevant_docs[str(row[0])][str(rowDoc[0])] = 0
        # Read the qrels; row[0] is a query id, row[1] a relevant document id.
        dfQrels = pd.read_parquet(
            f"datasets/{datasetname}/qrels/train-00000-of-00001.parquet"
        )
        for row in dfQrels.values:
            relevant_docs[str(row[0])][str(row[1])] = 1

        # Every document is a candidate for every query (full re-ranking).
        candidateIds = [list(range(len(documents)))]
        run = {}

        # Encode the whole corpus once; queries are encoded one by one below.
        documents_embeddings = model.encode(
            [documents], is_query=False, show_progress_bar=True
        )
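
        # Score each query against the full corpus and collect a pytrec_eval
        # run: {query_id: {doc_id: score}}.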
        for i, query in enumerate(tqdm.tqdm(queries)):
            queries_embeddings = model.encode(
                [query],
                is_query=True,
            )
            reranked_documents = rank.rerank(
                documents_ids=candidateIds,
                queries_embeddings=queries_embeddings,
                documents_embeddings=documents_embeddings,
            )
            # Map internal indices back to the original ids for pytrec_eval.
            run[newId2oldId_Queries[i]] = {}
            for resDict in reranked_documents[0]:
                run[newId2oldId_Queries[i]][newId2oldId_Docs[resDict["id"]]] = float(
                    resDict["score"]
                )
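
        # pytrec_eval takes qrels as {query_id: {doc_id: relevance}} and
        # scores the run on every supported measure, per query.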
        evaluator = pytrec_eval.RelevanceEvaluator(
            relevant_docs, pytrec_eval.supported_measures
        )
        scores = evaluator.evaluate(run)

        def print_line(measure, scope, value):
            print("{:25s}{:8s}{:.4f}".format(measure, scope, value))

        # Scope hack: bind query_measures to the first query's results to
        # figure out all unique measure names below; the immediate break
        # skips the verbose per-query printing.
        for query_id, query_measures in sorted(scores.items()):
            break
        # Uncomment to print per-query measures:
        # for measure, value in sorted(query_measures.items()):
        #     print_line(measure, query_id, value)
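
        # Aggregate each measure over all evaluated queries into a single
        # results row for this model.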
        resultsColumns = ["model name"]
        resultsRow = [model_name_or_path]
        for measure in sorted(query_measures.keys()):
            resultsColumns.append(measure)
            resultsRow.append(
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [query_measures[measure] for query_measures in scores.values()],
                )
            )

        if evalResultsDf is None:
            evalResultsDf = pd.DataFrame(columns=resultsColumns)
        # Prepend this model's row, then rewrite the CSV so partial results
        # survive an interrupted run.
        evalResultsDf.loc[-1] = resultsRow
        evalResultsDf.index = evalResultsDf.index + 1
        os.makedirs("resultsn", exist_ok=True)
        evalResultsDf.to_csv(f"resultsn/{datasetname}.csv", encoding="utf-8")