|
import os

# GPU / cache configuration. These must be assigned BEFORE importing
# transformers/pylate below, because those libraries read the environment
# at import time.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

os.environ["HF_HOME"] = "../../cache/hgCache"

# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favor of HF_HOME — confirm it is still required here.
os.environ["TRANSFORMERS_CACHE"] = "../../cache/transformersCache/"


# NOTE(review): glob, logging, sys and numpy appear unused in this file —
# verify before removing.
import glob

import logging

import sys

from collections import defaultdict


import numpy as np

import pytrec_eval

import tqdm, torch

import pandas as pd

from pylate import models, rank
|
|
|
|
|
# Maximum document token length passed to ColBERT encoders below.
document_length = 512

# ColBERT checkpoints to benchmark. Each id must exactly match the string
# comparisons performed in the evaluation loop, which select model-specific
# setup (query/document markers, XMOD language, Turkish preprocessing).
model_name_or_paths = [
    # Fixed: was "9eren99/TrColBERT" (missing a '9'), which never matched the
    # "99eren99/TrColBERT" check guarding the Turkish-specific tokenizer and
    # casing preprocessing, leaving that branch dead code.
    "99eren99/TrColBERT",
    "jinaai/jina-colbert-v2",
    "antoinelouis/colbert-xm",
]

# Retrieval datasets to evaluate; each is read from parquet files under
# datasets/<name>/{corpus,queries,qrels}/.
datasetnames = [
    "fiqa2018",
    "climatefever",
    "dbpedia",
    "fever",
    "hotpotqa",
    "nfcorpus",
    "nq",
    "quoraretrieval",
    "scidocs",
    "arguana",
    "scifact",
    "touche2020",
]
|
# Main evaluation: for every dataset, score every model and append the
# aggregated pytrec_eval measures as one row of a per-dataset CSV.
for datasetname in datasetnames:
    print("#############", datasetname, "##############")
    # One results DataFrame per dataset; columns are created lazily from the
    # first model's measure names.
    evalResultsDf = None
    for model_name_or_path in model_name_or_paths:
        # Free GPU memory held by the previous model before loading the next.
        torch.cuda.empty_cache()
        if "jinaai/jina-colbert-v2" == model_name_or_path:
            # jina-colbert-v2 uses custom query/document marker tokens and
            # custom modeling code (hence trust_remote_code).
            model = models.ColBERT(
                model_name_or_path=model_name_or_path,
                query_prefix="[QueryMarker]",
                document_prefix="[DocumentMarker]",
                attend_to_expansion_tokens=True,
                trust_remote_code=True,
                document_length=document_length,
            )
        elif "antoinelouis/colbert-xm" == model_name_or_path:
            # NOTE(review): document_length is NOT passed here, unlike the
            # other branches — confirm whether that is intentional.
            model = models.ColBERT(model_name_or_path="antoinelouis/colbert-xm")
            language = "tr_TR"

            # colbert-xm is built on an XMOD backbone; select the Turkish
            # language adapter when present.
            backbone = model[0].auto_model
            if backbone.__class__.__name__.lower().startswith("xmod"):
                backbone.set_default_language(language)
        else:
            # Generic ColBERT checkpoint. attend_to_expansion_tokens is
            # toggled purely by the substring "attend" in the model id.
            model = models.ColBERT(
                model_name_or_path=model_name_or_path,
                document_length=document_length,
                attend_to_expansion_tokens=(
                    True if "attend" in model_name_or_path else False
                ),
            )

        model.eval()
        model.to("cuda")

        # Corpus and query parquet files; rows with any NaN are dropped.
        # Column layout assumption (see row[0]/row[2] use below): column 0 is
        # the original id, column 2 the text — TODO confirm against the data.
        dfDocs = pd.read_parquet(
            f"datasets/{datasetname}/corpus/train-00000-of-00001.parquet"
        ).dropna()
        dfQueries = pd.read_parquet(
            f"datasets/{datasetname}/queries/train-00000-of-00001.parquet"
        ).dropna()

        # Turkish-specific setup for TrColBERT: drop token_type_ids from the
        # tokenizer inputs and apply Turkish-aware lowercasing (İ→i, I→ı).
        # NOTE(review): this literal must match the entry in
        # model_name_or_paths exactly — verify the "99eren99" spelling against
        # the list above, otherwise this branch is silently skipped.
        if "99eren99/TrColBERT" == model_name_or_path:
            try:
                model.tokenizer.model_input_names.remove("token_type_ids")
            # NOTE(review): bare except swallows everything (even
            # KeyboardInterrupt); except ValueError would be the precise guard
            # for list.remove failing.
            except:
                print(model_name_or_path)
            dfDocs.TurkishText = dfDocs.TurkishText.apply(
                lambda x: x.replace("İ", "i").replace("I", "ı").lower()
            )
            dfQueries.TurkishText = dfQueries.TurkishText.apply(
                lambda x: x.replace("İ", "i").replace("I", "ı").lower()
            )

        queries = []
        documents = []
        # NOTE(review): passage_cand, relevant_qid and queries_result_list
        # (below) are populated or declared but never read — dead state?
        passage_cand = {}
        relevant_qid = []
        # qrels mapping: query_id -> {doc_id -> relevance}; defaultdict so
        # missing entries default to 0 (non-relevant).
        relevant_docs = defaultdict(lambda: defaultdict(int))

        # Re-index documents 0..N-1 for pylate while remembering original ids.
        newId2oldId_Docs = {}
        for i, row in enumerate(dfDocs.values):
            documents.append(row[2])
            newId2oldId_Docs[i] = str(row[0])
            relevant_qid.append(str(row[0]))

        # Same re-indexing for queries.
        newId2oldId_Queries = {}
        for i, row in enumerate(dfQueries.values):
            queries.append(row[2])
            newId2oldId_Queries[i] = str(row[0])

        # NOTE(review): 'row' below is the LEFTOVER variable from the queries
        # loop above, so only the LAST query gets zero-filled doc entries. If
        # every query was meant to be zero-initialised, this loop should be
        # nested inside the queries loop. pytrec_eval treats absent qrels as
        # non-relevant anyway, so these zeros are likely redundant — confirm.
        for j, rowDoc in enumerate(dfDocs.values):
            relevant_docs[str(row[0])][str(rowDoc[0])] = 0

        # Ground-truth judgments: column 0 = query id, column 1 = doc id;
        # every judged pair is marked relevant (binary, level 1).
        dfQrels = pd.read_parquet(
            f"datasets/{datasetname}/qrels/train-00000-of-00001.parquet"
        )
        for i, row in enumerate(dfQrels.values):
            relevant_docs[str(row[0])][str(row[1])] = 1

        # Every query is reranked against the full corpus (exhaustive
        # candidate list of all new doc ids).
        candidateIds = [[i for i in range(len(documents))]]

        queries_result_list = []
        # TREC-style run: query_id -> {doc_id -> score}.
        run = {}

        # Encode the whole corpus once; queries are encoded one at a time in
        # the loop below (pylate expects a list of lists).
        documents_embeddings = model.encode(
            [documents], is_query=False, show_progress_bar=True
        )

        for i, query in enumerate(tqdm.tqdm(queries)):

            queries_embeddings = model.encode(
                [query],
                is_query=True,
            )

            # MaxSim rerank of this query against all precomputed document
            # embeddings; [0] below unwraps the single-query batch.
            reranked_documents = rank.rerank(
                documents_ids=candidateIds,
                queries_embeddings=queries_embeddings,
                documents_embeddings=documents_embeddings,
            )

            # Translate new ids back to original ids for pytrec_eval.
            run[newId2oldId_Queries[i]] = {}
            for resDict in reranked_documents[0]:
                run[newId2oldId_Queries[i]][newId2oldId_Docs[resDict["id"]]] = float(
                    resDict["score"]
                )

        # Evaluate the run against the qrels with every measure pytrec_eval
        # supports.
        evaluator = pytrec_eval.RelevanceEvaluator(
            relevant_docs, pytrec_eval.supported_measures
        )
        scores = evaluator.evaluate(run)

        def print_line(measure, scope, value):
            """Print one aligned 'measure  scope  value' row."""
            print("{:25s}{:8s}{:.4f}".format(measure, scope, value))

        # Grab the FIRST (query_id, query_measures) pair only — the immediate
        # break is deliberate; query_id and query_measures intentionally leak
        # out of the loop and are reused below as a template for the list of
        # measure names.
        for query_id, query_measures in sorted(scores.items()):
            break
        # Print that first query's per-measure values as a sanity check.
        for measure, value in sorted(query_measures.items()):
            print_line(measure, query_id, value)

        # Aggregate each measure over ALL queries (query_measures here only
        # supplies the measure names; the comprehension rebinds the name over
        # scores.values()).
        resultsColumns = ["model name"]
        resultsRow = [model_name_or_path]
        for measure in sorted(query_measures.keys()):
            resultsColumns.append(measure)
            resultsRow.append(
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [query_measures[measure] for query_measures in scores.values()],
                )
            )

        # Append the row at index -1 then shift all indices up by one — a
        # prepend-style append; rows end up in reverse insertion order.
        if evalResultsDf is None:
            evalResultsDf = pd.DataFrame(columns=resultsColumns)
        evalResultsDf.loc[-1] = resultsRow
        evalResultsDf.index = evalResultsDf.index + 1

    # NOTE(review): "resultsn/" looks like a typo for "results/" — confirm;
    # also assumes the directory already exists (to_csv will not create it).
    evalResultsDf.to_csv(f"resultsn/{datasetname}.csv", encoding="utf-8")
|
|