""" |
MTEB Evaluation Script for Distilled Model - Code-Focused Tasks.

This script evaluates the distilled gte-Qwen2-7B-instruct model using MTEB
(Massive Text Embedding Benchmark) with a focus on tasks relevant for code:

- Classification: tests the ability to distinguish between different categories (e.g., programming languages)
- Clustering: tests the ability to group similar code by functionality
- STS: tests semantic similarity understanding between code snippets
- Retrieval: tests code search and duplicate detection capabilities

Features:
- Incremental evaluation: skips tasks that already have results in mteb_results/
- Combines existing and new results automatically
- Saves results in multiple formats for analysis

Usage:
    python MTEB_evaluate.py

Configuration:
- Set EVAL_ALL_TASKS = False to use only CODE_SPECIFIC_TASKS
- Modify CODE_SPECIFIC_TASKS for granular task selection
  (see the commented example below the configuration constants)
"""
|
|
import json
import logging
import sys
import time
from pathlib import Path

import mteb
from model2vec import StaticModel
from mteb import ModelMeta

from evaluation import (
    CustomMTEB,
    get_tasks,
    make_leaderboard,
    parse_mteb_results,
    summarize_results,
)
|
|
# Path to the distilled model and the name used when saving results.
MODEL_PATH = "."
MODEL_NAME = "gte-Qwen2-7B-instruct-M2V-Distilled"

# Directory where all MTEB results and reports are written.
OUTPUT_DIR = "mteb_results"

# Evaluate the full MTEB task suite; set to False to use only CODE_SPECIFIC_TASKS.
EVAL_ALL_TASKS = True

# Focused, code-relevant task subset used when EVAL_ALL_TASKS is False.
CODE_SPECIFIC_TASKS = [
    "Banking77Classification",
    "StackExchangeClustering.v2",
    "STSBenchmark",
    "CQADupstackProgrammersRetrieval",
    "SprintDuplicateQuestions",
]

# Dataset splits to evaluate and MTEB logging verbosity.
EVAL_SPLITS = ["test"]
VERBOSITY = 2
|
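# Example (a hedged sketch of an alternative configuration, not the defaults above):
# to evaluate only a couple of the focused tasks, flip the flag and trim the list:
#
#   EVAL_ALL_TASKS = False
#   CODE_SPECIFIC_TASKS = [
#       "STSBenchmark",
#       "CQADupstackProgrammersRetrieval",
#   ]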
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
|
|
def check_existing_results(output_path: Path, tasks: list) -> list:
    """Check for existing task results and filter out completed tasks."""
    remaining_tasks = []
    completed_tasks = []

    for task in tasks:
        task_name = task.metadata.name
        result_file = output_path / MODEL_NAME / f"{task_name}.json"
        if result_file.exists():
            completed_tasks.append(task_name)
            logger.info(f"Skipping {task_name} - results already exist")
        else:
            remaining_tasks.append(task)

    if completed_tasks:
        logger.info(f"Found existing results for {len(completed_tasks)} tasks: {completed_tasks}")

    return remaining_tasks
|
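# Note (assumption about on-disk layout): the check above expects one JSON file per
# task under a folder named after the model, i.e. roughly:
#
#   mteb_results/
#       gte-Qwen2-7B-instruct-M2V-Distilled/
#           Banking77Classification.json
#           STSBenchmark.json
#           ...
#
# If your MTEB version nests results differently (for example under an extra
# revision subfolder), adjust the `result_file` path in check_existing_results.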
|
def load_existing_parsed_results(output_path: Path) -> dict:
    """Load existing parsed results if they exist."""
    parsed_results_file = output_path / "mteb_parsed_results.json"
    if parsed_results_file.exists():
        try:
            with parsed_results_file.open("r") as f:
                return json.load(f)
        except (json.JSONDecodeError, OSError) as e:
            logger.warning(f"Could not load existing parsed results: {e}")
    return {}
|
|
def load_and_display_existing_results(output_path: Path) -> None:
    """Load and display existing MTEB results."""
    summary_file = output_path / "mteb_summary.json"
    if summary_file.exists():
        with summary_file.open("r") as f:
            summary = json.load(f)

        logger.info("=" * 80)
        logger.info("EXISTING MTEB EVALUATION RESULTS:")
        logger.info("=" * 80)

        stats = summary.get("summary_stats")
        if stats:
            logger.info(f"Total Datasets: {stats.get('total_datasets', 'N/A')}")
            logger.info(f"Average Score: {stats.get('average_score', 0):.4f}")
            logger.info(f"Median Score: {stats.get('median_score', 0):.4f}")

        logger.info("=" * 80)
    else:
        logger.info("No existing summary found. Individual task results may still exist.")
|
|
def run_mteb_evaluation() -> None:
    """Run MTEB evaluation using the evaluation package."""
    output_path = Path(OUTPUT_DIR)
    output_path.mkdir(parents=True, exist_ok=True)

    logger.info(f"Loading model from {MODEL_PATH}")
    model = StaticModel.from_pretrained(MODEL_PATH)
    logger.info("Model loaded successfully")

    # Attach model metadata so MTEB labels the results with this model's name.
    model.mteb_model_meta = ModelMeta(
        name=MODEL_NAME, revision="distilled", release_date=None, languages=["eng"]
    )

    # Select the task set: either the full MTEB suite or the focused code-relevant subset.
    if EVAL_ALL_TASKS:
        logger.info("Getting all MTEB tasks")
        all_tasks = get_tasks()
    else:
        logger.info("Getting focused code-relevant MTEB tasks")
        logger.info(f"Selected specific tasks: {CODE_SPECIFIC_TASKS}")
        all_tasks = [mteb.get_task(task_name, languages=["eng"]) for task_name in CODE_SPECIFIC_TASKS]

    logger.info(f"Found {len(all_tasks)} total tasks")

    # Incremental evaluation: skip tasks that already have results on disk.
    tasks = check_existing_results(output_path, all_tasks)
    logger.info(f"Will evaluate {len(tasks)} remaining tasks")

    if not tasks:
        logger.info("No new tasks to evaluate - all tasks already completed!")
        logger.info("Loading existing results...")
        try:
            load_and_display_existing_results(output_path)
        except (json.JSONDecodeError, OSError, KeyError) as e:
            logger.warning(f"Could not load existing results: {e}")
        return

    evaluation = CustomMTEB(tasks=tasks)

    logger.info("Starting MTEB evaluation...")
    start_time = time.time()
    results = evaluation.run(model, eval_splits=EVAL_SPLITS, output_folder=str(output_path), verbosity=VERBOSITY)
    end_time = time.time()
    evaluation_time = end_time - start_time
    logger.info(f"Evaluation completed in {evaluation_time:.2f} seconds")

    logger.info("Parsing and summarizing results...")
    parsed_results = parse_mteb_results(mteb_results=results, model_name=MODEL_NAME)

    # Merge previously saved parsed results with the new ones, preferring new scores.
    existing_results = load_existing_parsed_results(output_path)
    if existing_results:
        logger.info("Combining with existing results...")
        parsed_dict = dict(parsed_results) if hasattr(parsed_results, "items") else {}
        for key, value in existing_results.items():
            if key not in parsed_dict:
                parsed_dict[key] = value
        parsed_results = parsed_dict

    task_scores = summarize_results(parsed_results)

    save_results(output_path, results, parsed_results, task_scores, evaluation_time)

    logger.info("MTEB Evaluation Results:")
    logger.info("=" * 80)
    leaderboard = make_leaderboard(task_scores)
    logger.info(leaderboard.to_string(index=False))
    logger.info("=" * 80)

    logger.info(f"Evaluation completed successfully. Results saved to {OUTPUT_DIR}")
|
|
def save_results(
    output_path: Path, raw_results: list, parsed_results: dict, task_scores: dict, evaluation_time: float
) -> None:
    """Save evaluation results in multiple formats."""
    # Raw MTEB results as returned by evaluation.run.
    raw_results_file = output_path / "mteb_raw_results.json"
    with raw_results_file.open("w") as f:
        json.dump(raw_results, f, indent=2, default=str)
    logger.info(f"Raw results saved to {raw_results_file}")

    # Parsed per-task results.
    parsed_results_file = output_path / "mteb_parsed_results.json"
    with parsed_results_file.open("w") as f:
        json.dump(parsed_results, f, indent=2, default=str)
    logger.info(f"Parsed results saved to {parsed_results_file}")

    # Aggregate statistics plus a JSON summary.
    summary_stats = generate_summary_stats(task_scores)
    summary = {
        "model_name": MODEL_NAME,
        "evaluation_time_seconds": evaluation_time,
        "task_scores": task_scores,
        "summary_stats": summary_stats,
    }
    summary_file = output_path / "mteb_summary.json"
    with summary_file.open("w") as f:
        json.dump(summary, f, indent=2, default=str)
    logger.info(f"Summary saved to {summary_file}")

    # Human-readable text report.
    report_file = output_path / "mteb_report.txt"
    generate_report(output_path, task_scores, summary_stats, evaluation_time)
    logger.info(f"Report saved to {report_file}")
|
|
def generate_summary_stats(task_scores: dict) -> dict:
    """Generate summary statistics from task scores."""
    if not task_scores:
        return {}

    # Collect all numeric, non-NaN dataset scores across models.
    all_scores = []
    for model_data in task_scores.values():
        if isinstance(model_data, dict) and "dataset_scores" in model_data:
            dataset_scores = model_data["dataset_scores"]
            if isinstance(dataset_scores, dict):
                all_scores.extend(
                    [
                        float(score)
                        for score in dataset_scores.values()
                        if isinstance(score, int | float) and str(score).lower() != "nan"
                    ]
                )

    if not all_scores:
        return {}

    import numpy as np

    return {
        "total_datasets": len(all_scores),
        "average_score": float(np.mean(all_scores)),
        "median_score": float(np.median(all_scores)),
        "std_dev": float(np.std(all_scores)),
        "min_score": float(np.min(all_scores)),
        "max_score": float(np.max(all_scores)),
    }
|
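# Example (assumed input shape, inferred from the loop in generate_summary_stats,
# not a documented contract of the evaluation package):
#
#   task_scores = {
#       "gte-Qwen2-7B-instruct-M2V-Distilled": {
#           "dataset_scores": {"STSBenchmark": 0.71, "Banking77Classification": 0.65},
#       },
#   }
#   generate_summary_stats(task_scores)
#   # -> {"total_datasets": 2, "average_score": 0.68, "median_score": 0.68, ...}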
|
def generate_report(output_path: Path, task_scores: dict, summary_stats: dict, evaluation_time: float) -> None:
    """Generate human-readable evaluation report."""
    report_file = output_path / "mteb_report.txt"

    with report_file.open("w") as f:
        f.write("=" * 80 + "\n")
        f.write("MTEB Evaluation Report\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Model: {MODEL_NAME}\n")
        f.write(f"Model Path: {MODEL_PATH}\n")
        f.write(f"Evaluation Time: {evaluation_time:.2f} seconds\n")

        if summary_stats:
            f.write(f"Total Datasets: {summary_stats['total_datasets']}\n\n")
            f.write("Summary Statistics:\n")
            f.write(f"  Average Score: {summary_stats['average_score']:.4f}\n")
            f.write(f"  Median Score: {summary_stats['median_score']:.4f}\n")
            f.write(f"  Standard Deviation: {summary_stats['std_dev']:.4f}\n")
            f.write(f"  Score Range: {summary_stats['min_score']:.4f} - {summary_stats['max_score']:.4f}\n\n")
        else:
            f.write("Summary Statistics: No valid results found\n\n")

        f.write("Detailed Results:\n")
        f.write("-" * 50 + "\n")
        if task_scores:
            leaderboard = make_leaderboard(task_scores)
            f.write(leaderboard.to_string(index=False))
        else:
            f.write("No results available\n")

        f.write("\n\n" + "=" * 80 + "\n")
|
|
def main() -> None:
    """Main evaluation function."""
    logger.info(f"Starting MTEB evaluation for {MODEL_NAME}")
    logger.info(f"Model path: {MODEL_PATH}")
    logger.info(f"Output directory: {OUTPUT_DIR}")
    logger.info("Running focused MTEB evaluation on code-relevant tasks:")
    logger.info("  - Classification: Programming language classification")
    logger.info("  - Clustering: Code clustering by functionality")
    logger.info("  - STS: Semantic similarity between code snippets")
    logger.info("  - Retrieval: Code search and retrieval")

    try:
        run_mteb_evaluation()
        logger.info("Evaluation pipeline completed successfully!")
    except Exception:
        logger.exception("Evaluation failed")
        sys.exit(1)
|
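# Example (hedged): after a run, the aggregate numbers can be read back from the
# summary file written by save_results, e.g.
#
#   python MTEB_evaluate.py
#   python -c "import json; print(json.load(open('mteb_results/mteb_summary.json'))['summary_stats'])"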
|
if __name__ == "__main__":
    main()