#!/usr/bin/env python
"""
MTEB Evaluation Script for Distilled Model - Code-Focused Tasks.
This script evaluates the distilled gte-Qwen2-7B-instruct model using MTEB
(Massive Text Embedding Benchmark) with a focus on tasks relevant for code:
- Classification: Tests ability to distinguish between different categories (e.g., programming languages)
- Clustering: Tests ability to group similar code by functionality
- STS: Tests semantic similarity understanding between code snippets
- Retrieval: Tests code search and duplicate detection capabilities
Features:
- Incremental evaluation: Skips tasks that already have results in mteb_results/
- Combines existing and new results automatically
- Saves results in multiple formats for analysis
Usage:
python MTEB_evaluate.py
Configuration:
- Set EVAL_ALL_TASKS = False to use only CODE_SPECIFIC_TASKS
- Modify CODE_SPECIFIC_TASKS for granular task selection
"""
import json
import logging
import sys
import time
from pathlib import Path
import mteb
from model2vec import StaticModel
from mteb import ModelMeta
from evaluation import (
CustomMTEB,
get_tasks,
make_leaderboard,
parse_mteb_results,
summarize_results,
)
# =============================================================================
# CONFIGURATION CONSTANTS
# =============================================================================
# Model Configuration
MODEL_PATH = "." # Path to the distilled model directory
MODEL_NAME = "gte-Qwen2-7B-instruct-M2V-Distilled" # Name for the model in results
# Evaluation Configuration
OUTPUT_DIR = "mteb_results" # Directory to save evaluation results
EVAL_ALL_TASKS = True
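# If True, evaluate the full task list returned by get_tasks(); the focused CODE_SPECIFIC_TASKS list below is ignored.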
# Specific tasks most relevant for code evaluation (focused selection)
CODE_SPECIFIC_TASKS = [
# Classification - Programming language/category classification
"Banking77Classification", # Fine-grained classification (77 classes)
# Clustering - Code grouping by functionality
"StackExchangeClustering.v2", # Technical Q&A clustering (most relevant)
# STS - Code similarity understanding
"STSBenchmark", # Standard semantic similarity benchmark
# Retrieval - Code search capabilities
"CQADupstackProgrammersRetrieval", # Programming Q&A retrieval
# PairClassification - Duplicate/similar code detection
"SprintDuplicateQuestions", # Duplicate question detection
]
# Evaluation settings
EVAL_SPLITS = ["test"] # Dataset splits to evaluate on
VERBOSITY = 2 # MTEB verbosity level
# =============================================================================
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
def check_existing_results(output_path: Path, tasks: list) -> list:
"""Check for existing task results and filter out completed tasks."""
remaining_tasks = []
completed_tasks = []
for task in tasks:
task_name = task.metadata.name
        # Completed task results are looked up under {output_dir}/{model_name}/{task_name}.json
        result_file = output_path / MODEL_NAME / f"{task_name}.json"
if result_file.exists():
completed_tasks.append(task_name)
logger.info(f"Skipping {task_name} - results already exist")
else:
remaining_tasks.append(task)
if completed_tasks:
logger.info(f"Found existing results for {len(completed_tasks)} tasks: {completed_tasks}")
return remaining_tasks
def load_existing_parsed_results(output_path: Path) -> dict:
"""Load existing parsed results if they exist."""
parsed_results_file = output_path / "mteb_parsed_results.json"
if parsed_results_file.exists():
try:
with parsed_results_file.open("r") as f:
return json.load(f)
except (json.JSONDecodeError, OSError) as e:
logger.warning(f"Could not load existing parsed results: {e}")
return {}
def load_and_display_existing_results(output_path: Path) -> None:
"""Load and display existing MTEB results."""
summary_file = output_path / "mteb_summary.json"
if summary_file.exists():
with summary_file.open("r") as f:
summary = json.load(f)
logger.info("=" * 80)
logger.info("EXISTING MTEB EVALUATION RESULTS:")
logger.info("=" * 80)
stats = summary.get("summary_stats")
if stats:
logger.info(f"Total Datasets: {stats.get('total_datasets', 'N/A')}")
logger.info(f"Average Score: {stats.get('average_score', 0):.4f}")
logger.info(f"Median Score: {stats.get('median_score', 0):.4f}")
logger.info("=" * 80)
else:
logger.info("No existing summary found. Individual task results may still exist.")
def run_mteb_evaluation() -> None:
"""Run MTEB evaluation using the evaluation package."""
output_path = Path(OUTPUT_DIR)
output_path.mkdir(parents=True, exist_ok=True)
logger.info(f"Loading model from {MODEL_PATH}")
model = StaticModel.from_pretrained(MODEL_PATH)
logger.info("Model loaded successfully")
# Set up model metadata for MTEB
model.mteb_model_meta = ModelMeta( # type: ignore[attr-defined]
name=MODEL_NAME, revision="distilled", release_date=None, languages=["eng"]
)
# Get specific code-relevant tasks (focused selection)
logger.info("Getting focused code-relevant MTEB tasks")
logger.info(f"Selected specific tasks: {CODE_SPECIFIC_TASKS}")
if EVAL_ALL_TASKS:
all_tasks = get_tasks()
else:
all_tasks = [mteb.get_task(task_name, languages=["eng"]) for task_name in CODE_SPECIFIC_TASKS]
logger.info(f"Found {len(all_tasks)} total tasks")
# Check for existing results and filter out completed tasks
tasks = check_existing_results(output_path, all_tasks)
logger.info(f"Will evaluate {len(tasks)} remaining tasks")
if not tasks:
logger.info("No new tasks to evaluate - all tasks already completed!")
# Load and display existing results
logger.info("Loading existing results...")
try:
load_and_display_existing_results(output_path)
except (json.JSONDecodeError, OSError, KeyError) as e:
logger.warning(f"Could not load existing results: {e}")
return
# Define the CustomMTEB object with the specified tasks
evaluation = CustomMTEB(tasks=tasks)
# Run the evaluation
logger.info("Starting MTEB evaluation...")
start_time = time.time()
results = evaluation.run(model, eval_splits=EVAL_SPLITS, output_folder=str(output_path), verbosity=VERBOSITY)
end_time = time.time()
evaluation_time = end_time - start_time
logger.info(f"Evaluation completed in {evaluation_time:.2f} seconds")
# Parse the results and summarize them
logger.info("Parsing and summarizing results...")
parsed_results = parse_mteb_results(mteb_results=results, model_name=MODEL_NAME)
# Load existing results if any and combine them
existing_results = load_existing_parsed_results(output_path)
if existing_results:
logger.info("Combining with existing results...")
# Convert to dict for merging
parsed_dict = dict(parsed_results) if hasattr(parsed_results, "items") else {}
        # Simple merge - results from this run take precedence; existing entries only fill in tasks that were not re-evaluated
for key, value in existing_results.items():
if key not in parsed_dict:
parsed_dict[key] = value
parsed_results = parsed_dict
task_scores = summarize_results(parsed_results)
# Save results in different formats
save_results(output_path, results, parsed_results, task_scores, evaluation_time)
# Print the results in a leaderboard format
logger.info("MTEB Evaluation Results:")
logger.info("=" * 80)
leaderboard = make_leaderboard(task_scores) # type: ignore[arg-type]
logger.info(leaderboard.to_string(index=False))
logger.info("=" * 80)
logger.info(f"Evaluation completed successfully. Results saved to {OUTPUT_DIR}")
def save_results(
output_path: Path, raw_results: list, parsed_results: dict, task_scores: dict, evaluation_time: float
) -> None:
"""Save evaluation results in multiple formats."""
# Save raw results
raw_results_file = output_path / "mteb_raw_results.json"
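    # default=str lets json.dump fall back to string representations for objects it cannot serialize (e.g. MTEB result objects)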
with raw_results_file.open("w") as f:
json.dump(raw_results, f, indent=2, default=str)
logger.info(f"Raw results saved to {raw_results_file}")
# Save parsed results
parsed_results_file = output_path / "mteb_parsed_results.json"
with parsed_results_file.open("w") as f:
json.dump(parsed_results, f, indent=2, default=str)
logger.info(f"Parsed results saved to {parsed_results_file}")
# Generate summary statistics
summary_stats = generate_summary_stats(task_scores)
# Save task scores summary
summary = {
"model_name": MODEL_NAME,
"evaluation_time_seconds": evaluation_time,
"task_scores": task_scores,
"summary_stats": summary_stats,
}
summary_file = output_path / "mteb_summary.json"
with summary_file.open("w") as f:
json.dump(summary, f, indent=2, default=str)
logger.info(f"Summary saved to {summary_file}")
# Save human-readable report
report_file = output_path / "mteb_report.txt"
generate_report(output_path, task_scores, summary_stats, evaluation_time)
logger.info(f"Report saved to {report_file}")
def generate_summary_stats(task_scores: dict) -> dict:
"""Generate summary statistics from task scores."""
if not task_scores:
return {}
# Extract all individual dataset scores
all_scores = []
for model_data in task_scores.values():
if isinstance(model_data, dict) and "dataset_scores" in model_data:
dataset_scores = model_data["dataset_scores"]
if isinstance(dataset_scores, dict):
all_scores.extend(
[
float(score)
for score in dataset_scores.values()
if isinstance(score, int | float) and str(score).lower() != "nan"
]
)
if not all_scores:
return {}
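    # Imported locally so numpy is only required once there are scores to aggregate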
import numpy as np
return {
"total_datasets": len(all_scores),
"average_score": float(np.mean(all_scores)),
"median_score": float(np.median(all_scores)),
"std_dev": float(np.std(all_scores)),
"min_score": float(np.min(all_scores)),
"max_score": float(np.max(all_scores)),
}
def generate_report(output_path: Path, task_scores: dict, summary_stats: dict, evaluation_time: float) -> None:
"""Generate human-readable evaluation report."""
report_file = output_path / "mteb_report.txt"
with report_file.open("w") as f:
f.write("=" * 80 + "\n")
f.write("MTEB Evaluation Report\n")
f.write("=" * 80 + "\n\n")
f.write(f"Model: {MODEL_NAME}\n")
f.write(f"Model Path: {MODEL_PATH}\n")
f.write(f"Evaluation Time: {evaluation_time:.2f} seconds\n")
# Write summary stats
if summary_stats:
f.write(f"Total Datasets: {summary_stats['total_datasets']}\n\n")
f.write("Summary Statistics:\n")
f.write(f" Average Score: {summary_stats['average_score']:.4f}\n")
f.write(f" Median Score: {summary_stats['median_score']:.4f}\n")
f.write(f" Standard Deviation: {summary_stats['std_dev']:.4f}\n")
f.write(f" Score Range: {summary_stats['min_score']:.4f} - {summary_stats['max_score']:.4f}\n\n")
else:
f.write("Summary Statistics: No valid results found\n\n")
# Write leaderboard
f.write("Detailed Results:\n")
f.write("-" * 50 + "\n")
if task_scores:
leaderboard = make_leaderboard(task_scores) # type: ignore[arg-type]
f.write(leaderboard.to_string(index=False))
else:
f.write("No results available\n")
f.write("\n\n" + "=" * 80 + "\n")
def main() -> None:
"""Main evaluation function."""
logger.info(f"Starting MTEB evaluation for {MODEL_NAME}")
logger.info(f"Model path: {MODEL_PATH}")
logger.info(f"Output directory: {OUTPUT_DIR}")
logger.info("Running focused MTEB evaluation on code-relevant tasks:")
logger.info(" - Classification: Programming language classification")
logger.info(" - Clustering: Code clustering by functionality")
logger.info(" - STS: Semantic similarity between code snippets")
logger.info(" - Retrieval: Code search and retrieval")
try:
run_mteb_evaluation()
logger.info("Evaluation pipeline completed successfully!")
except Exception:
logger.exception("Evaluation failed")
sys.exit(1)
if __name__ == "__main__":
main()