#!/usr/bin/env python
"""
MTEB Evaluation Script for Distilled Model - Code-Focused Tasks.
This script evaluates the distilled gte-Qwen2-7B-instruct model using MTEB
(Massive Text Embedding Benchmark) with a focus on tasks relevant for code:
- Classification: Tests ability to distinguish between different categories (e.g., programming languages)
- Clustering: Tests ability to group similar code by functionality
- STS: Tests semantic similarity understanding between code snippets
- Retrieval: Tests code search and duplicate detection capabilities
Features:
- Incremental evaluation: Skips tasks that already have results in mteb_results/
- Combines existing and new results automatically
- Saves results in multiple formats for analysis
Usage:
python MTEB_evaluate.py
Configuration:
- Set EVAL_ALL_TASKS = False to use only CODE_SPECIFIC_TASKS
- Modify CODE_SPECIFIC_TASKS for granular task selection
"""
import json
import logging
import sys
import time
from pathlib import Path
import mteb
from model2vec import StaticModel
from mteb import ModelMeta
from evaluation import (
CustomMTEB,
get_tasks,
make_leaderboard,
parse_mteb_results,
summarize_results,
)
# =============================================================================
# CONFIGURATION CONSTANTS
# =============================================================================
# Model Configuration
MODEL_PATH = "." # Path to the distilled model directory
MODEL_NAME = "gte-Qwen2-7B-instruct-M2V-Distilled" # Name for the model in results
# Evaluation Configuration
OUTPUT_DIR = "mteb_results" # Directory to save evaluation results
EVAL_ALL_TASKS = True
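# If True, evaluate the full task list returned by get_tasks(); the focused CODE_SPECIFIC_TASKS list below is ignored.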
# Specific tasks most relevant for code evaluation (focused selection)
CODE_SPECIFIC_TASKS = [
# Classification - Programming language/category classification
"Banking77Classification", # Fine-grained classification (77 classes)
# Clustering - Code grouping by functionality
"StackExchangeClustering.v2", # Technical Q&A clustering (most relevant)
# STS - Code similarity understanding
"STSBenchmark", # Standard semantic similarity benchmark
# Retrieval - Code search capabilities
"CQADupstackProgrammersRetrieval", # Programming Q&A retrieval
# PairClassification - Duplicate/similar code detection
"SprintDuplicateQuestions", # Duplicate question detection
]
# Evaluation settings
EVAL_SPLITS = ["test"] # Dataset splits to evaluate on
VERBOSITY = 2 # MTEB verbosity level
# =============================================================================
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
def check_existing_results(output_path: Path, tasks: list) -> list:
"""Check for existing task results and filter out completed tasks."""
remaining_tasks = []
completed_tasks = []
for task in tasks:
task_name = task.metadata.name
        # Completed task results are looked up under {output_dir}/{model_name}/{task_name}.json
        result_file = output_path / MODEL_NAME / f"{task_name}.json"
if result_file.exists():
completed_tasks.append(task_name)
logger.info(f"Skipping {task_name} - results already exist")
else:
remaining_tasks.append(task)
if completed_tasks:
logger.info(f"Found existing results for {len(completed_tasks)} tasks: {completed_tasks}")
return remaining_tasks
def load_existing_parsed_results(output_path: Path) -> dict:
"""Load existing parsed results if they exist."""
parsed_results_file = output_path / "mteb_parsed_results.json"
if parsed_results_file.exists():
try:
with parsed_results_file.open("r") as f:
return json.load(f)
except (json.JSONDecodeError, OSError) as e:
logger.warning(f"Could not load existing parsed results: {e}")
return {}
def load_and_display_existing_results(output_path: Path) -> None:
"""Load and display existing MTEB results."""
summary_file = output_path / "mteb_summary.json"
if summary_file.exists():
with summary_file.open("r") as f:
summary = json.load(f)
logger.info("=" * 80)
logger.info("EXISTING MTEB EVALUATION RESULTS:")
logger.info("=" * 80)
stats = summary.get("summary_stats")
if stats:
logger.info(f"Total Datasets: {stats.get('total_datasets', 'N/A')}")
logger.info(f"Average Score: {stats.get('average_score', 0):.4f}")
logger.info(f"Median Score: {stats.get('median_score', 0):.4f}")
logger.info("=" * 80)
else:
logger.info("No existing summary found. Individual task results may still exist.")
def run_mteb_evaluation() -> None:
"""Run MTEB evaluation using the evaluation package."""
output_path = Path(OUTPUT_DIR)
output_path.mkdir(parents=True, exist_ok=True)
logger.info(f"Loading model from {MODEL_PATH}")
model = StaticModel.from_pretrained(MODEL_PATH)
logger.info("Model loaded successfully")
# Set up model metadata for MTEB
model.mteb_model_meta = ModelMeta( # type: ignore[attr-defined]
name=MODEL_NAME, revision="distilled", release_date=None, languages=["eng"]
)
# Get specific code-relevant tasks (focused selection)
logger.info("Getting focused code-relevant MTEB tasks")
logger.info(f"Selected specific tasks: {CODE_SPECIFIC_TASKS}")
if EVAL_ALL_TASKS:
all_tasks = get_tasks()
else:
all_tasks = [mteb.get_task(task_name, languages=["eng"]) for task_name in CODE_SPECIFIC_TASKS]
logger.info(f"Found {len(all_tasks)} total tasks")
# Check for existing results and filter out completed tasks
tasks = check_existing_results(output_path, all_tasks)
logger.info(f"Will evaluate {len(tasks)} remaining tasks")
if not tasks:
logger.info("No new tasks to evaluate - all tasks already completed!")
# Load and display existing results
logger.info("Loading existing results...")
try:
load_and_display_existing_results(output_path)
except (json.JSONDecodeError, OSError, KeyError) as e:
logger.warning(f"Could not load existing results: {e}")
return
# Define the CustomMTEB object with the specified tasks
evaluation = CustomMTEB(tasks=tasks)
# Run the evaluation
logger.info("Starting MTEB evaluation...")
start_time = time.time()
results = evaluation.run(model, eval_splits=EVAL_SPLITS, output_folder=str(output_path), verbosity=VERBOSITY)
end_time = time.time()
evaluation_time = end_time - start_time
logger.info(f"Evaluation completed in {evaluation_time:.2f} seconds")
# Parse the results and summarize them
logger.info("Parsing and summarizing results...")
parsed_results = parse_mteb_results(mteb_results=results, model_name=MODEL_NAME)
# Load existing results if any and combine them
existing_results = load_existing_parsed_results(output_path)
if existing_results:
logger.info("Combining with existing results...")
# Convert to dict for merging
parsed_dict = dict(parsed_results) if hasattr(parsed_results, "items") else {}
        # Simple merge - results from this run take precedence; existing entries only fill in tasks that were not re-evaluated
for key, value in existing_results.items():
if key not in parsed_dict:
parsed_dict[key] = value
parsed_results = parsed_dict
task_scores = summarize_results(parsed_results)
# Save results in different formats
save_results(output_path, results, parsed_results, task_scores, evaluation_time)
# Print the results in a leaderboard format
logger.info("MTEB Evaluation Results:")
logger.info("=" * 80)
leaderboard = make_leaderboard(task_scores) # type: ignore[arg-type]
logger.info(leaderboard.to_string(index=False))
logger.info("=" * 80)
logger.info(f"Evaluation completed successfully. Results saved to {OUTPUT_DIR}")
def save_results(
output_path: Path, raw_results: list, parsed_results: dict, task_scores: dict, evaluation_time: float
) -> None:
"""Save evaluation results in multiple formats."""
# Save raw results
raw_results_file = output_path / "mteb_raw_results.json"
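    # default=str lets json.dump fall back to string representations for objects it cannot serialize (e.g. MTEB result objects)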
with raw_results_file.open("w") as f:
json.dump(raw_results, f, indent=2, default=str)
logger.info(f"Raw results saved to {raw_results_file}")
# Save parsed results
parsed_results_file = output_path / "mteb_parsed_results.json"
with parsed_results_file.open("w") as f:
json.dump(parsed_results, f, indent=2, default=str)
logger.info(f"Parsed results saved to {parsed_results_file}")
# Generate summary statistics
summary_stats = generate_summary_stats(task_scores)
# Save task scores summary
summary = {
"model_name": MODEL_NAME,
"evaluation_time_seconds": evaluation_time,
"task_scores": task_scores,
"summary_stats": summary_stats,
}
summary_file = output_path / "mteb_summary.json"
with summary_file.open("w") as f:
json.dump(summary, f, indent=2, default=str)
logger.info(f"Summary saved to {summary_file}")
# Save human-readable report
report_file = output_path / "mteb_report.txt"
generate_report(output_path, task_scores, summary_stats, evaluation_time)
logger.info(f"Report saved to {report_file}")
def generate_summary_stats(task_scores: dict) -> dict:
"""Generate summary statistics from task scores."""
if not task_scores:
return {}
# Extract all individual dataset scores
all_scores = []
for model_data in task_scores.values():
if isinstance(model_data, dict) and "dataset_scores" in model_data:
dataset_scores = model_data["dataset_scores"]
if isinstance(dataset_scores, dict):
all_scores.extend(
[
float(score)
for score in dataset_scores.values()
if isinstance(score, int | float) and str(score).lower() != "nan"
]
)
if not all_scores:
return {}
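    # Imported locally so numpy is only required once there are scores to aggregate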
import numpy as np
return {
"total_datasets": len(all_scores),
"average_score": float(np.mean(all_scores)),
"median_score": float(np.median(all_scores)),
"std_dev": float(np.std(all_scores)),
"min_score": float(np.min(all_scores)),
"max_score": float(np.max(all_scores)),
}
def generate_report(output_path: Path, task_scores: dict, summary_stats: dict, evaluation_time: float) -> None:
"""Generate human-readable evaluation report."""
report_file = output_path / "mteb_report.txt"
with report_file.open("w") as f:
f.write("=" * 80 + "\n")
f.write("MTEB Evaluation Report\n")
f.write("=" * 80 + "\n\n")
f.write(f"Model: {MODEL_NAME}\n")
f.write(f"Model Path: {MODEL_PATH}\n")
f.write(f"Evaluation Time: {evaluation_time:.2f} seconds\n")
# Write summary stats
if summary_stats:
f.write(f"Total Datasets: {summary_stats['total_datasets']}\n\n")
f.write("Summary Statistics:\n")
f.write(f" Average Score: {summary_stats['average_score']:.4f}\n")
f.write(f" Median Score: {summary_stats['median_score']:.4f}\n")
f.write(f" Standard Deviation: {summary_stats['std_dev']:.4f}\n")
f.write(f" Score Range: {summary_stats['min_score']:.4f} - {summary_stats['max_score']:.4f}\n\n")
else:
f.write("Summary Statistics: No valid results found\n\n")
# Write leaderboard
f.write("Detailed Results:\n")
f.write("-" * 50 + "\n")
if task_scores:
leaderboard = make_leaderboard(task_scores) # type: ignore[arg-type]
f.write(leaderboard.to_string(index=False))
else:
f.write("No results available\n")
f.write("\n\n" + "=" * 80 + "\n")
def main() -> None:
"""Main evaluation function."""
logger.info(f"Starting MTEB evaluation for {MODEL_NAME}")
logger.info(f"Model path: {MODEL_PATH}")
logger.info(f"Output directory: {OUTPUT_DIR}")
logger.info("Running focused MTEB evaluation on code-relevant tasks:")
logger.info(" - Classification: Programming language classification")
logger.info(" - Clustering: Code clustering by functionality")
logger.info(" - STS: Semantic similarity between code snippets")
logger.info(" - Retrieval: Code search and retrieval")
try:
run_mteb_evaluation()
logger.info("Evaluation pipeline completed successfully!")
except Exception:
logger.exception("Evaluation failed")
sys.exit(1)
if __name__ == "__main__":
main()