#!/usr/bin/env python
"""MTEB Evaluation Script for Distilled Model - Code-Focused Tasks.

This script evaluates the distilled gte-Qwen2-7B-instruct model using MTEB
(Massive Text Embedding Benchmark) with a focus on tasks relevant for code:

- Classification: Tests ability to distinguish between different categories
  (e.g., programming languages)
- Clustering: Tests ability to group similar code by functionality
- STS: Tests semantic similarity understanding between code snippets
- Retrieval: Tests code search and duplicate detection capabilities

Features:
- Incremental evaluation: skips tasks that already have results in mteb_results/
- Combines existing and new results automatically
- Saves results in multiple formats for analysis

Usage:
    python MTEB_evaluate.py

Configuration:
- Set EVAL_ALL_TASKS = False to use only CODE_SPECIFIC_TASKS
- Modify CODE_SPECIFIC_TASKS for granular task selection
"""

import json
import logging
import sys
import time
from pathlib import Path

import mteb
from model2vec import StaticModel
from mteb import ModelMeta

from evaluation import (
    CustomMTEB,
    get_tasks,
    make_leaderboard,
    parse_mteb_results,
    summarize_results,
)

# =============================================================================
# CONFIGURATION CONSTANTS
# =============================================================================

# Model Configuration
MODEL_PATH = "."  # Path to the distilled model directory
MODEL_NAME = "gte-Qwen2-7B-instruct-M2V-Distilled"  # Name for the model in results

# Evaluation Configuration
OUTPUT_DIR = "mteb_results"  # Directory to save evaluation results
EVAL_ALL_TASKS = True

# Specific tasks most relevant for code evaluation (focused selection)
CODE_SPECIFIC_TASKS = [
    # Classification - Programming language/category classification
    "Banking77Classification",  # Fine-grained classification (77 classes)
    # Clustering - Code grouping by functionality
    "StackExchangeClustering.v2",  # Technical Q&A clustering (most relevant)
    # STS - Code similarity understanding
    "STSBenchmark",  # Standard semantic similarity benchmark
    # Retrieval - Code search capabilities
    "CQADupstackProgrammersRetrieval",  # Programming Q&A retrieval
    # PairClassification - Duplicate/similar code detection
    "SprintDuplicateQuestions",  # Duplicate question detection
]

# Evaluation settings
EVAL_SPLITS = ["test"]  # Dataset splits to evaluate on
VERBOSITY = 2  # MTEB verbosity level

# =============================================================================

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


def check_existing_results(output_path: Path, tasks: list) -> list:
    """Check for existing task results and filter out completed tasks."""
    remaining_tasks = []
    completed_tasks = []

    for task in tasks:
        task_name = task.metadata.name
        # Per-task results are expected under {output_dir}/{model_name}/{task_name}.json
        result_file = output_path / MODEL_NAME / f"{task_name}.json"

        if result_file.exists():
            completed_tasks.append(task_name)
            logger.info(f"Skipping {task_name} - results already exist")
        else:
            remaining_tasks.append(task)

    if completed_tasks:
        logger.info(f"Found existing results for {len(completed_tasks)} tasks: {completed_tasks}")

    return remaining_tasks


def load_existing_parsed_results(output_path: Path) -> dict:
    """Load existing parsed results if they exist."""
    parsed_results_file = output_path / "mteb_parsed_results.json"
    if parsed_results_file.exists():
        try:
            with parsed_results_file.open("r") as f:
                return json.load(f)
        except (json.JSONDecodeError, OSError) as e:
            logger.warning(f"Could not load existing parsed results: {e}")
    return {}


def load_and_display_existing_results(output_path: Path) -> None:
    """Load and display existing MTEB results."""
    summary_file = output_path / "mteb_summary.json"
    if summary_file.exists():
        with summary_file.open("r") as f:
            summary = json.load(f)

        logger.info("=" * 80)
        logger.info("EXISTING MTEB EVALUATION RESULTS:")
        logger.info("=" * 80)

        stats = summary.get("summary_stats")
        if stats:
            logger.info(f"Total Datasets: {stats.get('total_datasets', 'N/A')}")
            logger.info(f"Average Score: {stats.get('average_score', 0):.4f}")
            logger.info(f"Median Score: {stats.get('median_score', 0):.4f}")
        logger.info("=" * 80)
    else:
        logger.info("No existing summary found. Individual task results may still exist.")


def run_mteb_evaluation() -> None:
    """Run MTEB evaluation using the evaluation package."""
    output_path = Path(OUTPUT_DIR)
    output_path.mkdir(parents=True, exist_ok=True)

    logger.info(f"Loading model from {MODEL_PATH}")
    model = StaticModel.from_pretrained(MODEL_PATH)
    logger.info("Model loaded successfully")

    # Set up model metadata for MTEB
    model.mteb_model_meta = ModelMeta(  # type: ignore[attr-defined]
        name=MODEL_NAME, revision="distilled", release_date=None, languages=["eng"]
    )

    # Select tasks: either the full MTEB suite or the focused code-relevant list
    if EVAL_ALL_TASKS:
        logger.info("Getting all MTEB tasks")
        all_tasks = get_tasks()
    else:
        logger.info("Getting focused code-relevant MTEB tasks")
        logger.info(f"Selected specific tasks: {CODE_SPECIFIC_TASKS}")
        all_tasks = [mteb.get_task(task_name, languages=["eng"]) for task_name in CODE_SPECIFIC_TASKS]
    logger.info(f"Found {len(all_tasks)} total tasks")

    # Check for existing results and filter out completed tasks
    tasks = check_existing_results(output_path, all_tasks)
    logger.info(f"Will evaluate {len(tasks)} remaining tasks")

    if not tasks:
        logger.info("No new tasks to evaluate - all tasks already completed!")
        # Load and display existing results
        logger.info("Loading existing results...")
        try:
            load_and_display_existing_results(output_path)
        except (json.JSONDecodeError, OSError, KeyError) as e:
            logger.warning(f"Could not load existing results: {e}")
        return

    # Define the CustomMTEB object with the specified tasks
    evaluation = CustomMTEB(tasks=tasks)

    # Run the evaluation
    logger.info("Starting MTEB evaluation...")
    start_time = time.time()
    results = evaluation.run(model, eval_splits=EVAL_SPLITS, output_folder=str(output_path), verbosity=VERBOSITY)
    end_time = time.time()

    evaluation_time = end_time - start_time
    logger.info(f"Evaluation completed in {evaluation_time:.2f} seconds")

    # Parse the results and summarize them
    logger.info("Parsing and summarizing results...")
    parsed_results = parse_mteb_results(mteb_results=results, model_name=MODEL_NAME)

    # Load existing results if any and combine them
    existing_results = load_existing_parsed_results(output_path)
    if existing_results:
        logger.info("Combining with existing results...")
        # Convert to dict for merging
        parsed_dict = dict(parsed_results) if hasattr(parsed_results, "items") else {}
        # Simple merge - existing results take precedence to avoid overwriting
        for key, value in existing_results.items():
            if key not in parsed_dict:
                parsed_dict[key] = value
        parsed_results = parsed_dict

    task_scores = summarize_results(parsed_results)

    # Save results in different formats
    save_results(output_path, results, parsed_results, task_scores, evaluation_time)

    # Print the results in a leaderboard format
    logger.info("MTEB Evaluation Results:")
    logger.info("=" * 80)
    leaderboard = make_leaderboard(task_scores)  # type: ignore[arg-type]
    logger.info(leaderboard.to_string(index=False))
    logger.info("=" * 80)

    logger.info(f"Evaluation completed successfully. Results saved to {OUTPUT_DIR}")


def save_results(
    output_path: Path, raw_results: list, parsed_results: dict, task_scores: dict, evaluation_time: float
) -> None:
    """Save evaluation results in multiple formats."""
    # Save raw results
    raw_results_file = output_path / "mteb_raw_results.json"
    with raw_results_file.open("w") as f:
        json.dump(raw_results, f, indent=2, default=str)
    logger.info(f"Raw results saved to {raw_results_file}")

    # Save parsed results
    parsed_results_file = output_path / "mteb_parsed_results.json"
    with parsed_results_file.open("w") as f:
        json.dump(parsed_results, f, indent=2, default=str)
    logger.info(f"Parsed results saved to {parsed_results_file}")

    # Generate summary statistics
    summary_stats = generate_summary_stats(task_scores)

    # Save task scores summary
    summary = {
        "model_name": MODEL_NAME,
        "evaluation_time_seconds": evaluation_time,
        "task_scores": task_scores,
        "summary_stats": summary_stats,
    }
    summary_file = output_path / "mteb_summary.json"
    with summary_file.open("w") as f:
        json.dump(summary, f, indent=2, default=str)
    logger.info(f"Summary saved to {summary_file}")

    # Save human-readable report
    report_file = output_path / "mteb_report.txt"
    generate_report(output_path, task_scores, summary_stats, evaluation_time)
    logger.info(f"Report saved to {report_file}")


def generate_summary_stats(task_scores: dict) -> dict:
    """Generate summary statistics from task scores."""
    if not task_scores:
        return {}

    # Extract all individual dataset scores
    all_scores = []
    for model_data in task_scores.values():
        if isinstance(model_data, dict) and "dataset_scores" in model_data:
            dataset_scores = model_data["dataset_scores"]
            if isinstance(dataset_scores, dict):
                all_scores.extend(
                    [
                        float(score)
                        for score in dataset_scores.values()
                        if isinstance(score, int | float) and str(score).lower() != "nan"
                    ]
                )

    if not all_scores:
        return {}

    import numpy as np

    return {
        "total_datasets": len(all_scores),
        "average_score": float(np.mean(all_scores)),
        "median_score": float(np.median(all_scores)),
        "std_dev": float(np.std(all_scores)),
        "min_score": float(np.min(all_scores)),
        "max_score": float(np.max(all_scores)),
    }


def generate_report(output_path: Path, task_scores: dict, summary_stats: dict, evaluation_time: float) -> None:
    """Generate human-readable evaluation report."""
    report_file = output_path / "mteb_report.txt"

    with report_file.open("w") as f:
        f.write("=" * 80 + "\n")
        f.write("MTEB Evaluation Report\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Model: {MODEL_NAME}\n")
        f.write(f"Model Path: {MODEL_PATH}\n")
        f.write(f"Evaluation Time: {evaluation_time:.2f} seconds\n")

        # Write summary stats
        if summary_stats:
            f.write(f"Total Datasets: {summary_stats['total_datasets']}\n\n")
            f.write("Summary Statistics:\n")
            f.write(f"  Average Score: {summary_stats['average_score']:.4f}\n")
            f.write(f"  Median Score: {summary_stats['median_score']:.4f}\n")
            f.write(f"  Standard Deviation: {summary_stats['std_dev']:.4f}\n")
            f.write(f"  Score Range: {summary_stats['min_score']:.4f} - {summary_stats['max_score']:.4f}\n\n")
        else:
            f.write("Summary Statistics: No valid results found\n\n")

        # Write leaderboard
        f.write("Detailed Results:\n")
        f.write("-" * 50 + "\n")
        if task_scores:
            leaderboard = make_leaderboard(task_scores)  # type: ignore[arg-type]
            f.write(leaderboard.to_string(index=False))
        else:
            f.write("No results available\n")

        f.write("\n\n" + "=" * 80 + "\n")


def main() -> None:
    """Main evaluation function."""
    logger.info(f"Starting MTEB evaluation for {MODEL_NAME}")
    logger.info(f"Model path: {MODEL_PATH}")
    logger.info(f"Output directory: {OUTPUT_DIR}")
    logger.info("Running focused MTEB evaluation on code-relevant tasks:")
    logger.info(" - Classification: Programming language classification")
    logger.info(" - Clustering: Code clustering by functionality")
    logger.info(" - STS: Semantic similarity between code snippets")
    logger.info(" - Retrieval: Code search and retrieval")

    try:
        run_mteb_evaluation()
        logger.info("Evaluation pipeline completed successfully!")
    except Exception:
        logger.exception("Evaluation failed")
        sys.exit(1)


if __name__ == "__main__":
    main()
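
# -----------------------------------------------------------------------------
# Optional sanity check (a hypothetical sketch, not part of the pipeline above):
# before launching a long MTEB run it can help to confirm that the distilled
# model loads and produces embeddings. This assumes the model2vec StaticModel
# API (`from_pretrained` / `encode`); adjust if your installed version differs.
#
#     from model2vec import StaticModel
#
#     model = StaticModel.from_pretrained(MODEL_PATH)
#     vectors = model.encode(["def add(a, b): return a + b", "print('hello')"])
#     print(vectors.shape)  # expected: (2, embedding_dim)
# -----------------------------------------------------------------------------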