from pathlib import Path

from leaderboard_tab import (
    create_leaderboard_tab,
    search_leaderboard,
    update_columns_to_show,
)
from utils import load_json_results

# Constants
RETRIEVAL_ABOUT_SECTION = """
## About Retrieval Evaluation

The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on:

### Web Search Dataset Metrics
- **MRR (Mean Reciprocal Rank)**: Measures ranking quality based on the position of the first relevant result
- **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates ranking quality considering all relevant results
- **Recall@5**: Measures the proportion of relevant documents found in the top 5 results
- **Overall Score**: The average of MRR, nDCG, and Recall@5

### Model Requirements
- Must support Arabic text embeddings
- Should handle queries of at least 512 tokens
- Must work with the `sentence-transformers` library

### Evaluation Process
1. Models process Arabic web search queries
2. Retrieved documents are evaluated using:
   - MRR for the position of the first relevant result
   - nDCG for overall ranking quality
   - Recall@5 for accuracy of the top results
3. The three metrics are averaged to calculate the overall score
4. Models are ranked by their overall score

### How to Prepare Your Model
- Ensure your model is publicly available on the HuggingFace Hub (we don't support private model evaluations yet)
- The model should output fixed-dimension embeddings for text
- Support batch processing for efficient evaluation (this is the default if you use `sentence-transformers`)
"""

# Global variables
retrieval_df = None


def load_retrieval_leaderboard():
    """Load and prepare the retrieval leaderboard data."""
    global retrieval_df

    # Prepare the retrieval dataframe from the results JSON, sorted by
    # "Average Score" with metadata columns dropped.
    dataframe_path = Path(__file__).parent / "results" / "retrieval_results.json"
    retrieval_df = load_json_results(
        dataframe_path, True, "Average Score", drop_cols=["Revision", "Task"]
    )
    # Prepend a 1-based "Rank" column reflecting the sorted order.
    retrieval_df.insert(0, "Rank", range(1, 1 + len(retrieval_df)))
    return retrieval_df


def retrieval_search_leaderboard(model_name, columns_to_show):
    """Search function for the retrieval leaderboard."""
    return search_leaderboard(retrieval_df, model_name, columns_to_show)


def update_retrieval_columns_to_show(columns_to_show):
    """Update the displayed columns for the retrieval leaderboard."""
    return update_columns_to_show(retrieval_df, columns_to_show)


def create_retrieval_tab():
    """Create the complete retrieval leaderboard tab."""
    global retrieval_df

    # Load data if not already loaded
    if retrieval_df is None:
        retrieval_df = load_retrieval_leaderboard()

    # Define default columns to show
    default_columns = [
        "Rank",
        "Model",
        "Average Score",
        "Model Size (MB)",
        "Context Length",
        "Embedding Dimension",
        "Web Search Dataset",
        "Islamic Knowledge Dataset",
    ]

    # Create and return the tab
    return create_leaderboard_tab(
        df=retrieval_df,
        initial_columns_to_show=default_columns,
        search_function=retrieval_search_leaderboard,
        update_function=update_retrieval_columns_to_show,
        about_section=RETRIEVAL_ABOUT_SECTION,
        task_type="Retriever",
    )
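

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the leaderboard code): RETRIEVAL_ABOUT_SECTION
# describes the "Overall Score" as the plain average of MRR, nDCG, and Recall@5.
# The helper below is hypothetical and assumes all three metrics are already
# computed on a 0-1 scale.
# ---------------------------------------------------------------------------
def _example_overall_score(mrr: float, ndcg: float, recall_at_5: float) -> float:
    """Average the three web-search metrics into a single overall score."""
    return (mrr + ndcg + recall_at_5) / 3


# For example, a model with MRR=0.82, nDCG=0.78, and Recall@5=0.90 would get
# _example_overall_score(0.82, 0.78, 0.90) == 0.8333...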
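

# ---------------------------------------------------------------------------
# Illustrative sketch of the model requirements above: a submission should load
# with the `sentence-transformers` library and produce fixed-dimension
# embeddings via batched encoding. The model id and queries are placeholders,
# not a real submission; this check is not part of the evaluation pipeline.
# ---------------------------------------------------------------------------
def _example_compatibility_check(model_id: str = "your-org/your-arabic-model"):
    """Verify that a candidate model loads and embeds Arabic text in batches."""
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer(model_id)
    # encode() batches internally; the output has one fixed-dimension row per query.
    embeddings = model.encode(["ما هي عاصمة مصر؟", "أين يقع نهر النيل؟"], batch_size=32)
    return embeddings.shape  # (num_queries, embedding_dimension)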
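

if __name__ == "__main__":
    # Quick local sanity check (assumes results/retrieval_results.json exists
    # next to this file): load the leaderboard and print the top entries.
    df = load_retrieval_leaderboard()
    print(df.head(10))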