Files changed (13)
  1. .gitattributes +31 -0
  2. .gitignore +1 -5
  3. DESCRIPTION.md +1 -0
  4. Dockerfile +0 -19
  5. EXTERNAL_MODEL_RESULTS.json +0 -0
  6. README.md +5 -11
  7. app.py +674 -0
  8. config.yaml +389 -0
  9. envs.py +48 -0
  10. model_meta.yaml +1327 -0
  11. requirements.txt +5 -0
  12. utils/__init__.py +0 -0
  13. utils/model_size.py +43 -0
.gitattributes ADDED
@@ -0,0 +1,31 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED
@@ -1,6 +1,2 @@
  *.pyc
- model_infos.json
- space
- .venv
- results
- mteb
+ model_infos.json

DESCRIPTION.md ADDED
@@ -0,0 +1 @@
+ Massive Text Embedding Benchmark (MTEB) Leaderboard.

Dockerfile DELETED
@@ -1,19 +0,0 @@
- FROM python:3.12-bookworm
-
- RUN apt update && apt install -y git make
- RUN useradd -m -u 1000 user
- ENV PATH="/home/user/.local/bin:$PATH"
-
- RUN git clone https://github.com/embeddings-benchmark/mteb.git
- RUN chown -R user:user /mteb
-
- USER user
- WORKDIR /mteb
-
- RUN pip install "pydantic<2.11"
- RUN pip install ".[leaderboard]"
- # ENV XDG_CACHE_HOME=/home/user/.cache
- ENV GRADIO_SERVER_NAME="0.0.0.0"
- EXPOSE 7860
-
- CMD ["make", "run-leaderboard"]

EXTERNAL_MODEL_RESULTS.json ADDED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -1,20 +1,14 @@
  ---
- title: MTEB Leaderboard
+ title: MTEB Leaderboard
  emoji: 🥇
  colorFrom: blue
  colorTo: indigo
- sdk: docker
- app_port: 7860
+ sdk: gradio
+ sdk_version: 4.20.0
  app_file: app.py
- pinned: true
+ pinned: false
  tags:
- - leaderboard
+ - leaderboard
  startup_duration_timeout: 1h
  fullWidth: true
- license: mit
- short_description: Embedding Leaderboard
  ---
-
- # MTEB Leaderboard
-
- Embedding Leaderboard

app.py ADDED
@@ -0,0 +1,674 @@
+ from functools import partial, reduce
+ import json
+ import os
+ import re
+
+ from datasets import load_dataset
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+ from huggingface_hub.repocard import metadata_load
+ import pandas as pd
+ from tqdm.autonotebook import tqdm
+
+ from utils.model_size import get_model_parameters_memory
+ from envs import LEADERBOARD_CONFIG, MODEL_META, REPO_ID, RESULTS_REPO, API
+
+ TASKS_CONFIG = LEADERBOARD_CONFIG["tasks"]
+ BOARDS_CONFIG = LEADERBOARD_CONFIG["boards"]
+
+ TASKS = list(TASKS_CONFIG.keys())
+ PRETTY_NAMES = {
+     "InstructionRetrieval": "Retrieval w/Instructions",
+     "PairClassification": "Pair Classification",
+     "BitextMining": "Bitext Mining",
+ }
+
+ TASK_TO_METRIC = {k: v["metric"] for k, v in TASKS_CONFIG.items()}
+
+ def make_clickable_model(model_name, link=None):
+     if link is None:
+         link = "https://huggingface.co/" + model_name
+     # Remove user from model name
+     return (
+         f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
+     )
+
+ EXTERNAL_MODELS = {k for k, v in MODEL_META["model_meta"].items() if v.get("is_external", False)}
+ EXTERNAL_MODEL_TO_LINK = {k: v["link"] for k, v in MODEL_META["model_meta"].items() if v.get("link", False)}
+ EXTERNAL_MODEL_TO_DIM = {k: v["dim"] for k, v in MODEL_META["model_meta"].items() if v.get("dim", False)}
+ EXTERNAL_MODEL_TO_SEQLEN = {k: v["seq_len"] for k, v in MODEL_META["model_meta"].items() if v.get("seq_len", False)}
+ EXTERNAL_MODEL_TO_SIZE = {k: v["size"] for k, v in MODEL_META["model_meta"].items() if v.get("size", False)}
+ PROPRIETARY_MODELS = {k for k, v in MODEL_META["model_meta"].items() if v.get("is_proprietary", False)}
+ TASK_DESCRIPTIONS = {k: v["task_description"] for k, v in TASKS_CONFIG.items()}
+ TASK_DESCRIPTIONS["Overall"] = "Overall performance across MTEB tasks."
+ SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {k for k, v in MODEL_META["model_meta"].items() if v.get("is_sentence_transformers_compatible", False)}
+ MODELS_TO_SKIP = MODEL_META["models_to_skip"]
+ CROSS_ENCODERS = MODEL_META["cross_encoders"]
+ BI_ENCODERS = [k for k, _ in MODEL_META["model_meta"].items() if k not in CROSS_ENCODERS + ["bm25"]]
+
+ PROPRIETARY_MODELS = {
+     make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
+     for model in PROPRIETARY_MODELS
+ }
+ SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {
+     make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
+     for model in SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS
+ }
+ CROSS_ENCODERS = {
+     make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
+     for model in CROSS_ENCODERS
+ }
+ BI_ENCODERS = {
+     make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
+     for model in BI_ENCODERS
+ }
+
+
+ TASK_TO_TASK_TYPE = {task_category: [] for task_category in TASKS}
+ for board_config in BOARDS_CONFIG.values():
+     for task_category, task_list in board_config["tasks"].items():
+         TASK_TO_TASK_TYPE[task_category].extend(task_list)
+
+ def add_lang(examples):
+     if not examples["eval_language"]:
+         examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
+     else:
+         examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"] + f' ({examples["eval_language"]})'
+     return examples
+
+ def norm(names): return set([name.split(" ")[0] for name in names])
+
+ def add_task(examples):
+     # Could be added to the dataset loading script instead
+     task_name = examples["mteb_dataset_name"]
+     task_type = None
+     for task_category, task_list in TASK_TO_TASK_TYPE.items():
+         if task_name in norm(task_list):
+             task_type = task_category
+             break
+     if task_type is not None:
+         examples["mteb_task"] = task_type
+     else:
+         print("WARNING: Task not found for dataset", examples["mteb_dataset_name"])
+         examples["mteb_task"] = "Unknown"
+     return examples
+
+ if os.path.exists("EXTERNAL_MODEL_RESULTS.json"):
+     with open("EXTERNAL_MODEL_RESULTS.json") as f:
+         EXTERNAL_MODEL_RESULTS = json.load(f)
+     # Update with models not yet contained in the cached results
+     models_to_run = []
+     for model in EXTERNAL_MODELS:
+         if model not in EXTERNAL_MODEL_RESULTS:
+             models_to_run.append(model)
+             EXTERNAL_MODEL_RESULTS[model] = {k: {v: []} for k, v in TASK_TO_METRIC.items()}
+ else:
+     EXTERNAL_MODEL_RESULTS = {model: {k: {v: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
+     models_to_run = EXTERNAL_MODELS
+
+ pbar = tqdm(models_to_run, desc="Fetching external model results")
+ for model in pbar:
+     pbar.set_description(f"Fetching external model results for {model!r}")
+     ds = load_dataset(RESULTS_REPO, model, trust_remote_code=True)
+     # For local debugging:
+     #, download_mode='force_redownload', verification_mode="no_checks")
+     ds = ds.map(add_lang)
+     ds = ds.map(add_task)
+     base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
+     # For now only one metric per task - could add more metrics later on
+     for task, metric in TASK_TO_METRIC.items():
+         ds_dict = ds.filter(lambda x: (x["mteb_task"] == task) and (x["metric"] == metric))["test"].to_dict()
+         ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
+         EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
+
+ # Save & cache EXTERNAL_MODEL_RESULTS
+ with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
+     json.dump(EXTERNAL_MODEL_RESULTS, f)
+
+ def get_dim_seq_size(model):
+     filenames = [sib.rfilename for sib in model.siblings]
+     dim, seq = "", ""
+     for filename in filenames:
+         if re.match(r"\d+_Pooling/config.json", filename):
+             st_config_path = hf_hub_download(model.modelId, filename=filename)
+             dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
+             break
+     for filename in filenames:
+         if re.match(r"\d+_Dense/config.json", filename):
+             st_config_path = hf_hub_download(model.modelId, filename=filename)
+             dim = json.load(open(st_config_path)).get("out_features", dim)
+     if "config.json" in filenames:
+         config_path = hf_hub_download(model.modelId, filename="config.json")
+         config = json.load(open(config_path))
+         if not dim:
+             dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
+         seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
+     # Get model file size without downloading. Parameters in millions of parameters and memory in GB
+     parameters, memory = get_model_parameters_memory(model)
+     return dim, seq, parameters, memory
+
+ def make_datasets_clickable(df):
+     """Turn the BornholmBitextMining column header into a link to its dataset."""
+     if "BornholmBitextMining" in df.columns:
+         link = "https://huggingface.co/datasets/strombergnlp/bornholmsk_parallel"
+         df = df.rename(
+             columns={"BornholmBitextMining": f'<a target="_blank" style="text-decoration: underline" href="{link}">BornholmBitextMining</a>'})
+     return df
+
+ def add_rank(df):
+     cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens"]]
+     if len(cols_to_rank) == 1:
+         df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
+     else:
+         df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
+         df.sort_values("Average", ascending=False, inplace=True)
+     df.insert(0, "Rank", list(range(1, len(df) + 1)))
+     df = df.round(2)
+     # Fill NaN after averaging
+     df.fillna("", inplace=True)
+     return df
+
+ model_infos_path = "model_infos.json"
+ MODEL_INFOS = {}
+ if os.path.exists(model_infos_path):
+     with open(model_infos_path) as f:
+         MODEL_INFOS = json.load(f)
+
+ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True, refresh=True):
+     global MODEL_INFOS
+     api = API
+     models = api.list_models(filter="mteb")
+     # Initialize the list with results of external models, whose metadata cannot be fetched from the Hub
+     df_list = []
+     for model in EXTERNAL_MODEL_RESULTS:
+         results_list = []
+         for task in tasks:
+             # Not all models have InstructionRetrieval or other new tasks
+             if task not in EXTERNAL_MODEL_RESULTS[model]:
+                 continue
+             results_list += EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task]]
+
+         if len(datasets) > 0:
+             res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
+         elif langs:
+             # Would be cleaner to rely on an extra language column instead
+             langs_format = [f"({lang})" for lang in langs]
+             res = {k: v for d in results_list for k, v in d.items() if any([k.split(" ")[-1] in (k, x) for x in langs_format])}
+         else:
+             res = {k: v for d in results_list for k, v in d.items()}
+         # Model & at least one result
+         if len(res) > 1:
+             if add_emb_dim:
+                 res["Model Size (Million Parameters)"] = EXTERNAL_MODEL_TO_SIZE.get(model, "")
+                 res["Memory Usage (GB, fp32)"] = round(res["Model Size (Million Parameters)"] * 1e6 * 4 / 1024**3, 2) if res["Model Size (Million Parameters)"] != "" else ""
+                 res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
+                 res["Max Tokens"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
+             df_list.append(res)
+
+     for model in models:
+         if model.modelId in MODELS_TO_SKIP: continue
+         print("MODEL", model.modelId)
+         if model.modelId not in MODEL_INFOS or refresh:
+             readme_path = hf_hub_download(model.modelId, filename="README.md")
+             meta = metadata_load(readme_path)
+             MODEL_INFOS[model.modelId] = {
+                 "metadata": meta
+             }
+         meta = MODEL_INFOS[model.modelId]["metadata"]
+         if "model-index" not in meta:
+             continue
+         # meta['model-index'][0]["results"] is a list of elements like:
+         # {
+         #     "task": {"type": "Classification"},
+         #     "dataset": {
+         #         "type": "mteb/amazon_massive_intent",
+         #         "name": "MTEB MassiveIntentClassification (nb)",
+         #         "config": "nb",
+         #         "split": "test",
+         #     },
+         #     "metrics": [
+         #         {"type": "accuracy", "value": 39.81506388702084},
+         #         {"type": "f1", "value": 38.809586587791664},
+         #     ],
+         # },
+         # Use "get" instead of dict indexing to skip incompatible metadata instead of erroring out
+         if len(datasets) > 0:
+             task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and any([x in sub_res.get("dataset", {}).get("name", "") for x in datasets])]
+         elif langs:
+             task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and (sub_res.get("dataset", {}).get("config", "default") in ("default", *langs))]
+         else:
+             task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks)]
+         out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if score["type"] == task_to_metric.get(res["task"]["type"])][0]} for res in task_results]
+         out = {k: v for d in out for k, v in d.items()}
+         out["Model"] = make_clickable_model(model.modelId)
+         # Model & at least one result
+         if len(out) > 1:
+             if add_emb_dim:
+                 try:
+                     # Fails on gated repos, so for those we only include scores
+                     if "dim_seq_size" not in MODEL_INFOS[model.modelId] or refresh:
+                         MODEL_INFOS[model.modelId]["dim_seq_size"] = list(get_dim_seq_size(model))
+                     out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] = tuple(MODEL_INFOS[model.modelId]["dim_seq_size"])
+                 except Exception:
+                     MODEL_INFOS[model.modelId]["dim_seq_size"] = "", "", "", ""
+             df_list.append(out)
+             if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
+                 SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
+
+     # Save & cache MODEL_INFOS
+     with open("model_infos.json", "w") as f:
+         json.dump(MODEL_INFOS, f)
+
+     df = pd.DataFrame(df_list)
+     # If there are any models that are the same, merge them
+     # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
+     df = df.groupby("Model", as_index=False).first()
+     # Put 'Model' column first
+     cols = sorted(list(df.columns))
+     base_columns = ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens"]
+     if len(datasets) > 0:
+         # Filter out invalid columns
+         cols = [col for col in cols if col in base_columns + datasets]
+     i = 0
+     for column in base_columns:
+         if column in cols:
+             cols.insert(i, cols.pop(cols.index(column)))
+             i += 1
+     df = df[cols]
+     if rank:
+         df = add_rank(df)
+     if fillna:
+         df.fillna("", inplace=True)
+     return df
+
+ # Get dict with a task list for each task category
+ # E.g. {"Classification": ["AmazonMassiveIntentClassification (en)", ...], "PairClassification": ["SprintDuplicateQuestions", ...]}
+ def get_mteb_average(task_dict: dict, refresh=True):
+     all_tasks = reduce(lambda x, y: x + y, task_dict.values())
+     DATA_OVERALL = get_mteb_data(
+         tasks=list(task_dict.keys()),
+         datasets=all_tasks,
+         fillna=False,
+         add_emb_dim=True,
+         rank=False,
+         refresh=refresh
+     )
+     # Debugging:
+     # DATA_OVERALL.to_csv("overall.csv")
+
+     DATA_OVERALL.insert(1, f"Average ({len(all_tasks)} datasets)", DATA_OVERALL[all_tasks].mean(axis=1, skipna=False))
+     for i, (task_category, task_category_list) in enumerate(task_dict.items()):
+         DATA_OVERALL.insert(i+2, f"{task_category} Average ({len(task_category_list)} datasets)", DATA_OVERALL[task_category_list].mean(axis=1, skipna=False))
+     DATA_OVERALL.sort_values(f"Average ({len(all_tasks)} datasets)", ascending=False, inplace=True)
+     # Start ranking from 1
+     DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))
+
+     DATA_OVERALL = DATA_OVERALL.round(2)
+
+     DATA_TASKS = {}
+     for task_category, task_category_list in task_dict.items():
+         DATA_TASKS[task_category] = add_rank(DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + task_category_list])
+         DATA_TASKS[task_category] = DATA_TASKS[task_category][DATA_TASKS[task_category].iloc[:, 4:].ne("").any(axis=1)]
+
+     # Fill NaN after averaging
+     DATA_OVERALL.fillna("", inplace=True)
+
+     data_overall_rows = ["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens", f"Average ({len(all_tasks)} datasets)"]
+     for task_category, task_category_list in task_dict.items():
+         data_overall_rows.append(f"{task_category} Average ({len(task_category_list)} datasets)")
+
+     DATA_OVERALL = DATA_OVERALL[data_overall_rows]
+     DATA_OVERALL = DATA_OVERALL[DATA_OVERALL.iloc[:, 5:].ne("").any(axis=1)]
+
+     return DATA_OVERALL, DATA_TASKS
+
+ boards_data = {}
+ all_data_tasks = []
+ for board, board_config in BOARDS_CONFIG.items():
+     boards_data[board] = {
+         "data_overall": None,
+         "data_tasks": {}
+     }
+     if board_config["has_overall"]:
+         data_overall, data_tasks = get_mteb_average(board_config["tasks"], refresh=False)
+         boards_data[board]["data_overall"] = data_overall
+         boards_data[board]["data_tasks"] = data_tasks
+         all_data_tasks.extend(data_tasks.values())
+     else:
+         for task_category, task_category_list in board_config["tasks"].items():
+             data_task_category = get_mteb_data(tasks=[task_category], datasets=task_category_list, refresh=False)
+             data_task_category.drop(columns=["Embedding Dimensions", "Max Tokens"], inplace=True)
+             boards_data[board]["data_tasks"][task_category] = data_task_category
+             all_data_tasks.append(data_task_category)
+
+ # Exact counts: add up all non-NaN score values for every dataset
+ NUM_SCORES = 0
+ DATASETS = []
+ MODELS = []
+ # LANGUAGES = []
+ for d in all_data_tasks:
+     # NUM_SCORES += d.iloc[:, 1:].apply(lambda x: sum([1 for y in x if isinstance(y, float) and not np.isnan(y)]), axis=1).sum()
+     cols_to_ignore = 4 if "Average" in d.columns else 3
+     # Count number of scores including only non-nan floats & excluding the rank column
+     NUM_SCORES += d.iloc[:, cols_to_ignore:].notna().sum().sum()
+     # Exclude rank & model name column (first two); Do not count different language versions as different datasets
+     DATASETS += [i.split(" ")[0] for i in d.columns[cols_to_ignore:]]
+     # LANGUAGES += [i.split(" ")[-1] for i in d.columns[cols_to_ignore:]]
+     MODELS += d["Model"].tolist()
+
+ NUM_DATASETS = len(set(DATASETS))
+ # NUM_LANGUAGES = len(set(LANGUAGES))
+ NUM_MODELS = len(set(MODELS))
+
+ # 1. Force headers to wrap
+ # 2. Force model column (maximum) width
+ # 3. Prevent model column from overflowing, scroll instead
+ # 4. Prevent checkbox groups from taking up too much space
+ css = """
+ table > thead {
+     white-space: normal
+ }
+
+ table {
+     --cell-width-1: 250px
+ }
+
+ table > tbody > tr > td:nth-child(2) > div {
+     overflow-x: auto
+ }
+
+ .filter-checkbox-group {
+     max-width: max-content;
+ }
+ """
+
+ """
+ Each inner tab can have the following keys:
+ - language: The language of the leaderboard
+ - language_long: [optional] The long form of the language
+ - description: The description of the leaderboard
+ - credits: [optional] The credits for the leaderboard
+ - data: The data for the leaderboard
+ - refresh: The function to refresh the leaderboard
+ """
+
+ def get_refresh_function(task_category, task_list):
+     def _refresh():
+         data_task_category = get_mteb_data(tasks=[task_category], datasets=task_list)
+         data_task_category.drop(columns=["Embedding Dimensions", "Max Tokens"], inplace=True)
+         return data_task_category
+     return _refresh
+
+ data = {
+     "Overall": {"metric": "Various, refer to task tabs", "data": []}
+ }
+ for task in TASKS:
+     data[task] = {"metric": TASKS_CONFIG[task]["metric_description"], "data": []}
+
+ for board, board_config in BOARDS_CONFIG.items():
+     init_name = board_config["title"]
+     if init_name in PRETTY_NAMES:
+         init_name = PRETTY_NAMES[init_name]
+     board_pretty_name = f"{init_name} leaderboard"
+     acronym = board_config.get("acronym", None)
+     board_icon = board_config.get("icon", None)
+     if board_icon is None:
+         board_icon = ""
+     credits = board_config.get("credits", None)
+
+     if board_config["has_overall"]:
+         overall_pretty_name = board_pretty_name
+         if acronym is not None:
+             overall_pretty_name += f" ({board_config['acronym']})"
+         data["Overall"]["data"].append({
+             "language": board_config["title"],
+             "language_long": board_config["language_long"],
+             "description": f"**Overall MTEB {overall_pretty_name}** 🔮{board_icon}",
+             "data": boards_data[board]["data_overall"],
+             # Bind board_config per iteration; a plain lambda would capture the last board only
+             "refresh": lambda board_config=board_config: get_mteb_average(board_config["tasks"])[0],  # partial(get_mteb_average, board_config["tasks"]),
+             "credits": credits,
+         })
+     for task_category, task_category_list in board_config["tasks"].items():
+         task_icon = TASKS_CONFIG[task_category]['icon']
+         if "special_icons" in board_config and isinstance(board_config["special_icons"], dict):
+             task_icon = board_config["special_icons"].get(task_category, task_icon)
+         data[task_category]["data"].append({
+             "language": board_config["title"],
+             "language_long": board_config["language_long"],
+             "description": f"**{task_category} {board_pretty_name}** {task_icon}{board_icon}",
+             "data": boards_data[board]["data_tasks"][task_category],
+             "refresh": get_refresh_function(task_category, task_category_list),
+             "credits": credits,
+         })
+
+ dataframes = []
+ full_dataframes = []
+ tabs = []
+
+ # The following JavaScript function updates the URL parameters based on the selected task and language
+ # Additionally, `update_url_task` and `update_url_language` are used to update the current task and language
+ # The current task and language are stored in the `current_task_language` and `language_per_task` JSON objects
+ # This is all a bit hacky, but it might be the only way to pass options to a JavaScript function via Gradio
+ set_window_url_params = """
+ function(goalUrlObject) {
+     const params = new URLSearchParams(window.location.search);
+     for (const [key, value] of Object.entries(goalUrlObject)) {
+         params.set(key, value);
+     };
+     const queryString = '?' + params.toString();
+     console.log(queryString);
+     window.history.replaceState({}, '', queryString);
+     return [];
+ }
+ """
+
+ def update_url_task(event: gr.SelectData, current_task_language: dict, language_per_task: dict):
+     current_task_language["task"] = event.target.id
+     # Either use the cached language for this task or the 1st language
+     try:
+         current_task_language["language"] = language_per_task.get(event.target.id, event.target.children[1].children[0].id)
+     except Exception:  # the Overall tab has no description, so its children are offset by one
+         current_task_language["language"] = language_per_task.get(event.target.id, event.target.children[0].children[0].id)
+
+     return current_task_language, language_per_task
+
+ def update_url_language(event: gr.SelectData, current_task_language: dict, language_per_task: dict):
+     current_task_language["language"] = event.target.id
+     if "task" not in current_task_language:
+         current_task_language["task"] = "overall"
+     language_per_task[current_task_language["task"]] = event.target.id
+     return current_task_language, language_per_task
+
+ NUMERIC_INTERVALS = {
+     "<100M": pd.Interval(0, 100, closed="right"),
+     "100M to 250M": pd.Interval(100, 250, closed="right"),
+     "250M to 500M": pd.Interval(250, 500, closed="right"),
+     "500M to 1B": pd.Interval(500, 1000, closed="right"),
+     ">1B": pd.Interval(1000, 1_000_000, closed="right"),
+ }
+
+ MODEL_TYPES = [
+     "Open",
+     "Proprietary",
+     "Sentence Transformers",
+     "Cross-Encoders",
+     "Bi-Encoders"
+ ]
+
+ def filter_data(search_query, model_types, model_sizes, *full_dataframes):
+     output_dataframes = []
+     for df in full_dataframes:
+         # Apply the search query
+         if search_query:
+             names = df["Model"].map(lambda x: re.match("<a .+?>(.+)</a>", x).group(1))
+             masks = []
+             for query in search_query.split(";"):
+                 masks.append(names.str.contains(query))
+             df = df[reduce(lambda a, b: a | b, masks)]
+
+         # Apply the model type filtering
+         if set(model_types) != set(MODEL_TYPES):
+             masks = []
+             for model_type in model_types:
+                 if model_type == "Open":
+                     masks.append(~df["Model"].isin(PROPRIETARY_MODELS))
+                 elif model_type == "Proprietary":
+                     masks.append(df["Model"].isin(PROPRIETARY_MODELS))
+                 elif model_type == "Sentence Transformers":
+                     masks.append(df["Model"].isin(SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS))
+                 elif model_type == "Cross-Encoders":
+                     masks.append(df["Model"].isin(CROSS_ENCODERS))
+                 elif model_type == "Bi-Encoders":
+                     masks.append(df["Model"].isin(BI_ENCODERS))
+             if masks:
+                 df = df[reduce(lambda a, b: a | b, masks)]
+             else:
+                 df = pd.DataFrame(columns=df.columns)
+
+         # Apply the model size filtering
+         if set(model_sizes) != set(NUMERIC_INTERVALS.keys()):
+             numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[model_size] for model_size in model_sizes]))
+             sizes = df["Model Size (Million Parameters)"].replace('', 0)
+             mask = sizes.apply(lambda size: any(numeric_interval.contains(size)))
+             df = df[mask]
+
+         output_dataframes.append(df)
+     return output_dataframes
+
+
+ with gr.Blocks(css=css) as block:
+
+     # Store the current task and language for updating the URL. This is a bit hacky, but it works
+     # for passing the current task and language to the JavaScript function via Gradio
+     current_task_language = gr.JSON(value=dict(), visible=False)
+     language_per_task = gr.JSON(value=dict(), visible=False)
+
+     gr.Markdown(f"""
+     Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗 Refer to the [MTEB paper](https://arxiv.org/abs/2210.07316) for details on metrics, tasks and models.
+     """)
+
+     with gr.Row():
+         search_bar = gr.Textbox(
+             label="Search Bar (separate multiple queries with `;`)",
+             placeholder=" 🔍 Search for a model and press enter...",
+         )
+         filter_model_type = gr.CheckboxGroup(
+             label="Model types",
+             choices=MODEL_TYPES,
+             value=MODEL_TYPES,
+             interactive=True,
+             elem_classes=["filter-checkbox-group"]
+         )
+         filter_model_sizes = gr.CheckboxGroup(
+             label="Model sizes (in number of parameters)",
+             choices=list(NUMERIC_INTERVALS.keys()),
+             value=list(NUMERIC_INTERVALS.keys()),
+             interactive=True,
+             elem_classes=["filter-checkbox-group"],
+             scale=2,
+         )
+
+     with gr.Tabs() as outer_tabs:
+         # Store the tabs for updating them on load based on URL parameters
+         tabs.append(outer_tabs)
+         for task, task_values in data.items():
+             metric = task_values["metric"]
+             task_tab_id = task.lower().replace(" ", "-")
+
+             # Overall, Bitext Mining, Classification, etc.
+             pretty_task_name = task if task not in PRETTY_NAMES.keys() else PRETTY_NAMES[task]
+             with gr.Tab(pretty_task_name, id=task_tab_id) as task_tab:
+                 # For updating the 'task' in the URL
+                 task_tab.select(update_url_task, [current_task_language, language_per_task], [current_task_language, language_per_task]).then(None, [current_task_language], [], js=set_window_url_params)
+                 if "Overall" != task:
+                     gr.Markdown(TASK_DESCRIPTIONS[task])
+                 with gr.Tabs() as task_tabs:
+                     # Store the task tabs for updating them on load based on URL parameters
+                     tabs.append(task_tabs)
+
+                     for item in task_values["data"]:
+                         item_tab_id = item["language"].lower().replace(" ", "-")
+
+                         # English, Chinese, French, etc.
+                         with gr.Tab(item["language"], id=item_tab_id) as item_tab:
+                             # For updating the 'language' in the URL
+                             item_tab.select(update_url_language, [current_task_language, language_per_task], [current_task_language, language_per_task], trigger_mode="always_last").then(None, [current_task_language], [], js=set_window_url_params)
+
+                             with gr.Row():
+                                 gr.Markdown(f"""
+                                 {item['description']}
+
+                                 - **Metric:** {metric}
+                                 - **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
+                                 {"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
+                                 """)
+
+                             with gr.Row():
+                                 datatype = ["number", "markdown"] + ["number"] * len(item["data"])
+                                 dataframe = gr.Dataframe(item["data"], datatype=datatype, type="pandas", height=500)
+                                 dataframes.append(dataframe)
+
+                                 full_dataframe = gr.Dataframe(item["data"], datatype=datatype, type="pandas", visible=False)
+                                 full_dataframes.append(full_dataframe)
+
+                             with gr.Row():
+                                 refresh_button = gr.Button("Refresh")
+                                 refresh_button.click(item["refresh"], inputs=None, outputs=dataframe, concurrency_limit=20)
+
+     gr.Markdown(f"""
+     - **Total Datasets**: {NUM_DATASETS}
+     - **Total Languages**: 113
+     - **Total Scores**: {NUM_SCORES}
+     - **Total Models**: {NUM_MODELS}
+     """ + r"""
+     Made with ❤️ for NLP. If this work is useful to you, please consider citing:
+
+     ```bibtex
+     @article{muennighoff2022mteb,
+         doi = {10.48550/ARXIV.2210.07316},
+         url = {https://arxiv.org/abs/2210.07316},
+         author = {Muennighoff, Niklas and Tazi, Nouamane and Magne, Lo{\"\i}c and Reimers, Nils},
+         title = {MTEB: Massive Text Embedding Benchmark},
+         publisher = {arXiv},
+         journal={arXiv preprint arXiv:2210.07316},
+         year = {2022}
+     }
+     ```
+     """)
+
+     def set_tabs_on_load(request: gr.Request):
+         """Set the selected tab based on the URL parameters on load."""
+         global tabs
+         valid_task_keys = [child.id for child in tabs[0].children]
+         return_tabs = [gr.Tabs()] * len(tabs)
+
+         query_params = request.request.query_params
+         task_key = query_params.get("task", "overall")
+         if task_key not in valid_task_keys:
+             task_key = "overall"
+         return_tabs[0] = gr.Tabs(selected=task_key)
+
+         tabs_idx = valid_task_keys.index(task_key) + 1
+         language_key = query_params.get("language", "english")
+         return_tabs[tabs_idx] = gr.Tabs(selected=language_key)
+         current_task_language = {"task": task_key, "language": language_key}
+         language_per_task = {task_key: language_key}
+         return return_tabs + [current_task_language, language_per_task]
+
+     block.load(set_tabs_on_load, inputs=[], outputs=tabs + [current_task_language, language_per_task])
+
+     search_bar.submit(filter_data, inputs=[search_bar, filter_model_type, filter_model_sizes] + full_dataframes, outputs=dataframes)
+     filter_model_type.change(filter_data, inputs=[search_bar, filter_model_type, filter_model_sizes] + full_dataframes, outputs=dataframes)
+     filter_model_sizes.change(filter_data, inputs=[search_bar, filter_model_type, filter_model_sizes] + full_dataframes, outputs=dataframes)
+
+ block.queue(max_size=10)
+ block.launch()
+
+ # Possible changes:
+ # Could add graphs / other visual content
+ # Could add verification marks
+
+ # Sources:
+ # https://huggingface.co/spaces/gradio/leaderboard
+ # https://huggingface.co/spaces/huggingface-projects/Deep-Reinforcement-Learning-Leaderboard
+ # https://getemoji.com/
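
Note: app.py estimates the memory of external models from the parameter count alone, via `size * 1e6 * 4 / 1024**3` (four bytes per fp32 parameter). A minimal standalone sketch of that formula (the helper name `fp32_memory_gb` is ours, not from the PR):

```python
def fp32_memory_gb(million_parameters: float) -> float:
    """Estimate fp32 memory usage in GB from a parameter count in millions.

    Mirrors the formula used in app.py: 4 bytes per parameter.
    """
    return round(million_parameters * 1e6 * 4 / 1024**3, 2)

# e.g. a 7099M-parameter model (gte-Qwen1.5-7B-instruct in model_meta.yaml)
print(fp32_memory_gb(7099))  # ~26.45 GB
```
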
config.yaml ADDED
@@ -0,0 +1,389 @@
+ config:
+   REPO_ID: "mteb/leaderboard"
+   RESULTS_REPO: mteb/results
+   LEADERBOARD_NAME: "MTEB Leaderboard"
+ tasks:
+   BitextMining:
+     icon: "🎌"
+     metric: f1
+     metric_description: "[F1](https://huggingface.co/spaces/evaluate-metric/f1)"
+     task_description: "Bitext mining is the task of finding parallel sentences in two languages."
+   Classification:
+     icon: "❤️"
+     metric: accuracy
+     metric_description: "[Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)"
+     task_description: "Classification is the task of assigning a label to a text."
+   Clustering:
+     icon: "✨"
+     metric: v_measure
+     metric_description: "Validity Measure (v_measure)"
+     task_description: "Clustering is the task of grouping similar documents together."
+   PairClassification:
+     icon: "🎭"
+     metric: cos_sim_ap
+     metric_description: "Average Precision based on Cosine Similarities (cos_sim_ap)"
+     task_description: "Pair classification is the task of determining whether two texts are similar."
+   Reranking:
+     icon: "🥈"
+     metric: map
+     metric_description: "Mean Average Precision (MAP)"
+     task_description: "Reranking is the task of reordering a list of documents to improve relevance."
+   Retrieval:
+     icon: "🔎"
+     metric: ndcg_at_10
+     metric_description: "Normalized Discounted Cumulative Gain @ k (ndcg_at_10)"
+     task_description: "Retrieval is the task of finding relevant documents for a query."
+   STS:
+     icon: "🤖"
+     metric: cos_sim_spearman
+     metric_description: "Spearman correlation based on cosine similarity"
+     task_description: "Semantic Textual Similarity is the task of determining how similar two texts are."
+   Summarization:
+     icon: "📜"
+     metric: cos_sim_spearman
+     metric_description: "Spearman correlation based on cosine similarity"
+     task_description: "Summarization is the task of generating a summary of a text."
+   InstructionRetrieval:
+     icon: "🔎📋"
+     metric: "p-MRR"
+     metric_description: "paired mean reciprocal rank"
+     task_description: "Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions."
+ boards:
+   en:
+     title: English
+     language_long: "English"
+     has_overall: true
+     acronym: null
+     icon: null
+     special_icons: null
+     credits: null
+     tasks:
+       Classification:
+         - AmazonCounterfactualClassification (en)
+         - AmazonPolarityClassification
+         - AmazonReviewsClassification (en)
+         - Banking77Classification
+         - EmotionClassification
+         - ImdbClassification
+         - MassiveIntentClassification (en)
+         - MassiveScenarioClassification (en)
+         - MTOPDomainClassification (en)
+         - MTOPIntentClassification (en)
+         - ToxicConversationsClassification
+         - TweetSentimentExtractionClassification
+       Clustering:
+         - ArxivClusteringP2P
+         - ArxivClusteringS2S
+         - BiorxivClusteringP2P
+         - BiorxivClusteringS2S
+         - MedrxivClusteringP2P
+         - MedrxivClusteringS2S
+         - RedditClustering
+         - RedditClusteringP2P
+         - StackExchangeClustering
+         - StackExchangeClusteringP2P
+         - TwentyNewsgroupsClustering
+       PairClassification:
+         - SprintDuplicateQuestions
+         - TwitterSemEval2015
+         - TwitterURLCorpus
+       Reranking:
+         - AskUbuntuDupQuestions
+         - MindSmallReranking
+         - SciDocsRR
+         - StackOverflowDupQuestions
+       Retrieval:
+         - ArguAna
+         - ClimateFEVER
+         - CQADupstackRetrieval
+         - DBPedia
+         - FEVER
+         - FiQA2018
+         - HotpotQA
+         - MSMARCO
+         - NFCorpus
+         - NQ
+         - QuoraRetrieval
+         - SCIDOCS
+         - SciFact
+         - Touche2020
+         - TRECCOVID
+       STS:
+         - BIOSSES
+         - SICK-R
+         - STS12
+         - STS13
+         - STS14
+         - STS15
+         - STS16
+         - STS17 (en-en)
+         - STS22 (en)
+         - STSBenchmark
+       Summarization:
+         - SummEval
+   en-x:
+     title: "English-X"
+     language_long: "117 (Pairs of: English & other language)"
+     has_overall: false
+     acronym: null
+     icon: null
+     special_icons: null
+     credits: null
+     tasks:
+       BitextMining: ['BUCC (de-en)', 'BUCC (fr-en)', 'BUCC (ru-en)', 'BUCC (zh-en)', 'Tatoeba (afr-eng)', 'Tatoeba (amh-eng)', 'Tatoeba (ang-eng)', 'Tatoeba (ara-eng)', 'Tatoeba (arq-eng)', 'Tatoeba (arz-eng)', 'Tatoeba (ast-eng)', 'Tatoeba (awa-eng)', 'Tatoeba (aze-eng)', 'Tatoeba (bel-eng)', 'Tatoeba (ben-eng)', 'Tatoeba (ber-eng)', 'Tatoeba (bos-eng)', 'Tatoeba (bre-eng)', 'Tatoeba (bul-eng)', 'Tatoeba (cat-eng)', 'Tatoeba (cbk-eng)', 'Tatoeba (ceb-eng)', 'Tatoeba (ces-eng)', 'Tatoeba (cha-eng)', 'Tatoeba (cmn-eng)', 'Tatoeba (cor-eng)', 'Tatoeba (csb-eng)', 'Tatoeba (cym-eng)', 'Tatoeba (dan-eng)', 'Tatoeba (deu-eng)', 'Tatoeba (dsb-eng)', 'Tatoeba (dtp-eng)', 'Tatoeba (ell-eng)', 'Tatoeba (epo-eng)', 'Tatoeba (est-eng)', 'Tatoeba (eus-eng)', 'Tatoeba (fao-eng)', 'Tatoeba (fin-eng)', 'Tatoeba (fra-eng)', 'Tatoeba (fry-eng)', 'Tatoeba (gla-eng)', 'Tatoeba (gle-eng)', 'Tatoeba (glg-eng)', 'Tatoeba (gsw-eng)', 'Tatoeba (heb-eng)', 'Tatoeba (hin-eng)', 'Tatoeba (hrv-eng)', 'Tatoeba (hsb-eng)', 'Tatoeba (hun-eng)', 'Tatoeba (hye-eng)', 'Tatoeba (ido-eng)', 'Tatoeba (ile-eng)', 'Tatoeba (ina-eng)', 'Tatoeba (ind-eng)', 'Tatoeba (isl-eng)', 'Tatoeba (ita-eng)', 'Tatoeba (jav-eng)', 'Tatoeba (jpn-eng)', 'Tatoeba (kab-eng)', 'Tatoeba (kat-eng)', 'Tatoeba (kaz-eng)', 'Tatoeba (khm-eng)', 'Tatoeba (kor-eng)', 'Tatoeba (kur-eng)', 'Tatoeba (kzj-eng)', 'Tatoeba (lat-eng)', 'Tatoeba (lfn-eng)', 'Tatoeba (lit-eng)', 'Tatoeba (lvs-eng)', 'Tatoeba (mal-eng)', 'Tatoeba (mar-eng)', 'Tatoeba (max-eng)', 'Tatoeba (mhr-eng)', 'Tatoeba (mkd-eng)', 'Tatoeba (mon-eng)', 'Tatoeba (nds-eng)', 'Tatoeba (nld-eng)', 'Tatoeba (nno-eng)', 'Tatoeba (nob-eng)', 'Tatoeba (nov-eng)', 'Tatoeba (oci-eng)', 'Tatoeba (orv-eng)', 'Tatoeba (pam-eng)', 'Tatoeba (pes-eng)', 'Tatoeba (pms-eng)', 'Tatoeba (pol-eng)', 'Tatoeba (por-eng)', 'Tatoeba (ron-eng)', 'Tatoeba (rus-eng)', 'Tatoeba (slk-eng)', 'Tatoeba (slv-eng)', 'Tatoeba (spa-eng)', 'Tatoeba (sqi-eng)', 'Tatoeba (srp-eng)', 'Tatoeba (swe-eng)', 'Tatoeba (swg-eng)', 'Tatoeba (swh-eng)', 'Tatoeba (tam-eng)', 'Tatoeba (tat-eng)', 'Tatoeba (tel-eng)', 'Tatoeba (tgl-eng)', 'Tatoeba (tha-eng)', 'Tatoeba (tuk-eng)', 'Tatoeba (tur-eng)', 'Tatoeba (tzl-eng)', 'Tatoeba (uig-eng)', 'Tatoeba (ukr-eng)', 'Tatoeba (urd-eng)', 'Tatoeba (uzb-eng)', 'Tatoeba (vie-eng)', 'Tatoeba (war-eng)', 'Tatoeba (wuu-eng)', 'Tatoeba (xho-eng)', 'Tatoeba (yid-eng)', 'Tatoeba (yue-eng)', 'Tatoeba (zsm-eng)']
+   zh:
+     title: Chinese
+     language_long: Chinese
+     has_overall: true
+     acronym: C-MTEB
+     icon: "🇨🇳"
+     special_icons:
+       Classification: "🧡"
+     credits: "[FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)"
+     tasks:
+       Classification:
+         - AmazonReviewsClassification (zh)
+         - IFlyTek
+         - JDReview
+         - MassiveIntentClassification (zh-CN)
+         - MassiveScenarioClassification (zh-CN)
+         - MultilingualSentiment
+         - OnlineShopping
+         - TNews
+         - Waimai
+       Clustering:
+         - CLSClusteringP2P
+         - CLSClusteringS2S
+         - ThuNewsClusteringP2P
+         - ThuNewsClusteringS2S
+       PairClassification:
+         - Cmnli
+         - Ocnli
+       Reranking:
+         - CMedQAv1
+         - CMedQAv2
+         - MMarcoReranking
+         - T2Reranking
+       Retrieval:
+         - CmedqaRetrieval
+         - CovidRetrieval
+         - DuRetrieval
+         - EcomRetrieval
+         - MedicalRetrieval
+         - MMarcoRetrieval
+         - T2Retrieval
+         - VideoRetrieval
+       STS:
+         - AFQMC
+         - ATEC
+         - BQ
+         - LCQMC
+         - PAWSX
+         - QBQTC
+         - STS22 (zh)
+         - STSB
+   da:
+     title: Danish
+     language_long: Danish
+     has_overall: false
+     acronym: null
+     icon: "🇩🇰"
+     special_icons:
+       Classification: "🤍"
+     credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)"
+     tasks:
+       BitextMining:
+         - BornholmBitextMining
+       Classification:
+         - AngryTweetsClassification
+         - DanishPoliticalCommentsClassification
+         - DKHateClassification
+         - LccSentimentClassification
+         - MassiveIntentClassification (da)
+         - MassiveScenarioClassification (da)
+         - NordicLangClassification
+         - ScalaDaClassification
+   fr:
+     title: French
+     language_long: "French"
+     has_overall: true
+     acronym: "F-MTEB"
+     icon: "🇫🇷"
+     special_icons:
+       Classification: "💙"
+     credits: "[Lyon-NLP](https://github.com/Lyon-NLP): [Gabriel Sequeira](https://github.com/GabrielSequeira), [Imene Kerboua](https://github.com/imenelydiaker), [Wissam Siblini](https://github.com/wissam-sib), [Mathieu Ciancone](https://github.com/MathieuCiancone), [Marion Schaeffer](https://github.com/schmarion)"
+     tasks:
+       Classification:
+         - AmazonReviewsClassification (fr)
+         - MasakhaNEWSClassification (fra)
+         - MassiveIntentClassification (fr)
+         - MassiveScenarioClassification (fr)
+         - MTOPDomainClassification (fr)
+         - MTOPIntentClassification (fr)
+       Clustering:
+         - AlloProfClusteringP2P
+         - AlloProfClusteringS2S
+         - HALClusteringS2S
+         - MLSUMClusteringP2P
+         - MLSUMClusteringS2S
+         - MasakhaNEWSClusteringP2P (fra)
+         - MasakhaNEWSClusteringS2S (fra)
+       PairClassification:
+         - OpusparcusPC (fr)
+         - PawsX (fr)
+       Reranking:
+         - AlloprofReranking
+         - SyntecReranking
+       Retrieval:
+         - AlloprofRetrieval
+         - BSARDRetrieval
+         - MintakaRetrieval (fr)
+         - SyntecRetrieval
+         - XPQARetrieval (fr)
+       STS:
+         - STS22 (fr)
+         - STSBenchmarkMultilingualSTS (fr)
+         - SICKFr
+       Summarization:
+         - SummEvalFr
+   'no':
+     title: Norwegian
+     language_long: "Norwegian Bokmål"
+     has_overall: false
+     acronym: null
+     icon: "🇳🇴"
+     special_icons:
+       Classification: "💙"
+     credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)"
+     tasks:
+       Classification: &id001
+         - NoRecClassification
+         - NordicLangClassification
+         - NorwegianParliament
+         - MassiveIntentClassification (nb)
+         - MassiveScenarioClassification (nb)
+         - ScalaNbClassification
+   instructions:
+     title: English
+     language_long: "English"
+     has_overall: false
+     acronym: null
+     icon: null
+     credits: "[Orion Weller, FollowIR](https://arxiv.org/abs/2403.15246)"
+     tasks:
+       InstructionRetrieval:
+         - Robust04InstructionRetrieval
+         - News21InstructionRetrieval
+         - Core17InstructionRetrieval
+   law:
+     title: Law
+     language_long: "English, German, Chinese"
+     has_overall: false
+     acronym: null
+     icon: "⚖️"
+     special_icons: null
+     credits: "[Voyage AI](https://www.voyageai.com/)"
+     tasks:
+       Retrieval:
+         - AILACasedocs
+         - AILAStatutes
+         - GerDaLIRSmall
+         - LeCaRDv2
+         - LegalBenchConsumerContractsQA
+         - LegalBenchCorporateLobbying
+         - LegalQuAD
+         - LegalSummarization
+   de:
+     title: German
+     language_long: "German"
+     has_overall: false
+     acronym: null
+     icon: "🇩🇪"
+     special_icons: null
+     credits: "[Silvan](https://github.com/slvnwhrl)"
+     tasks:
+       Clustering:
+         - BlurbsClusteringP2P
+         - BlurbsClusteringS2S
+         - TenKGnadClusteringP2P
+         - TenKGnadClusteringS2S
+   pl:
+     title: Polish
+     language_long: Polish
+     has_overall: true
+     acronym: null
+     icon: "🇵🇱"
+     special_icons:
+       Classification: "🤍"
+     credits: "[Rafał Poświata](https://github.com/rafalposwiata)"
+     tasks:
+       Classification:
+         - AllegroReviews
+         - CBD
+         - MassiveIntentClassification (pl)
+         - MassiveScenarioClassification (pl)
+         - PAC
+         - PolEmo2.0-IN
+         - PolEmo2.0-OUT
+       Clustering:
+         - 8TagsClustering
+       PairClassification:
+         - CDSC-E
+         - PPC
+         - PSC
+         - SICK-E-PL
+       Retrieval:
+         - ArguAna-PL
+         - DBPedia-PL
+         - FiQA-PL
+         - HotpotQA-PL
+         - MSMARCO-PL
+         - NFCorpus-PL
+         - NQ-PL
+         - Quora-PL
+         - SCIDOCS-PL
+         - SciFact-PL
+         - TRECCOVID-PL
+       STS:
+         - CDSC-R
+         - SICK-R-PL
+         - STS22 (pl)
+   se:
+     title: Swedish
+     language_long: Swedish
+     has_overall: false
+     acronym: null
+     icon: "🇸🇪"
+     special_icons:
+       Classification: "💛"
+     credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)"
+     tasks:
+       Classification:
+         - NoRecClassification
+         - NordicLangClassification
+         - NorwegianParliament
+         - MassiveIntentClassification (nb)
+         - MassiveScenarioClassification (nb)
+         - ScalaNbClassification
+   other-cls:
+     title: "Other Languages"
+     language_long: "47 (Only languages not included in the other tabs)"
+     has_overall: false
+     acronym: null
+     icon: null
+     special_icons:
+       Classification: "💜💚💙"
+     credits: null
+     tasks:
+       Classification: ['AmazonCounterfactualClassification (de)', 'AmazonCounterfactualClassification (ja)', 'AmazonReviewsClassification (de)', 'AmazonReviewsClassification (es)', 'AmazonReviewsClassification (fr)', 'AmazonReviewsClassification (ja)', 'AmazonReviewsClassification (zh)', 'MTOPDomainClassification (de)', 'MTOPDomainClassification (es)', 'MTOPDomainClassification (fr)', 'MTOPDomainClassification (hi)', 'MTOPDomainClassification (th)', 'MTOPIntentClassification (de)', 'MTOPIntentClassification (es)', 'MTOPIntentClassification (fr)', 'MTOPIntentClassification (hi)', 'MTOPIntentClassification (th)', 'MassiveIntentClassification (af)', 'MassiveIntentClassification (am)', 'MassiveIntentClassification (ar)', 'MassiveIntentClassification (az)', 'MassiveIntentClassification (bn)', 'MassiveIntentClassification (cy)', 'MassiveIntentClassification (de)', 'MassiveIntentClassification (el)', 'MassiveIntentClassification (es)', 'MassiveIntentClassification (fa)', 'MassiveIntentClassification (fi)', 'MassiveIntentClassification (fr)', 'MassiveIntentClassification (he)', 'MassiveIntentClassification (hi)', 'MassiveIntentClassification (hu)', 'MassiveIntentClassification (hy)', 'MassiveIntentClassification (id)', 'MassiveIntentClassification (is)', 'MassiveIntentClassification (it)', 'MassiveIntentClassification (ja)', 'MassiveIntentClassification (jv)', 'MassiveIntentClassification (ka)', 'MassiveIntentClassification (km)', 'MassiveIntentClassification (kn)', 'MassiveIntentClassification (ko)', 'MassiveIntentClassification (lv)', 'MassiveIntentClassification (ml)', 'MassiveIntentClassification (mn)', 'MassiveIntentClassification (ms)', 'MassiveIntentClassification (my)', 'MassiveIntentClassification (nl)', 'MassiveIntentClassification (pt)', 'MassiveIntentClassification (ro)', 'MassiveIntentClassification (ru)', 'MassiveIntentClassification (sl)', 'MassiveIntentClassification (sq)', 'MassiveIntentClassification (sw)', 'MassiveIntentClassification (ta)', 'MassiveIntentClassification (te)', 'MassiveIntentClassification (th)', 'MassiveIntentClassification (tl)', 'MassiveIntentClassification (tr)', 'MassiveIntentClassification (ur)', 'MassiveIntentClassification (vi)', 'MassiveIntentClassification (zh-TW)', 'MassiveScenarioClassification (af)', 'MassiveScenarioClassification (am)', 'MassiveScenarioClassification (ar)', 'MassiveScenarioClassification (az)', 'MassiveScenarioClassification (bn)', 'MassiveScenarioClassification (cy)', 'MassiveScenarioClassification (de)', 'MassiveScenarioClassification (el)', 'MassiveScenarioClassification (es)', 'MassiveScenarioClassification (fa)', 'MassiveScenarioClassification (fi)', 'MassiveScenarioClassification (fr)', 'MassiveScenarioClassification (he)', 'MassiveScenarioClassification (hi)', 'MassiveScenarioClassification (hu)', 'MassiveScenarioClassification (hy)', 'MassiveScenarioClassification (id)', 'MassiveScenarioClassification (is)', 'MassiveScenarioClassification (it)', 'MassiveScenarioClassification (ja)', 'MassiveScenarioClassification (jv)', 'MassiveScenarioClassification (ka)', 'MassiveScenarioClassification (km)', 'MassiveScenarioClassification (kn)', 'MassiveScenarioClassification (ko)', 'MassiveScenarioClassification (lv)', 'MassiveScenarioClassification (ml)', 'MassiveScenarioClassification (mn)', 'MassiveScenarioClassification (ms)', 'MassiveScenarioClassification (my)', 'MassiveScenarioClassification (nl)', 'MassiveScenarioClassification (pt)', 'MassiveScenarioClassification (ro)', 'MassiveScenarioClassification (ru)', 'MassiveScenarioClassification (sl)', 'MassiveScenarioClassification (sq)', 'MassiveScenarioClassification (sw)', 'MassiveScenarioClassification (ta)', 'MassiveScenarioClassification (te)', 'MassiveScenarioClassification (th)', 'MassiveScenarioClassification (tl)', 'MassiveScenarioClassification (tr)', 'MassiveScenarioClassification (ur)', 'MassiveScenarioClassification (vi)', 'MassiveScenarioClassification (zh-TW)']
+   other-sts:
+     title: Other
+     language_long: "Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish (Only language combos not included in the other tabs)"
+     has_overall: false
+     acronym: null
+     icon: null
+     special_icons:
+       STS: "👽"
+     credits: null
+     tasks:
+       STS: ["STS17 (ar-ar)", "STS17 (en-ar)", "STS17 (en-de)", "STS17 (en-tr)", "STS17 (es-en)", "STS17 (es-es)", "STS17 (fr-en)", "STS17 (it-en)", "STS17 (ko-ko)", "STS17 (nl-en)", "STS22 (ar)", "STS22 (de)", "STS22 (de-en)", "STS22 (de-fr)", "STS22 (de-pl)", "STS22 (es)", "STS22 (es-en)", "STS22 (es-it)", "STS22 (fr)", "STS22 (fr-pl)", "STS22 (it)", "STS22 (pl)", "STS22 (pl-en)", "STS22 (ru)", "STS22 (tr)", "STS22 (zh-en)", "STSBenchmark"]
envs.py ADDED
@@ -0,0 +1,48 @@
+ import os
+ from yaml import safe_load
+
+ from huggingface_hub import HfApi
+
+ LEADERBOARD_CONFIG_PATH = "config.yaml"
+ with open(LEADERBOARD_CONFIG_PATH, 'r', encoding='utf-8') as f:
+     LEADERBOARD_CONFIG = safe_load(f)
+ MODEL_META_PATH = "model_meta.yaml"
+ with open(MODEL_META_PATH, 'r', encoding='utf-8') as f:
+     MODEL_META = safe_load(f)
+
+ # Try first to get the config from the environment variables, then from the config.yaml file
+ def get_config(name, default):
+     res = None
+
+     if name in os.environ:
+         res = os.environ[name]
+     elif 'config' in LEADERBOARD_CONFIG:
+         res = LEADERBOARD_CONFIG['config'].get(name, None)
+
+     if res is None:
+         return default
+     return res
+
+ def str2bool(v):
+     return str(v).lower() in ("yes", "true", "t", "1")
+
+ # Token used to read the results & model metadata from the Hugging Face Hub
+ HF_TOKEN = get_config("HF_TOKEN", None)
+
+ LEADERBOARD_NAME = get_config("LEADERBOARD_NAME", "MTEB Leaderboard")
+
+ REPO_ID = get_config("REPO_ID", "mteb/leaderboard")
+ RESULTS_REPO = get_config("RESULTS_REPO", "mteb/results")
+
+ CACHE_PATH = get_config("HF_HOME", ".")
+ os.environ["HF_HOME"] = CACHE_PATH
+
+ # Check if the Space is using persistent storage
+ if not os.access(CACHE_PATH, os.W_OK):
+     print(f"No write access to HF_HOME: {CACHE_PATH}. Resetting to current directory.")
+     CACHE_PATH = "."
+     os.environ["HF_HOME"] = CACHE_PATH
+ else:
+     print(f"Write access confirmed for HF_HOME: {CACHE_PATH}")
+
+ API = HfApi(token=HF_TOKEN)
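
Note: `get_config` gives environment variables priority over the `config` block in config.yaml, which in turn beats the hardcoded default. A small usage sketch (the override value is hypothetical; it assumes envs.py is importable from the repo root so the YAML files can be read):

```python
import os

os.environ["RESULTS_REPO"] = "my-org/my-results"  # hypothetical override

from envs import get_config

# Environment variable wins over config.yaml ("mteb/results") and the default
print(get_config("RESULTS_REPO", "fallback/results"))  # my-org/my-results
# Names set nowhere fall through config.yaml down to the given default
print(get_config("NOT_A_KEY", "some-default"))         # some-default
```
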
model_meta.yaml ADDED
@@ -0,0 +1,1327 @@
+ model_meta:
+   gte-Qwen1.5-7B-instruct:
+     link: https://huggingface.co/Alibaba-NLP/gte-Qwen1.5-7B-instruct
+     seq_len: 32768
+     size: 7099
+     dim: 4096
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   Baichuan-text-embedding:
+     link: https://platform.baichuan-ai.com/docs/text-Embedding
+     seq_len: 512
+     size: null
+     dim: 1024
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   Cohere-embed-english-v3.0:
+     link: https://huggingface.co/Cohere/Cohere-embed-english-v3.0
+     seq_len: 512
+     size: null
+     dim: 1024
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   Cohere-embed-multilingual-light-v3.0:
+     link: https://huggingface.co/Cohere/Cohere-embed-multilingual-light-v3.0
+     seq_len: 512
+     size: null
+     dim: 384
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   Cohere-embed-multilingual-v3.0:
+     link: https://huggingface.co/Cohere/Cohere-embed-multilingual-v3.0
+     seq_len: 512
+     size: null
+     dim: 1024
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   DanskBERT:
+     link: https://huggingface.co/vesteinn/DanskBERT
+     seq_len: 514
+     size: 125
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   FollowIR-7B:
+     link: https://huggingface.co/jhu-clsp/FollowIR-7B
+     seq_len: 4096
+     size: 7240
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   GritLM-7B:
+     link: https://huggingface.co/GritLM/GritLM-7B
+     seq_len: 4096
+     size: 7240
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   LASER2:
+     link: https://github.com/facebookresearch/LASER
+     seq_len: N/A
+     size: 43
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   LLM2Vec-Llama-2-supervised:
+     link: https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-supervised
+     seq_len: 4096
+     size: 6607
+     dim: 4096
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   LLM2Vec-Llama-2-unsupervised:
+     link: https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-unsup-simcse
+     seq_len: 4096
+     size: 6607
+     dim: 4096
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   LLM2Vec-Meta-Llama-3-supervised:
+     link: https://huggingface.co/McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised
+     seq_len: 8192
+     size: 7505
+     dim: 4096
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   LLM2Vec-Meta-Llama-3-unsupervised:
+     link: https://huggingface.co/McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-unsup-simcse
+     seq_len: 8192
+     size: 7505
+     dim: 4096
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   LLM2Vec-Mistral-supervised:
+     link: https://huggingface.co/McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised
+     seq_len: 32768
+     size: 7111
+     dim: 4096
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   LLM2Vec-Mistral-unsupervised:
+     link: https://huggingface.co/McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse
+     seq_len: 32768
+     size: 7111
+     dim: 4096
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   LLM2Vec-Sheared-Llama-supervised:
+     link: https://huggingface.co/McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised
+     seq_len: 4096
+     size: 1280
+     dim: 2048
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   LLM2Vec-Sheared-Llama-unsupervised:
+     link: https://huggingface.co/McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-unsup-simcse
+     seq_len: 4096
+     size: 1280
+     dim: 2048
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   LaBSE:
+     link: https://huggingface.co/sentence-transformers/LaBSE
+     seq_len: 512
+     size: 471
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   OpenSearch-text-hybrid:
+     link: https://help.aliyun.com/zh/open-search/vector-search-edition/hybrid-retrieval
+     seq_len: 512
+     size: null
+     dim: 1792
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   all-MiniLM-L12-v2:
+     link: https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2
+     seq_len: 512
+     size: 33
+     dim: 384
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   all-MiniLM-L6-v2:
+     link: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
+     seq_len: 512
+     size: 23
+     dim: 384
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   all-mpnet-base-v2:
+     link: https://huggingface.co/sentence-transformers/all-mpnet-base-v2
+     seq_len: 514
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   allenai-specter:
+     link: https://huggingface.co/sentence-transformers/allenai-specter
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   bert-base-10lang-cased:
+     link: https://huggingface.co/Geotrend/bert-base-10lang-cased
+     seq_len: 512
+     size: 138
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   bert-base-15lang-cased:
+     link: https://huggingface.co/Geotrend/bert-base-15lang-cased
+     seq_len: 512
+     size: 138
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   bert-base-25lang-cased:
+     link: https://huggingface.co/Geotrend/bert-base-25lang-cased
+     seq_len: 512
+     size: 138
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   bert-base-multilingual-cased:
+     link: https://huggingface.co/google-bert/bert-base-multilingual-cased
+     seq_len: 512
+     size: 179
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   bert-base-multilingual-uncased:
+     link: https://huggingface.co/google-bert/bert-base-multilingual-uncased
+     seq_len: 512
+     size: 168
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   bert-base-swedish-cased:
+     link: https://huggingface.co/KB/bert-base-swedish-cased
+     seq_len: 512
+     size: 125
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   bert-base-uncased:
+     link: https://huggingface.co/bert-base-uncased
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   bge-base-zh-v1.5:
+     link: https://huggingface.co/BAAI/bge-base-zh-v1.5
+     seq_len: 512
+     size: 102
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   bge-large-en-v1.5:
+     link: https://huggingface.co/BAAI/bge-large-en-v1.5
+     seq_len: 512
+     size: null
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   bge-large-zh-noinstruct:
+     link: https://huggingface.co/BAAI/bge-large-zh-noinstruct
+     seq_len: 512
+     size: 326
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   bge-large-zh-v1.5:
+     link: https://huggingface.co/BAAI/bge-large-zh-v1.5
+     seq_len: 512
+     size: 326
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   bge-small-zh-v1.5:
+     link: https://huggingface.co/BAAI/bge-small-zh-v1.5
+     seq_len: 512
+     size: 24
+     dim: 512
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   bm25:
+     link: https://en.wikipedia.org/wiki/Okapi_BM25
+     size: 0
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   camembert-base:
+     link: https://huggingface.co/almanach/camembert-base
+     seq_len: 512
+     size: 111
+     dim: 512
+     is_external: false
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   camembert-large:
+     link: https://huggingface.co/almanach/camembert-large
+     seq_len: 512
+     size: 338
+     dim: 768
+     is_external: false
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   contriever-base-msmarco:
+     link: https://huggingface.co/nthakur/contriever-base-msmarco
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   cross-en-de-roberta-sentence-transformer:
+     link: https://huggingface.co/T-Systems-onsite/cross-en-de-roberta-sentence-transformer
+     seq_len: 514
+     size: 278
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   dfm-encoder-large-v1:
+     link: https://huggingface.co/chcaa/dfm-encoder-large-v1
+     seq_len: 512
+     size: 355
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   dfm-sentence-encoder-large-1:
+     link: https://huggingface.co/chcaa/dfm-encoder-large-v1
+     seq_len: 512
+     size: 355
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   distilbert-base-25lang-cased:
+     link: https://huggingface.co/Geotrend/distilbert-base-25lang-cased
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: false
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   distilbert-base-en-fr-cased:
+     link: https://huggingface.co/Geotrend/distilbert-base-en-fr-cased
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: false
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   distilbert-base-en-fr-es-pt-it-cased:
+     link: https://huggingface.co/Geotrend/distilbert-base-en-fr-es-pt-it-cased
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: false
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   distilbert-base-fr-cased:
+     link: https://huggingface.co/Geotrend/distilbert-base-fr-cased
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: false
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   distilbert-base-uncased:
+     link: https://huggingface.co/distilbert-base-uncased
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: false
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   distiluse-base-multilingual-cased-v2:
+     link: https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2
+     seq_len: 512
+     size: 135
+     dim: 512
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   e5-base-v2:
+     link: https://huggingface.co/intfloat/e5-base-v2
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   e5-base:
+     link: https://huggingface.co/intfloat/e5-base
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   e5-large-v2:
+     link: https://huggingface.co/intfloat/e5-large-v2
+     seq_len: 512
+     size: 335
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   e5-large:
+     link: https://huggingface.co/intfloat/e5-large
+     seq_len: 512
+     size: 335
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   e5-mistral-7b-instruct:
+     link: https://huggingface.co/intfloat/e5-mistral-7b-instruct
+     seq_len: 32768
+     size: 7111
+     dim: 4096
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   e5-small:
+     link: https://huggingface.co/intfloat/e5-small
+     seq_len: 512
+     size: 33
+     dim: 384
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   electra-small-nordic:
+     link: https://huggingface.co/jonfd/electra-small-nordic
+     seq_len: 512
+     size: 23
+     dim: 256
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   electra-small-swedish-cased-discriminator:
+     link: https://huggingface.co/KBLab/electra-small-swedish-cased-discriminator
+     seq_len: 512
+     size: 16
+     dim: 256
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   flan-t5-base:
+     link: https://huggingface.co/google/flan-t5-base
+     seq_len: 512
+     size: 220
+     dim: -1
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   flan-t5-large:
+     link: https://huggingface.co/google/flan-t5-large
+     seq_len: 512
+     size: 770
+     dim: -1
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   flaubert_base_cased:
+     link: https://huggingface.co/flaubert/flaubert_base_cased
+     seq_len: 512
+     size: 138
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   flaubert_base_uncased:
+     link: https://huggingface.co/flaubert/flaubert_base_uncased
+     seq_len: 512
+     size: 138
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   flaubert_large_cased:
+     link: https://huggingface.co/flaubert/flaubert_large_cased
+     seq_len: 512
+     size: 372
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   gbert-base:
+     link: https://huggingface.co/deepset/gbert-base
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   gbert-large:
+     link: https://huggingface.co/deepset/gbert-large
+     seq_len: 512
+     size: 337
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   gelectra-base:
+     link: https://huggingface.co/deepset/gelectra-base
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   gelectra-large:
+     link: https://huggingface.co/deepset/gelectra-large
+     seq_len: 512
+     size: 335
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   glove.6B.300d:
+     link: https://huggingface.co/sentence-transformers/average_word_embeddings_glove.6B.300d
+     seq_len: N/A
+     size: 120
+     dim: 300
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   google-gecko-256.text-embedding-preview-0409:
+     link: https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#latest_models
+     seq_len: 2048
+     size: 1200
+     dim: 256
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   google-gecko.text-embedding-preview-0409:
+     link: https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#latest_models
+     seq_len: 2048
+     size: 1200
+     dim: 768
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   gottbert-base:
+     link: https://huggingface.co/uklfr/gottbert-base
+     seq_len: 512
+     size: 127
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   gtr-t5-base:
+     link: https://huggingface.co/sentence-transformers/gtr-t5-base
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   gtr-t5-large:
+     link: https://huggingface.co/sentence-transformers/gtr-t5-large
+     seq_len: 512
+     size: 168
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   gtr-t5-xl:
+     link: https://huggingface.co/sentence-transformers/gtr-t5-xl
+     seq_len: 512
+     size: 1240
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   gtr-t5-xxl:
+     link: https://huggingface.co/sentence-transformers/gtr-t5-xxl
+     seq_len: 512
+     size: 4865
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   herbert-base-retrieval-v2:
+     link: https://huggingface.co/ipipan/herbert-base-retrieval-v2
+     seq_len: 514
+     size: 125
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   instructor-base:
+     link: https://huggingface.co/hkunlp/instructor-base
+     seq_len: N/A
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   instructor-xl:
+     link: https://huggingface.co/hkunlp/instructor-xl
+     seq_len: N/A
+     size: 1241
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   komninos:
+     link: https://huggingface.co/sentence-transformers/average_word_embeddings_komninos
+     seq_len: N/A
+     size: 134
+     dim: 300
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   llama-2-7b-chat:
+     link: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+     seq_len: 4096
+     size: 7000
+     dim: -1
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   luotuo-bert-medium:
+     link: https://huggingface.co/silk-road/luotuo-bert-medium
+     seq_len: 512
+     size: 328
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   m3e-base:
+     link: https://huggingface.co/moka-ai/m3e-base
+     seq_len: 512
+     size: 102
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   m3e-large:
+     link: https://huggingface.co/moka-ai/m3e-large
+     seq_len: 512
+     size: 102
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   mistral-7b-instruct-v0.2:
+     link: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
+     seq_len: 4096
+     size: 7240
+     dim: -1
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   mistral-embed:
+     link: https://docs.mistral.ai/guides/embeddings
+     seq_len: null
+     size: null
+     dim: 1024
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   monobert-large-msmarco:
+     link: https://huggingface.co/castorini/monobert-large-msmarco
+     seq_len: 512
+     size: 770
+     dim: -1
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   monot5-3b-msmarco-10k:
+     link: https://huggingface.co/castorini/monot5-3b-msmarco-10k
+     seq_len: 512
+     size: 2480
+     dim: -1
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   monot5-base-msmarco-10k:
+     link: https://huggingface.co/castorini/monot5-base-msmarco-10k
+     seq_len: 512
+     size: 220
+     dim: -1
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   msmarco-bert-co-condensor:
+     link: https://huggingface.co/sentence-transformers/msmarco-bert-co-condensor
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   multi-qa-MiniLM-L6-cos-v1:
+     link: https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1
+     seq_len: 512
+     size: 23
+     dim: 384
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   multilingual-e5-base:
+     link: https://huggingface.co/intfloat/multilingual-e5-base
+     seq_len: 514
+     size: 278
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   multilingual-e5-large:
+     link: https://huggingface.co/intfloat/multilingual-e5-large
+     seq_len: 514
+     size: 560
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   multilingual-e5-small:
+     link: https://huggingface.co/intfloat/multilingual-e5-small
+     seq_len: 512
+     size: 118
+     dim: 384
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   nb-bert-base:
+     link: https://huggingface.co/NbAiLab/nb-bert-base
+     seq_len: 512
+     size: 179
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   nb-bert-large:
+     link: https://huggingface.co/NbAiLab/nb-bert-large
+     seq_len: 512
+     size: 355
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   nomic-embed-text-v1.5-128:
+     link: https://huggingface.co/nomic-ai/nomic-embed-text-v1.5
+     seq_len: 8192
+     size: 138
+     dim: 128
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   nomic-embed-text-v1.5-256:
+     link: https://huggingface.co/nomic-ai/nomic-embed-text-v1.5
+     seq_len: 8192
+     size: 138
+     dim: 256
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   nomic-embed-text-v1.5-512:
+     link: https://huggingface.co/nomic-ai/nomic-embed-text-v1.5
+     seq_len: 8192
+     size: 138
+     dim: 512
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   nomic-embed-text-v1.5-64:
+     link: https://huggingface.co/nomic-ai/nomic-embed-text-v1.5
+     seq_len: 8192
+     size: 138
+     dim: 64
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   norbert3-base:
+     link: https://huggingface.co/ltg/norbert3-base
+     seq_len: 512
+     size: 131
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   norbert3-large:
+     link: https://huggingface.co/ltg/norbert3-large
+     seq_len: 512
+     size: 368
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   paraphrase-multilingual-MiniLM-L12-v2:
+     link: https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
+     seq_len: 512
+     size: 118
+     dim: 384
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   paraphrase-multilingual-mpnet-base-v2:
+     link: https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2
+     seq_len: 514
+     size: 278
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   sentence-bert-swedish-cased:
+     link: https://huggingface.co/KBLab/sentence-bert-swedish-cased
+     seq_len: 512
+     size: 125
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   sentence-camembert-base:
+     link: https://huggingface.co/dangvantuan/sentence-camembert-base
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   sentence-camembert-large:
+     link: https://huggingface.co/dangvantuan/sentence-camembert-large
+     seq_len: 512
+     size: 337
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   sentence-croissant-llm-base:
+     link: https://huggingface.co/Wissam42/sentence-croissant-llm-base
+     seq_len: 2048
+     size: 1280
+     dim: 2048
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   sentence-t5-base:
+     link: https://huggingface.co/sentence-transformers/sentence-t5-base
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   sentence-t5-large:
+     link: https://huggingface.co/sentence-transformers/sentence-t5-large
+     seq_len: 512
+     size: 168
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   sentence-t5-xl:
+     link: https://huggingface.co/sentence-transformers/sentence-t5-xl
+     seq_len: 512
+     size: 1240
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   sentence-t5-xxl:
+     link: https://huggingface.co/sentence-transformers/sentence-t5-xxl
+     seq_len: 512
+     size: 4865
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   silver-retriever-base-v1:
+     link: https://huggingface.co/ipipan/silver-retriever-base-v1
+     seq_len: 514
+     size: 125
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   st-polish-paraphrase-from-distilroberta:
+     link: https://huggingface.co/sdadas/st-polish-paraphrase-from-distilroberta
+     seq_len: 514
+     size: 125
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   st-polish-paraphrase-from-mpnet:
+     link: https://huggingface.co/sdadas/st-polish-paraphrase-from-mpnet
+     seq_len: 514
+     size: 125
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   sup-simcse-bert-base-uncased:
+     link: https://huggingface.co/princeton-nlp/sup-simcse-bert-base-uncased
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   text-embedding-3-large:
+     link: https://openai.com/blog/new-embedding-models-and-api-updates
+     seq_len: 8191
+     size: null
+     dim: 3072
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   text-embedding-3-large-256:
+     link: https://openai.com/blog/new-embedding-models-and-api-updates
+     seq_len: 8191
+     size: null
+     dim: 256
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   text-embedding-3-small:
+     link: https://openai.com/blog/new-embedding-models-and-api-updates
+     seq_len: 8191
+     size: null
+     dim: 1536
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   text-embedding-ada-002:
+     link: https://openai.com/blog/new-and-improved-embedding-model
+     seq_len: 8191
+     size: null
+     dim: 1536
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   text-search-ada-001:
+     link: https://openai.com/blog/introducing-text-and-code-embeddings
+     seq_len: 2046
+     size: null
+     dim: 1024
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   text-search-ada-doc-001:
+     link: https://openai.com/blog/introducing-text-and-code-embeddings
+     seq_len: 2046
+     size: null
+     dim: 1024
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   text-search-ada-query-001:
+     link: https://openai.com/blog/introducing-text-and-code-embeddings
+     seq_len: 2046
+     size: null
+     dim: 1024
+     is_external: false
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   text-search-babbage-001:
+     link: https://openai.com/blog/introducing-text-and-code-embeddings
+     seq_len: 2046
+     size: null
+     dim: 2048
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   text-search-curie-001:
+     link: https://openai.com/blog/introducing-text-and-code-embeddings
+     seq_len: 2046
+     size: null
+     dim: 4096
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   text-search-davinci-001:
+     link: https://openai.com/blog/introducing-text-and-code-embeddings
+     seq_len: 2046
+     size: null
+     dim: 12288
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   text-similarity-ada-001:
+     link: https://openai.com/blog/introducing-text-and-code-embeddings
+     seq_len: 2046
+     size: null
+     dim: 1024
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   text-similarity-babbage-001:
+     link: https://openai.com/blog/introducing-text-and-code-embeddings
+     seq_len: 2046
+     size: null
+     dim: 2048
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   text-similarity-curie-001:
+     link: https://openai.com/blog/introducing-text-and-code-embeddings
+     seq_len: 2046
+     size: null
+     dim: 4096
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   text-similarity-davinci-001:
+     link: https://openai.com/blog/introducing-text-and-code-embeddings
+     seq_len: 2046
+     size: null
+     dim: 12288
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   tart-dual-contriever-msmarco:
+     link: https://huggingface.co/orionweller/tart-dual-contriever-msmarco
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   tart-full-flan-t5-xl:
+     link: https://huggingface.co/facebook/tart-full-flan-t5-xl
+     seq_len: 512
+     size: 2480
+     dim: -1
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   text2vec-base-chinese:
+     link: https://huggingface.co/shibing624/text2vec-base-chinese
+     seq_len: 512
+     size: 102
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   text2vec-base-multilingual:
+     link: null
+     seq_len: null
+     size: null
+     dim: null
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   text2vec-large-chinese:
+     link: https://huggingface.co/GanymedeNil/text2vec-large-chinese
+     seq_len: 512
+     size: 326
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   titan-embed-text-v1:
+     link: https://docs.aws.amazon.com/bedrock/latest/userguide/embeddings.html
+     seq_len: 8000
+     size: null
+     dim: 1536
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   udever-bloom-1b1:
+     link: https://huggingface.co/izhx/udever-bloom-1b1
+     seq_len: 2048
+     size: null
+     dim: 1536
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   udever-bloom-560m:
+     link: https://huggingface.co/izhx/udever-bloom-560m
+     seq_len: 2048
+     size: null
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   universal-sentence-encoder-multilingual-3:
+     link: https://huggingface.co/vprelovac/universal-sentence-encoder-multilingual-3
+     seq_len: 512
+     size: null
+     dim: 512
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   universal-sentence-encoder-multilingual-large-3:
+     link: https://huggingface.co/vprelovac/universal-sentence-encoder-multilingual-large-3
+     seq_len: 512
+     size: null
+     dim: 512
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   unsup-simcse-bert-base-uncased:
+     link: https://huggingface.co/princeton-nlp/unsup-simcse-bert-base-uncased
+     seq_len: 512
+     size: 110
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   use-cmlm-multilingual:
+     link: https://huggingface.co/sentence-transformers/use-cmlm-multilingual
+     seq_len: 512
+     size: 472
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   voyage-2:
+     link: https://docs.voyageai.com/embeddings/
+     seq_len: 1024
+     size: null
+     dim: 1024
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   voyage-code-2:
+     link: https://docs.voyageai.com/embeddings/
+     seq_len: 16000
+     size: null
+     dim: 1536
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   voyage-large-2-instruct:
+     link: https://docs.voyageai.com/embeddings/
+     seq_len: 16000
+     size: null
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: false
+   voyage-law-2:
+     link: https://docs.voyageai.com/embeddings/
+     seq_len: 4000
+     size: null
+     dim: 1024
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   voyage-lite-01-instruct:
+     link: https://docs.voyageai.com/embeddings/
+     seq_len: 4000
+     size: null
+     dim: 1024
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   voyage-lite-02-instruct:
+     link: https://docs.voyageai.com/embeddings/
+     seq_len: 4000
+     size: 1220
+     dim: 1024
+     is_external: true
+     is_proprietary: true
+     is_sentence_transformers_compatible: false
+   xlm-roberta-base:
+     link: https://huggingface.co/xlm-roberta-base
+     seq_len: 514
+     size: 279
+     dim: 768
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+   xlm-roberta-large:
+     link: https://huggingface.co/xlm-roberta-large
+     seq_len: 514
+     size: 560
+     dim: 1024
+     is_external: true
+     is_proprietary: false
+     is_sentence_transformers_compatible: true
+ models_to_skip:
+   - michaelfeil/ct2fast-e5-large-v2
+   - McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-unsup-simcse
+   - newsrx/instructor-xl
+   - sionic-ai/sionic-ai-v1
+   - lsf1000/bge-evaluation
+   - Intel/bge-small-en-v1.5-sst2
+   - newsrx/instructor-xl-newsrx
+   - McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse
+   - McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-unsup-simcse
+   - davidpeer/gte-small
+   - goldenrooster/multilingual-e5-large
+   - kozistr/fused-large-en
+   - mixamrepijey/instructor-small
+   - McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-supervised
+   - DecisionOptimizationSystem/DeepFeatEmbeddingLargeContext
+   - Intel/bge-base-en-v1.5-sst2-int8-dynamic
+   - morgendigital/multilingual-e5-large-quantized
+   - BAAI/bge-small-en
+   - ggrn/e5-small-v2
+   - vectoriseai/gte-small
+   - giulio98/placeholder
+   - odunola/UAE-Large-VI
+   - vectoriseai/e5-large-v2
+   - gruber/e5-small-v2-ggml
+   - Severian/nomic
+   - arcdev/e5-mistral-7b-instruct
+   - mlx-community/multilingual-e5-base-mlx
+   - michaelfeil/ct2fast-bge-base-en-v1.5
+   - Intel/bge-small-en-v1.5-sst2-int8-static
+   - jncraton/stella-base-en-v2-ct2-int8
+   - vectoriseai/multilingual-e5-large
+   - rlsChapters/Chapters-SFR-Embedding-Mistral
+   - arcdev/SFR-Embedding-Mistral
+   - McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised
+   - McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised
+   - vectoriseai/gte-base
+   - mixamrepijey/instructor-models
+   - GovCompete/e5-large-v2
+   - ef-zulla/e5-multi-sml-torch
+   - khoa-klaytn/bge-small-en-v1.5-angle
+   - krilecy/e5-mistral-7b-instruct
+   - vectoriseai/bge-base-en-v1.5
+   - vectoriseai/instructor-base
+   - jingyeom/korean_embedding_model
+   - rizki/bgr-tf
+   - barisaydin/bge-base-en
+   - jamesgpt1/zzz
+   - Malmuk1/e5-large-v2_Sharded
+   - vectoriseai/ember-v1
+   - Consensus/instructor-base
+   - barisaydin/bge-small-en
+   - barisaydin/gte-base
+   - woody72/multilingual-e5-base
+   - Einas/einas_ashkar
+   - michaelfeil/ct2fast-bge-large-en-v1.5
+   - vectoriseai/bge-small-en-v1.5
+   - iampanda/Test
+   - cherubhao/yogamodel
+   - ieasybooks/multilingual-e5-large-onnx
+   - jncraton/e5-small-v2-ct2-int8
+   - radames/e5-large
+   - khoa-klaytn/bge-base-en-v1.5-angle
+   - Intel/bge-base-en-v1.5-sst2-int8-static
+   - vectoriseai/e5-large
+   - TitanML/jina-v2-base-en-embed
+   - Koat/gte-tiny
+   - binqiangliu/EmbeddingModlebgelargeENv1.5
+   - beademiguelperez/sentence-transformers-multilingual-e5-small
+   - sionic-ai/sionic-ai-v2
+   - jamesdborin/jina-v2-base-en-embed
+   - maiyad/multilingual-e5-small
+   - dmlls/all-mpnet-base-v2
+   - odunola/e5-base-v2
+   - vectoriseai/bge-large-en-v1.5
+   - vectoriseai/bge-small-en
+   - karrar-alwaili/UAE-Large-V1
+   - t12e/instructor-base
+   - Frazic/udever-bloom-3b-sentence
+   - Geolumina/instructor-xl
+   - hsikchi/dump
+   - recipe/embeddings
+   - michaelfeil/ct2fast-bge-small-en-v1.5
+   - ildodeltaRule/multilingual-e5-large
+   - shubham-bgi/UAE-Large
+   - BAAI/bge-large-en
+   - michaelfeil/ct2fast-e5-small-v2
+   - cgldo/semanticClone
+   - barisaydin/gte-small
+   - aident-ai/bge-base-en-onnx
+   - jamesgpt1/english-large-v1
+   - michaelfeil/ct2fast-e5-small
+   - baseplate/instructor-large-1
+   - newsrx/instructor-large
+   - Narsil/bge-base-en
+   - michaelfeil/ct2fast-e5-large
+   - mlx-community/multilingual-e5-small-mlx
+   - lightbird-ai/nomic
+   - MaziyarPanahi/GritLM-8x7B-GGUF
+   - newsrx/instructor-large-newsrx
+   - dhairya0907/thenlper-get-large
+   - barisaydin/bge-large-en
+   - jncraton/bge-small-en-ct2-int8
+   - retrainai/instructor-xl
+   - BAAI/bge-base-en
+   - gentlebowl/instructor-large-safetensors
+   - d0rj/e5-large-en-ru
+   - atian-chapters/Chapters-SFR-Embedding-Mistral
+   - Intel/bge-base-en-v1.5-sts-int8-static
+   - Intel/bge-base-en-v1.5-sts-int8-dynamic
+   - jncraton/GIST-small-Embedding-v0-ct2-int8
+   - jncraton/gte-tiny-ct2-int8
+   - d0rj/e5-small-en-ru
+   - vectoriseai/e5-small-v2
+   - SmartComponents/bge-micro-v2
+   - michaelfeil/ct2fast-gte-base
+   - vectoriseai/e5-base-v2
+   - Intel/bge-base-en-v1.5-sst2
+   - McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised
+   - Research2NLP/electrical_stella
+   - weakit-v/bge-base-en-v1.5-onnx
+   - GovCompete/instructor-xl
+   - barisaydin/text2vec-base-multilingual
+   - Intel/bge-small-en-v1.5-sst2-int8-dynamic
+   - jncraton/gte-small-ct2-int8
+   - d0rj/e5-base-en-ru
+   - barisaydin/gte-large
+   - fresha/e5-large-v2-endpoint
+   - vectoriseai/instructor-large
+   - Severian/embed
+   - vectoriseai/e5-base
+   - mlx-community/multilingual-e5-large-mlx
+   - vectoriseai/gte-large
+   - anttip/ct2fast-e5-small-v2-hfie
+   - michaelfeil/ct2fast-gte-large
+   - gizmo-ai/Cohere-embed-multilingual-v3.0
+   - McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-unsup-simcse
+   - Kenknight1999/tungdd7_ft_e5
+   - joteqwork/new_gsev0
+   - vantagediscovery/jina-embeddings-v2-base-en
+   - vantagediscovery/nomic-embed-text-v1
+   - vantagediscovery/nomic-embed-text-v1.5
+   - srikanthmalla/hkunlp-instructor-xl
+   - afrideva/GIST-all-MiniLM-L6-v2-GGUF
+ cross_encoders:
+   - FollowIR-7B
+   - flan-t5-base
+   - flan-t5-large
+   - monobert-large-msmarco
+   - monot5-3b-msmarco-10k
+   - monot5-base-msmarco-10k
+   - llama-2-7b-chat
+   - mistral-7b-instruct-v0.2
+   - tart-full-flan-t5-xl
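A short sketch of how this metadata can be consumed once loaded with PyYAML, for example to list the open, sentence-transformers-compatible models (field names as defined above; the filtering itself is only illustrative):

```python
from yaml import safe_load

with open("model_meta.yaml", encoding="utf-8") as f:
    meta = safe_load(f)

for name, info in meta["model_meta"].items():
    # Keep open models that can be loaded with sentence-transformers.
    if info.get("is_proprietary") or not info.get("is_sentence_transformers_compatible"):
        continue
    print(f"{name}: dim={info.get('dim')}, seq_len={info.get('seq_len')}")
```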
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio
+ datasets
+ pandas
+ huggingface_hub
+ tqdm
utils/__init__.py ADDED
File without changes
utils/model_size.py ADDED
@@ -0,0 +1,43 @@
+ import json
+
+ from huggingface_hub import hf_hub_download
+ from huggingface_hub.hf_api import ModelInfo, get_safetensors_metadata, get_hf_file_metadata, hf_hub_url
+
+ # Map model IDs to the number of bytes used for one parameter. So, 4 bytes for fp32, 2 bytes for fp16, etc.
+ # By default, we assume that the model is stored in fp32.
+ KNOWN_BYTES_PER_PARAM = {
+     "dwzhu/e5-base-4k": 2,
+ }
+
+ def get_model_parameters_memory(model_info: ModelInfo):
+     '''Get the size of the model in millions of parameters and its fp32 memory footprint in GB.'''
+     # Preferred path: read the exact parameter count from the safetensors metadata.
+     try:
+         safetensors = get_safetensors_metadata(model_info.id)
+         num_parameters = sum(safetensors.parameter_count.values())
+         return round(num_parameters / 1e6), round(num_parameters * 4 / 1024**3, 2)
+     except Exception:
+         pass
+
+     # Fallback: estimate from the size of the single PyTorch checkpoint file.
+     filenames = [sib.rfilename for sib in model_info.siblings]
+     if "pytorch_model.bin" in filenames:
+         url = hf_hub_url(model_info.id, filename="pytorch_model.bin")
+         meta = get_hf_file_metadata(url)
+         bytes_per_param = KNOWN_BYTES_PER_PARAM.get(model_info.id, 4)
+         num_params = round(meta.size / bytes_per_param / 1e6)
+         size_gb = round(meta.size * (4 / bytes_per_param) / 1024**3, 2)  # memory as if stored in fp32
+         return num_params, size_gb
+
+     # Sharded checkpoints expose the total size in the index file, e.g.:
+     # {"metadata": {"total_size": 28272820224}, ...}
+     if "pytorch_model.bin.index.json" in filenames:
+         index_path = hf_hub_download(model_info.id, filename="pytorch_model.bin.index.json")
+         with open(index_path) as f:
+             size = json.load(f)
+         bytes_per_param = KNOWN_BYTES_PER_PARAM.get(model_info.id, 4)
+         if ("metadata" in size) and ("total_size" in size["metadata"]):
+             return round(size["metadata"]["total_size"] / bytes_per_param / 1e6), round(size["metadata"]["total_size"] / 1024**3, 2)
+
+     return None, None
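A usage sketch for the helper above (requires network access to the Hugging Face Hub; the model ID is only an example):

```python
from huggingface_hub import HfApi

from utils.model_size import get_model_parameters_memory

info = HfApi().model_info("sentence-transformers/all-MiniLM-L6-v2")
num_params_m, size_gb = get_model_parameters_memory(info)
print(f"{num_params_m}M parameters, {size_gb} GB")  # roughly 23M params / 0.08 GB in fp32
```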