Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Tom Aarsen
committed on
Commit
·
6c6aac5
1
Parent(s):
cfacdee
Add Sentence Transformers model type option
Browse files
app.py
CHANGED
|
@@ -1003,6 +1003,104 @@ MODELS_TO_SKIP = {
|
|
| 1003 |
"Koat/gte-tiny",
|
| 1004 |
}
|
| 1005 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1006 |
def add_lang(examples):
|
| 1007 |
if not(examples["eval_language"]):
|
| 1008 |
examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
|
|
@@ -1170,6 +1268,8 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
| 1170 |
except:
|
| 1171 |
pass
|
| 1172 |
df_list.append(out)
|
|
|
|
|
|
|
| 1173 |
df = pd.DataFrame(df_list)
|
| 1174 |
# If there are any models that are the same, merge them
|
| 1175 |
# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
|
|
@@ -1863,22 +1963,21 @@ def update_url_language(event: gr.SelectData, current_task_language: dict, langu
|
|
| 1863 |
|
| 1864 |
NUMERIC_INTERVALS = {
|
| 1865 |
"<100M": pd.Interval(0, 100, closed="right"),
|
| 1866 |
-
"
|
| 1867 |
-
"
|
| 1868 |
-
"
|
| 1869 |
">1B": pd.Interval(1000, 1_000_000, closed="right"),
|
| 1870 |
}
|
| 1871 |
|
| 1872 |
MODEL_TYPES = [
|
| 1873 |
"Open",
|
| 1874 |
"Proprietary",
|
|
|
|
| 1875 |
]
|
| 1876 |
|
| 1877 |
def filter_data(search_query, model_types, model_sizes, *full_dataframes):
|
| 1878 |
output_dataframes = []
|
| 1879 |
for df in full_dataframes:
|
| 1880 |
-
# df = pd.DataFrame(data=dataframe.value["data"], columns=dataframe.value["headers"])
|
| 1881 |
-
|
| 1882 |
# Apply the search query
|
| 1883 |
if search_query:
|
| 1884 |
names = df["Model"].map(lambda x: re.match("<a .+?>(.+)</a>", x).group(1))
|
|
@@ -1895,7 +1994,12 @@ def filter_data(search_query, model_types, model_sizes, *full_dataframes):
|
|
| 1895 |
masks.append(df["Model Size (Million Parameters)"] != "")
|
| 1896 |
elif model_type == "Proprietary":
|
| 1897 |
masks.append(df["Model Size (Million Parameters)"] == "")
|
| 1898 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1899 |
|
| 1900 |
# Apply the model size filtering
|
| 1901 |
if model_sizes != list(NUMERIC_INTERVALS.keys()):
|
|
@@ -1920,8 +2024,8 @@ with gr.Blocks(css=css) as block:
|
|
| 1920 |
|
| 1921 |
with gr.Row():
|
| 1922 |
search_bar = gr.Textbox(
|
| 1923 |
-
label="Search Bar",
|
| 1924 |
-
placeholder=" 🔍 Search for
|
| 1925 |
)
|
| 1926 |
filter_model_type = gr.CheckboxGroup(
|
| 1927 |
label="Model types",
|
|
@@ -1935,7 +2039,8 @@ with gr.Blocks(css=css) as block:
|
|
| 1935 |
choices=list(NUMERIC_INTERVALS.keys()),
|
| 1936 |
value=list(NUMERIC_INTERVALS.keys()),
|
| 1937 |
interactive=True,
|
| 1938 |
-
elem_classes=["filter-checkbox-group"]
|
|
|
|
| 1939 |
)
|
| 1940 |
|
| 1941 |
with gr.Tabs() as outer_tabs:
|
|
|
|
| 1003 |
"Koat/gte-tiny",
|
| 1004 |
}
|
| 1005 |
|
| 1006 |
+
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {
|
| 1007 |
+
"allenai-specter",
|
| 1008 |
+
"allenai-specter",
|
| 1009 |
+
"all-MiniLM-L12-v2",
|
| 1010 |
+
"all-MiniLM-L6-v2",
|
| 1011 |
+
"all-mpnet-base-v2",
|
| 1012 |
+
"bert-base-10lang-cased",
|
| 1013 |
+
"bert-base-15lang-cased",
|
| 1014 |
+
"bert-base-25lang-cased",
|
| 1015 |
+
"bert-base-multilingual-cased",
|
| 1016 |
+
"bert-base-multilingual-uncased",
|
| 1017 |
+
"bert-base-swedish-cased",
|
| 1018 |
+
"bert-base-uncased",
|
| 1019 |
+
"bge-base-zh-v1.5",
|
| 1020 |
+
"bge-large-zh-v1.5",
|
| 1021 |
+
"bge-large-zh-noinstruct",
|
| 1022 |
+
"bge-small-zh-v1.5",
|
| 1023 |
+
"camembert-base",
|
| 1024 |
+
"camembert-large",
|
| 1025 |
+
"contriever-base-msmarco",
|
| 1026 |
+
"cross-en-de-roberta-sentence-transformer",
|
| 1027 |
+
"DanskBERT",
|
| 1028 |
+
"distilbert-base-25lang-cased",
|
| 1029 |
+
"distilbert-base-en-fr-cased",
|
| 1030 |
+
"distilbert-base-en-fr-es-pt-it-cased",
|
| 1031 |
+
"distilbert-base-fr-cased",
|
| 1032 |
+
"distilbert-base-uncased",
|
| 1033 |
+
"distiluse-base-multilingual-cased-v2",
|
| 1034 |
+
"dfm-encoder-large-v1",
|
| 1035 |
+
"dfm-sentence-encoder-large-1",
|
| 1036 |
+
"e5-base",
|
| 1037 |
+
"e5-large",
|
| 1038 |
+
"e5-mistral-7b-instruct",
|
| 1039 |
+
"e5-small",
|
| 1040 |
+
"electra-small-nordic",
|
| 1041 |
+
"electra-small-swedish-cased-discriminator",
|
| 1042 |
+
"flaubert_base_cased",
|
| 1043 |
+
"flaubert_base_uncased",
|
| 1044 |
+
"flaubert_large_cased",
|
| 1045 |
+
"gbert-base",
|
| 1046 |
+
"gbert-large",
|
| 1047 |
+
"gelectra-base",
|
| 1048 |
+
"gelectra-large",
|
| 1049 |
+
"glove.6B.300d",
|
| 1050 |
+
"gottbert-base",
|
| 1051 |
+
"gtr-t5-base",
|
| 1052 |
+
"gtr-t5-large",
|
| 1053 |
+
"gtr-t5-xl",
|
| 1054 |
+
"gtr-t5-xxl",
|
| 1055 |
+
"herbert-base-retrieval-v2",
|
| 1056 |
+
"komninos",
|
| 1057 |
+
"luotuo-bert-medium",
|
| 1058 |
+
"LaBSE",
|
| 1059 |
+
"m3e-base",
|
| 1060 |
+
"m3e-large",
|
| 1061 |
+
"msmarco-bert-co-condensor",
|
| 1062 |
+
"multi-qa-MiniLM-L6-cos-v1",
|
| 1063 |
+
"multilingual-e5-base",
|
| 1064 |
+
"multilingual-e5-large",
|
| 1065 |
+
"multilingual-e5-small",
|
| 1066 |
+
"nb-bert-base",
|
| 1067 |
+
"nb-bert-large",
|
| 1068 |
+
"nomic-embed-text-v1.5-64",
|
| 1069 |
+
"nomic-embed-text-v1.5-128",
|
| 1070 |
+
"nomic-embed-text-v1.5-256",
|
| 1071 |
+
"nomic-embed-text-v1.5-512",
|
| 1072 |
+
"norbert3-base",
|
| 1073 |
+
"norbert3-large",
|
| 1074 |
+
"paraphrase-multilingual-mpnet-base-v2",
|
| 1075 |
+
"paraphrase-multilingual-MiniLM-L12-v2",
|
| 1076 |
+
"sentence-camembert-base",
|
| 1077 |
+
"sentence-camembert-large",
|
| 1078 |
+
"sentence-croissant-llm-base",
|
| 1079 |
+
"sentence-bert-swedish-cased",
|
| 1080 |
+
"sentence-t5-base",
|
| 1081 |
+
"sentence-t5-large",
|
| 1082 |
+
"sentence-t5-xl",
|
| 1083 |
+
"sentence-t5-xxl",
|
| 1084 |
+
"silver-retriever-base-v1",
|
| 1085 |
+
"sup-simcse-bert-base-uncased",
|
| 1086 |
+
"st-polish-paraphrase-from-distilroberta",
|
| 1087 |
+
"st-polish-paraphrase-from-mpnet",
|
| 1088 |
+
"text2vec-base-chinese",
|
| 1089 |
+
"text2vec-large-chinese",
|
| 1090 |
+
"udever-bloom-1b1",
|
| 1091 |
+
"udever-bloom-560m",
|
| 1092 |
+
"universal-sentence-encoder-multilingual-3",
|
| 1093 |
+
"universal-sentence-encoder-multilingual-large-3",
|
| 1094 |
+
"unsup-simcse-bert-base-uncased",
|
| 1095 |
+
"use-cmlm-multilingual",
|
| 1096 |
+
"xlm-roberta-base",
|
| 1097 |
+
"xlm-roberta-large",
|
| 1098 |
+
}
|
| 1099 |
+
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {
|
| 1100 |
+
make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, "https://huggingface.co/spaces/mteb/leaderboard"))
|
| 1101 |
+
for model in SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS
|
| 1102 |
+
}
|
| 1103 |
+
|
| 1104 |
def add_lang(examples):
|
| 1105 |
if not(examples["eval_language"]):
|
| 1106 |
examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
|
|
|
|
| 1268 |
except:
|
| 1269 |
pass
|
| 1270 |
df_list.append(out)
|
| 1271 |
+
if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
|
| 1272 |
+
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
|
| 1273 |
df = pd.DataFrame(df_list)
|
| 1274 |
# If there are any models that are the same, merge them
|
| 1275 |
# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
|
|
|
|
| 1963 |
|
| 1964 |
NUMERIC_INTERVALS = {
|
| 1965 |
"<100M": pd.Interval(0, 100, closed="right"),
|
| 1966 |
+
"100M to 250M": pd.Interval(100, 250, closed="right"),
|
| 1967 |
+
"250M to 500M": pd.Interval(250, 500, closed="right"),
|
| 1968 |
+
"500M to 1B": pd.Interval(500, 1000, closed="right"),
|
| 1969 |
">1B": pd.Interval(1000, 1_000_000, closed="right"),
|
| 1970 |
}
|
| 1971 |
|
| 1972 |
MODEL_TYPES = [
|
| 1973 |
"Open",
|
| 1974 |
"Proprietary",
|
| 1975 |
+
"Sentence Transformers",
|
| 1976 |
]
|
| 1977 |
|
| 1978 |
def filter_data(search_query, model_types, model_sizes, *full_dataframes):
|
| 1979 |
output_dataframes = []
|
| 1980 |
for df in full_dataframes:
|
|
|
|
|
|
|
| 1981 |
# Apply the search query
|
| 1982 |
if search_query:
|
| 1983 |
names = df["Model"].map(lambda x: re.match("<a .+?>(.+)</a>", x).group(1))
|
|
|
|
| 1994 |
masks.append(df["Model Size (Million Parameters)"] != "")
|
| 1995 |
elif model_type == "Proprietary":
|
| 1996 |
masks.append(df["Model Size (Million Parameters)"] == "")
|
| 1997 |
+
elif model_type == "Sentence Transformers":
|
| 1998 |
+
masks.append(df["Model"].isin(SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS))
|
| 1999 |
+
if masks:
|
| 2000 |
+
df = df[reduce(lambda a, b: a | b, masks)]
|
| 2001 |
+
else:
|
| 2002 |
+
df = pd.DataFrame(columns=df.columns)
|
| 2003 |
|
| 2004 |
# Apply the model size filtering
|
| 2005 |
if model_sizes != list(NUMERIC_INTERVALS.keys()):
|
|
|
|
| 2024 |
|
| 2025 |
with gr.Row():
|
| 2026 |
search_bar = gr.Textbox(
|
| 2027 |
+
label="Search Bar (separate multiple queries with `;`)",
|
| 2028 |
+
placeholder=" 🔍 Search for a model and press enter...",
|
| 2029 |
)
|
| 2030 |
filter_model_type = gr.CheckboxGroup(
|
| 2031 |
label="Model types",
|
|
|
|
| 2039 |
choices=list(NUMERIC_INTERVALS.keys()),
|
| 2040 |
value=list(NUMERIC_INTERVALS.keys()),
|
| 2041 |
interactive=True,
|
| 2042 |
+
elem_classes=["filter-checkbox-group"],
|
| 2043 |
+
scale=2,
|
| 2044 |
)
|
| 2045 |
|
| 2046 |
with gr.Tabs() as outer_tabs:
|