Spaces:
Running
Running
antoinelouis
commited on
Commit
•
ad1a271
1
Parent(s):
0bfb4f6
Update app.py
Browse files
app.py
CHANGED
@@ -7,7 +7,7 @@ DATASETS = [
|
|
7 |
"mMARCO-fr",
|
8 |
"BSARD",
|
9 |
]
|
10 |
-
|
11 |
"antoinelouis/biencoder-camemberta-base-mmarcoFR",
|
12 |
"antoinelouis/biencoder-camembert-base-mmarcoFR",
|
13 |
"antoinelouis/biencoder-distilcamembert-mmarcoFR",
|
@@ -22,15 +22,15 @@ DENSE_SINGLE_BIENCODERS = [
|
|
22 |
"OrdalieTech/Solon-embeddings-large-0.1",
|
23 |
"OrdalieTech/Solon-embeddings-base-0.1",
|
24 |
]
|
25 |
-
|
26 |
"antoinelouis/colbertv1-camembert-base-mmarcoFR",
|
27 |
"antoinelouis/colbertv2-camembert-L4-mmarcoFR",
|
28 |
"antoinelouis/colbert-xm",
|
29 |
]
|
30 |
-
|
31 |
"antoinelouis/spladev2-camembert-base-mmarcoFR",
|
32 |
]
|
33 |
-
|
34 |
"antoinelouis/crossencoder-camemberta-L2-mmarcoFR",
|
35 |
"antoinelouis/crossencoder-camemberta-L4-mmarcoFR",
|
36 |
"antoinelouis/crossencoder-camemberta-L6-mmarcoFR",
|
@@ -57,7 +57,6 @@ CROSS_ENCODERS = [
|
|
57 |
"antoinelouis/crossencoder-mMiniLMv2-L12-mmarcoFR",
|
58 |
"antoinelouis/crossencoder-mMiniLMv2-L6-mmarcoFR",
|
59 |
]
|
60 |
-
LLMS = []
|
61 |
COLUMNS = {
|
62 |
"Model": "html",
|
63 |
"#Params (M)": "number",
|
@@ -81,7 +80,7 @@ def get_model_info(model_id: str, model_type: str) -> pd.DataFrame:
|
|
81 |
if result.dataset_name in DATASETS and result.dataset_name not in data:
|
82 |
data[result.dataset_name] = {key: None for key in COLUMNS.keys()}
|
83 |
data[result.dataset_name]["Model"] = f'<a href="https://huggingface.co/{model_id}" target="_blank" style="color: blue; text-decoration: none;">{model_id}</a>'
|
84 |
-
data[result.dataset_name]["#Params (M)"] = round(model_info.safetensors.total/1e6) if model_info.safetensors else None
|
85 |
data[result.dataset_name]["Type"] = model_type
|
86 |
data[result.dataset_name]["Dataset"] = result.dataset_name
|
87 |
|
@@ -91,17 +90,24 @@ def get_model_info(model_id: str, model_type: str) -> pd.DataFrame:
|
|
91 |
return pd.DataFrame(list(data.values()))
|
92 |
|
93 |
def load_all_results() -> pd.DataFrame:
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
return df
|
106 |
|
107 |
def filter_dataf_by_dataset(dataf: pd.DataFrame, dataset_name: str, sort_by: str) -> pd.DataFrame:
|
@@ -111,35 +117,24 @@ def filter_dataf_by_dataset(dataf: pd.DataFrame, dataset_name: str, sort_by: str
|
|
111 |
.sort_values(by=sort_by, ascending=False)
|
112 |
)
|
113 |
|
114 |
-
|
115 |
def update_table(dataf: pd.DataFrame, query: str, selected_types: list, selected_sizes: list) -> pd.DataFrame:
|
116 |
filtered_df = dataf.copy()
|
117 |
-
conditions = []
|
118 |
|
119 |
-
|
120 |
-
|
121 |
-
conditions.append((filtered_df['Type'] == 'DSVBE'))
|
122 |
-
elif val == 'Dense multi-vector bi-encoder (DMVBE)':
|
123 |
-
conditions.append((filtered_df['Type'] == 'DMVBE'))
|
124 |
-
elif val == 'Sparse single-vector bi-encoder (SSVBE)':
|
125 |
-
conditions.append((filtered_df['Type'] == 'SSVBE'))
|
126 |
-
elif val == 'Cross-encoder (CE)':
|
127 |
-
conditions.append((filtered_df['Type'] == 'CE'))
|
128 |
-
elif val == 'LLM':
|
129 |
-
conditions.append((filtered_df['Type'] == 'LLM'))
|
130 |
|
|
|
131 |
for val in selected_sizes:
|
132 |
if val == 'Small (< 100M)':
|
133 |
-
|
134 |
elif val == 'Base (100M-300M)':
|
135 |
-
|
136 |
elif val == 'Large (300M-500M)':
|
137 |
-
|
138 |
elif val == 'Extra-large (500M+)':
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
filtered_df = filtered_df[pd.concat(conditions, axis=1).any(axis=1)]
|
143 |
|
144 |
if query:
|
145 |
filtered_df = filtered_df[filtered_df['Model'].str.contains(query, case=False)]
|
@@ -171,11 +166,10 @@ with gr.Blocks() as demo:
|
|
171 |
filter_type = gr.CheckboxGroup(
|
172 |
label="Model type",
|
173 |
choices=[
|
174 |
-
'
|
175 |
-
'
|
176 |
-
'Sparse
|
177 |
-
'Cross-encoder (
|
178 |
-
'LLM',
|
179 |
],
|
180 |
value=[],
|
181 |
interactive=True,
|
@@ -220,41 +214,11 @@ with gr.Blocks() as demo:
|
|
220 |
# elem_classes="text-sm",
|
221 |
# )
|
222 |
|
223 |
-
# Update tables on
|
224 |
-
search_bar
|
225 |
-
|
226 |
-
inputs=[
|
227 |
-
outputs=
|
228 |
-
)
|
229 |
-
# search_bar.change(
|
230 |
-
# fn=lambda x: update_table(dataf=bsard_df, query=x, selected_types=filter_type.value, selected_sizes=filter_size.value),
|
231 |
-
# inputs=[search_bar],
|
232 |
-
# outputs=bsard_table,
|
233 |
-
# )
|
234 |
-
|
235 |
-
# Update tables on model type filter.
|
236 |
-
filter_type.change(
|
237 |
-
fn=lambda selected_types: update_table(mmarco_df, search_bar.value, selected_types, filter_size.value),
|
238 |
-
inputs=[filter_type],
|
239 |
-
outputs=mmarco_table,
|
240 |
-
)
|
241 |
-
# filter_type.change(
|
242 |
-
# fn=lambda selected_types: update_table(bsard_df, search_bar.value, selected_types, filter_size.value),
|
243 |
-
# inputs=[filter_type],
|
244 |
-
# outputs=bsard_table,
|
245 |
-
# )
|
246 |
-
|
247 |
-
# Update tables on model size filter.
|
248 |
-
filter_size.change(
|
249 |
-
fn=lambda selected_sizes: update_table(mmarco_df, search_bar.value, filter_type.value, selected_sizes),
|
250 |
-
inputs=[filter_size],
|
251 |
-
outputs=mmarco_table,
|
252 |
-
)
|
253 |
-
# filter_size.change(
|
254 |
-
# fn=lambda selected_sizes: update_table(bsard_df, search_bar.value, filter_type.value, selected_sizes),
|
255 |
-
# inputs=[filter_size],
|
256 |
-
# outputs=bsard_table,
|
257 |
-
# )
|
258 |
|
259 |
# Citation
|
260 |
with gr.Column():
|
|
|
7 |
"mMARCO-fr",
|
8 |
"BSARD",
|
9 |
]
|
10 |
+
SINGLE_VECTOR_MODELS = [
|
11 |
"antoinelouis/biencoder-camemberta-base-mmarcoFR",
|
12 |
"antoinelouis/biencoder-camembert-base-mmarcoFR",
|
13 |
"antoinelouis/biencoder-distilcamembert-mmarcoFR",
|
|
|
22 |
"OrdalieTech/Solon-embeddings-large-0.1",
|
23 |
"OrdalieTech/Solon-embeddings-base-0.1",
|
24 |
]
|
25 |
+
MULTI_VECTOR_MODELS = [
|
26 |
"antoinelouis/colbertv1-camembert-base-mmarcoFR",
|
27 |
"antoinelouis/colbertv2-camembert-L4-mmarcoFR",
|
28 |
"antoinelouis/colbert-xm",
|
29 |
]
|
30 |
+
SPARSE_LEXICAL_MODELS = [
|
31 |
"antoinelouis/spladev2-camembert-base-mmarcoFR",
|
32 |
]
|
33 |
+
CROSS_ENCODER_MODELS = [
|
34 |
"antoinelouis/crossencoder-camemberta-L2-mmarcoFR",
|
35 |
"antoinelouis/crossencoder-camemberta-L4-mmarcoFR",
|
36 |
"antoinelouis/crossencoder-camemberta-L6-mmarcoFR",
|
|
|
57 |
"antoinelouis/crossencoder-mMiniLMv2-L12-mmarcoFR",
|
58 |
"antoinelouis/crossencoder-mMiniLMv2-L6-mmarcoFR",
|
59 |
]
|
|
|
60 |
COLUMNS = {
|
61 |
"Model": "html",
|
62 |
"#Params (M)": "number",
|
|
|
80 |
if result.dataset_name in DATASETS and result.dataset_name not in data:
|
81 |
data[result.dataset_name] = {key: None for key in COLUMNS.keys()}
|
82 |
data[result.dataset_name]["Model"] = f'<a href="https://huggingface.co/{model_id}" target="_blank" style="color: blue; text-decoration: none;">{model_id}</a>'
|
83 |
+
data[result.dataset_name]["#Params (M)"] = round(model_info.safetensors.total/1e6, 0) if model_info.safetensors else None
|
84 |
data[result.dataset_name]["Type"] = model_type
|
85 |
data[result.dataset_name]["Dataset"] = result.dataset_name
|
86 |
|
|
|
90 |
return pd.DataFrame(list(data.values()))
|
91 |
|
92 |
def load_all_results() -> pd.DataFrame:
|
93 |
+
# Load results from external baseline models.
|
94 |
+
df = pd.read_csv('./baselines.csv')
|
95 |
+
|
96 |
+
# Load results from own Hugging Face models.
|
97 |
+
for model_id in SINGLE_VECTOR_MODELS:
|
98 |
+
df = pd.concat([df, get_model_info(model_id, model_type="SINGLE")])
|
99 |
+
for model_id in MULTI_VECTOR_MODELS:
|
100 |
+
df = pd.concat([df, get_model_info(model_id, model_type="MULTI")])
|
101 |
+
for model_id in SPARSE_LEXICAL_MODELS:
|
102 |
+
df = pd.concat([df, get_model_info(model_id, model_type="SPARSE")])
|
103 |
+
for model_id in CROSS_ENCODER_MODELS:
|
104 |
+
df = pd.concat([df, get_model_info(model_id, model_type="CROSS")])
|
105 |
+
|
106 |
+
# Round all metrics to 1 decimal.
|
107 |
+
for col in df.columns:
|
108 |
+
if "Recall" in col or "MRR" in col or "nDCG" in col or "MAP" in col:
|
109 |
+
df[col] = df[col].round(1)
|
110 |
+
|
111 |
return df
|
112 |
|
113 |
def filter_dataf_by_dataset(dataf: pd.DataFrame, dataset_name: str, sort_by: str) -> pd.DataFrame:
|
|
|
117 |
.sort_values(by=sort_by, ascending=False)
|
118 |
)
|
119 |
|
|
|
120 |
def update_table(dataf: pd.DataFrame, query: str, selected_types: list, selected_sizes: list) -> pd.DataFrame:
|
121 |
filtered_df = dataf.copy()
|
|
|
122 |
|
123 |
+
if selected_types:
|
124 |
+
filtered_df = filtered_df[filtered_df['Type'].isin([t.split()[-1][1:-1] for t in selected_types])]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
+
size_conditions = []
|
127 |
for val in selected_sizes:
|
128 |
if val == 'Small (< 100M)':
|
129 |
+
size_conditions.append(filtered_df['#Params (M)'] < 100)
|
130 |
elif val == 'Base (100M-300M)':
|
131 |
+
size_conditions.append((filtered_df['#Params (M)'] >= 100) & (filtered_df['#Params (M)'] <= 300))
|
132 |
elif val == 'Large (300M-500M)':
|
133 |
+
size_conditions.append((filtered_df['#Params (M)'] >= 300) & (filtered_df['#Params (M)'] <= 500))
|
134 |
elif val == 'Extra-large (500M+)':
|
135 |
+
size_conditions.append(filtered_df['#Params (M)'] > 500)
|
136 |
+
if size_conditions:
|
137 |
+
filtered_df = filtered_df[pd.concat(size_conditions, axis=1).any(axis=1)]
|
|
|
138 |
|
139 |
if query:
|
140 |
filtered_df = filtered_df[filtered_df['Model'].str.contains(query, case=False)]
|
|
|
166 |
filter_type = gr.CheckboxGroup(
|
167 |
label="Model type",
|
168 |
choices=[
|
169 |
+
'Single-vector dense bi-encoder (SINGLE)',
|
170 |
+
'Multi-vector dense bi-encoder (MULTI)',
|
171 |
+
'Sparse lexical model (SPARSE)',
|
172 |
+
'Cross-encoder (CROSS)',
|
|
|
173 |
],
|
174 |
value=[],
|
175 |
interactive=True,
|
|
|
214 |
# elem_classes="text-sm",
|
215 |
# )
|
216 |
|
217 |
+
# Update tables on filter widgets change.
|
218 |
+
widgets = [search_bar, filter_type, filter_size]
|
219 |
+
for w in widgets:
|
220 |
+
w.change(fn=lambda q, t, s: update_table(dataf=mmarco_df, query=q, selected_types=t, selected_sizes=s), inputs=widgets, outputs=[mmarco_table])
|
221 |
+
#w.change(fn=lambda q, t, s: update_table(dataf=bsard_df, query=q, selected_types=t, selected_sizes=s), inputs=widgets, outputs=[bsard_table])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
|
223 |
# Citation
|
224 |
with gr.Column():
|