antoinelouis committed on
Commit ad1a271
1 Parent(s): 0bfb4f6

Update app.py

Files changed (1): app.py (+41 -77)
app.py CHANGED
@@ -7,7 +7,7 @@ DATASETS = [
     "mMARCO-fr",
     "BSARD",
 ]
-DENSE_SINGLE_BIENCODERS = [
+SINGLE_VECTOR_MODELS = [
     "antoinelouis/biencoder-camemberta-base-mmarcoFR",
     "antoinelouis/biencoder-camembert-base-mmarcoFR",
     "antoinelouis/biencoder-distilcamembert-mmarcoFR",
@@ -22,15 +22,15 @@ DENSE_SINGLE_BIENCODERS = [
     "OrdalieTech/Solon-embeddings-large-0.1",
     "OrdalieTech/Solon-embeddings-base-0.1",
 ]
-DENSE_MULTI_BIENCODERS = [
+MULTI_VECTOR_MODELS = [
     "antoinelouis/colbertv1-camembert-base-mmarcoFR",
     "antoinelouis/colbertv2-camembert-L4-mmarcoFR",
     "antoinelouis/colbert-xm",
 ]
-SPARSE_SINGLE_BIENCODERS = [
+SPARSE_LEXICAL_MODELS = [
     "antoinelouis/spladev2-camembert-base-mmarcoFR",
 ]
-CROSS_ENCODERS = [
+CROSS_ENCODER_MODELS = [
     "antoinelouis/crossencoder-camemberta-L2-mmarcoFR",
     "antoinelouis/crossencoder-camemberta-L4-mmarcoFR",
     "antoinelouis/crossencoder-camemberta-L6-mmarcoFR",
@@ -57,7 +57,6 @@ CROSS_ENCODERS = [
     "antoinelouis/crossencoder-mMiniLMv2-L12-mmarcoFR",
     "antoinelouis/crossencoder-mMiniLMv2-L6-mmarcoFR",
 ]
-LLMS = []
 COLUMNS = {
     "Model": "html",
     "#Params (M)": "number",
@@ -81,7 +80,7 @@ def get_model_info(model_id: str, model_type: str) -> pd.DataFrame:
         if result.dataset_name in DATASETS and result.dataset_name not in data:
             data[result.dataset_name] = {key: None for key in COLUMNS.keys()}
             data[result.dataset_name]["Model"] = f'<a href="https://huggingface.co/{model_id}" target="_blank" style="color: blue; text-decoration: none;">{model_id}</a>'
-            data[result.dataset_name]["#Params (M)"] = round(model_info.safetensors.total/1e6) if model_info.safetensors else None
+            data[result.dataset_name]["#Params (M)"] = round(model_info.safetensors.total/1e6, 0) if model_info.safetensors else None
             data[result.dataset_name]["Type"] = model_type
             data[result.dataset_name]["Dataset"] = result.dataset_name
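The only functional change in this hunk is the explicit ndigits argument. As a standalone illustration (not part of app.py), Python's round() returns an int without ndigits and a float with it, so the "#Params (M)" column now holds values like 110.0 rather than 110:

total = 110_300_000           # hypothetical safetensors parameter count
print(round(total / 1e6))     # 110   (int)
print(round(total / 1e6, 0))  # 110.0 (float)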
 
@@ -91,17 +90,24 @@ def get_model_info(model_id: str, model_type: str) -> pd.DataFrame:
     return pd.DataFrame(list(data.values()))
 
 def load_all_results() -> pd.DataFrame:
-    df = pd.DataFrame()
-    for model_id in DENSE_SINGLE_BIENCODERS:
-        df = pd.concat([df, get_model_info(model_id, model_type="DSVBE")])
-    for model_id in DENSE_MULTI_BIENCODERS:
-        df = pd.concat([df, get_model_info(model_id, model_type="DMVBE")])
-    for model_id in SPARSE_SINGLE_BIENCODERS:
-        df = pd.concat([df, get_model_info(model_id, model_type="SSVBE")])
-    for model_id in CROSS_ENCODERS:
-        df = pd.concat([df, get_model_info(model_id, model_type="CE")])
-    for model_id in LLMS:
-        df = pd.concat([df, get_model_info(model_id, model_type="LLM")])
+    # Load results from external baseline models.
+    df = pd.read_csv('./baselines.csv')
+
+    # Load results from own Hugging Face models.
+    for model_id in SINGLE_VECTOR_MODELS:
+        df = pd.concat([df, get_model_info(model_id, model_type="SINGLE")])
+    for model_id in MULTI_VECTOR_MODELS:
+        df = pd.concat([df, get_model_info(model_id, model_type="MULTI")])
+    for model_id in SPARSE_LEXICAL_MODELS:
+        df = pd.concat([df, get_model_info(model_id, model_type="SPARSE")])
+    for model_id in CROSS_ENCODER_MODELS:
+        df = pd.concat([df, get_model_info(model_id, model_type="CROSS")])
+
+    # Round all metrics to 1 decimal.
+    for col in df.columns:
+        if "Recall" in col or "MRR" in col or "nDCG" in col or "MAP" in col:
+            df[col] = df[col].round(1)
+
     return df
 
 def filter_dataf_by_dataset(dataf: pd.DataFrame, dataset_name: str, sort_by: str) -> pd.DataFrame:
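The rewritten load_all_results() assumes a baselines.csv file whose columns match the leaderboard schema; pd.concat then aligns the baseline rows with the frames returned by get_model_info() on column names, filling any column missing on one side with NaN. A standalone sketch of that alignment behaviour (toy values only, not real results):

import pandas as pd

baselines = pd.DataFrame({"Model": ["baseline-a"], "Type": ["SPARSE"]})
own = pd.DataFrame({"Model": ["model-b"], "Type": ["SINGLE"], "#Params (M)": [110]})
print(pd.concat([baselines, own]))  # the baseline row gets NaN in "#Params (M)"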
@@ -111,35 +117,24 @@ def filter_dataf_by_dataset(dataf: pd.DataFrame, dataset_name: str, sort_by: str
         .sort_values(by=sort_by, ascending=False)
     )
 
-
 def update_table(dataf: pd.DataFrame, query: str, selected_types: list, selected_sizes: list) -> pd.DataFrame:
     filtered_df = dataf.copy()
-    conditions = []
 
-    for val in selected_types:
-        if val == 'Dense single-vector bi-encoder (DSVBE)':
-            conditions.append((filtered_df['Type'] == 'DSVBE'))
-        elif val == 'Dense multi-vector bi-encoder (DMVBE)':
-            conditions.append((filtered_df['Type'] == 'DMVBE'))
-        elif val == 'Sparse single-vector bi-encoder (SSVBE)':
-            conditions.append((filtered_df['Type'] == 'SSVBE'))
-        elif val == 'Cross-encoder (CE)':
-            conditions.append((filtered_df['Type'] == 'CE'))
-        elif val == 'LLM':
-            conditions.append((filtered_df['Type'] == 'LLM'))
+    if selected_types:
+        filtered_df = filtered_df[filtered_df['Type'].isin([t.split()[-1][1:-1] for t in selected_types])]
 
+    size_conditions = []
     for val in selected_sizes:
         if val == 'Small (< 100M)':
-            conditions.append((filtered_df['#Params (M)'] < 100))
+            size_conditions.append(filtered_df['#Params (M)'] < 100)
         elif val == 'Base (100M-300M)':
-            conditions.append((filtered_df['#Params (M)'] >= 100) & (filtered_df['#Params (M)'] <= 300))
+            size_conditions.append((filtered_df['#Params (M)'] >= 100) & (filtered_df['#Params (M)'] <= 300))
         elif val == 'Large (300M-500M)':
-            conditions.append((filtered_df['#Params (M)'] >= 300) & (filtered_df['#Params (M)'] <= 500))
+            size_conditions.append((filtered_df['#Params (M)'] >= 300) & (filtered_df['#Params (M)'] <= 500))
         elif val == 'Extra-large (500M+)':
-            conditions.append((filtered_df['#Params (M)'] > 500))
-
-    if conditions:
-        filtered_df = filtered_df[pd.concat(conditions, axis=1).any(axis=1)]
+            size_conditions.append(filtered_df['#Params (M)'] > 500)
+    if size_conditions:
+        filtered_df = filtered_df[pd.concat(size_conditions, axis=1).any(axis=1)]
 
     if query:
         filtered_df = filtered_df[filtered_df['Model'].str.contains(query, case=False)]
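The new type filter relies on every checkbox label ending with its short code in parentheses, and the size filter keeps any row that matches at least one selected bucket. A standalone snippet (not part of app.py) showing both mechanisms:

import pandas as pd

# Label -> code: take the last whitespace-separated token and strip the parentheses.
labels = ['Single-vector dense bi-encoder (SINGLE)', 'Cross-encoder (CROSS)']
print([t.split()[-1][1:-1] for t in labels])  # ['SINGLE', 'CROSS']

# Size buckets: boolean masks are concatenated column-wise and OR-ed row-wise.
df = pd.DataFrame({'#Params (M)': [67, 110, 355, 560]})
masks = [df['#Params (M)'] < 100, df['#Params (M)'] > 500]  # e.g. Small + Extra-large
print(df[pd.concat(masks, axis=1).any(axis=1)])             # keeps the 67M and 560M rows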
@@ -171,11 +166,10 @@ with gr.Blocks() as demo:
         filter_type = gr.CheckboxGroup(
             label="Model type",
             choices=[
-                'Dense single-vector bi-encoder (DSVBE)',
-                'Dense multi-vector bi-encoder (DMVBE)',
-                'Sparse single-vector bi-encoder (SSVBE)',
-                'Cross-encoder (CE)',
-                'LLM',
+                'Single-vector dense bi-encoder (SINGLE)',
+                'Multi-vector dense bi-encoder (MULTI)',
+                'Sparse lexical model (SPARSE)',
+                'Cross-encoder (CROSS)',
             ],
             value=[],
             interactive=True,
@@ -220,41 +214,11 @@ with gr.Blocks() as demo:
             # elem_classes="text-sm",
             # )
 
-    # Update tables on search.
-    search_bar.change(
-        fn=lambda x: update_table(dataf=mmarco_df, query=x, selected_types=filter_type.value, selected_sizes=filter_size.value),
-        inputs=[search_bar],
-        outputs=mmarco_table,
-    )
-    # search_bar.change(
-    #     fn=lambda x: update_table(dataf=bsard_df, query=x, selected_types=filter_type.value, selected_sizes=filter_size.value),
-    #     inputs=[search_bar],
-    #     outputs=bsard_table,
-    # )
-
-    # Update tables on model type filter.
-    filter_type.change(
-        fn=lambda selected_types: update_table(mmarco_df, search_bar.value, selected_types, filter_size.value),
-        inputs=[filter_type],
-        outputs=mmarco_table,
-    )
-    # filter_type.change(
-    #     fn=lambda selected_types: update_table(bsard_df, search_bar.value, selected_types, filter_size.value),
-    #     inputs=[filter_type],
-    #     outputs=bsard_table,
-    # )
-
-    # Update tables on model size filter.
-    filter_size.change(
-        fn=lambda selected_sizes: update_table(mmarco_df, search_bar.value, filter_type.value, selected_sizes),
-        inputs=[filter_size],
-        outputs=mmarco_table,
-    )
-    # filter_size.change(
-    #     fn=lambda selected_sizes: update_table(bsard_df, search_bar.value, filter_type.value, selected_sizes),
-    #     inputs=[filter_size],
-    #     outputs=bsard_table,
-    # )
+    # Update tables on filter widgets change.
+    widgets = [search_bar, filter_type, filter_size]
+    for w in widgets:
+        w.change(fn=lambda q, t, s: update_table(dataf=mmarco_df, query=q, selected_types=t, selected_sizes=s), inputs=widgets, outputs=[mmarco_table])
+        #w.change(fn=lambda q, t, s: update_table(dataf=bsard_df, query=q, selected_types=t, selected_sizes=s), inputs=widgets, outputs=[bsard_table])
 
     # Citation
     with gr.Column():
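The wiring loop above registers the same callback on every filter widget and passes the current values of all three widgets as inputs. A minimal, self-contained sketch of that Gradio pattern, assuming gradio and pandas are installed (the widget names and toy dataframe are illustrative, not the actual app.py objects):

import gradio as gr
import pandas as pd

df = pd.DataFrame({"Model": ["model-a", "model-b"], "#Params (M)": [110, 560], "Type": ["SINGLE", "CROSS"]})

def update(query, types, sizes):
    # Re-filter the table from scratch on every widget change (size buckets omitted for brevity).
    out = df.copy()
    if query:
        out = out[out["Model"].str.contains(query, case=False)]
    if types:
        out = out[out["Type"].isin(types)]
    return out

with gr.Blocks() as demo:
    search = gr.Textbox(label="Search")
    types = gr.CheckboxGroup(label="Model type", choices=["SINGLE", "CROSS"])
    sizes = gr.CheckboxGroup(label="Model size", choices=["Small (< 100M)", "Extra-large (500M+)"])
    table = gr.Dataframe(value=df)
    widgets = [search, types, sizes]
    for w in widgets:
        # Any change re-runs update() with the current values of all three widgets.
        w.change(fn=update, inputs=widgets, outputs=[table])

demo.launch()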
 