Sasha committed
Commit · 1f09890
1 Parent(s): d720be7
Initial version of the Evaluation Buddy -- currently most things are hardcoded (e.g. the dataset list), but the goal is to make it all compatible with the Hub!
app.py
ADDED
@@ -0,0 +1,128 @@
import streamlit as st
from datasets import load_dataset_builder
from datasets import get_dataset_config_names
from os import listdir
from datasets import load_dataset, Dataset
from datasets_sql import query
import plotly.express as px
import numpy as np
import statistics

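# Streamlit requires set_page_config to be the first Streamlit command on the page.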
st.set_page_config(
    page_title="Evaluation Buddy",
    page_icon="./robot.png",
    layout="wide",
)

st.title("Hugging Face Evaluation Buddy")

# Hardcoded for now (see commit message); the goal is to pull this list from the Hub.
top_datasets = ['glue', 'super_glue', 'wikitext', 'imdb', 'squad', 'squad_es',
                'paws', 'librispeech_asr', 'wmt16', 'xnli', 'snli', 'ag_news',
                'anli', 'amazon_polarity', 'squad_v2', 'conll2003', 'red_caps',
                'common_voice', 'stsb_multi_mt', 'trec', 'tweet_eval', 'cosmos_qa',
                'sick', 'xsum', 'wikiann', 'yelp_polarity', 'hellaswag', 'piqa',
                'race', 'winogrande']

# Task types of interest (not referenced elsewhere in this file yet).
tasks = ['text-classification', 'question-answering-extractive', 'automatic-speech-recognition']

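# A hedged sketch (an assumption, not part of this commit) of how the hardcoded
# list above could eventually be fetched from the Hub via huggingface_hub:
#
#   from huggingface_hub import list_datasets
#   top_datasets = [d.id for d in list_datasets(sort="downloads", direction=-1, limit=30)]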
with st.sidebar.expander("Datasets", expanded=True):
    dataset_name = st.selectbox(
        "Choose a dataset to evaluate on:",
        sorted(top_datasets))
    configs = get_dataset_config_names(dataset_name)
    dataset_config = st.selectbox(
        "Choose a configuration of your dataset:",
        configs)
    dataset_builder = load_dataset_builder(dataset_name, dataset_config)
    splits = [s for s in dataset_builder.info.splits]
    dataset_split = st.selectbox(
        "Choose a dataset split:",
        splits)
    balanced_stdev = st.slider("Choose a standard deviation threshold for determining whether a dataset is balanced or not:", 0.00, 1.00, 0.20)
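# The slider value is compared against the standard deviation of the label
# proportions computed below: e.g. proportions [0.5, 0.5] give a stdev of 0.0
# (balanced), while [0.85, 0.15] give ~0.49 (imbalanced at the 0.20 default).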

st.markdown("## Here is some information about your dataset:")

st.markdown("### Description")

st.markdown(dataset_builder.info.description)
st.markdown("For more information about this dataset, check out [its website](https://huggingface.co/datasets/" + dataset_name + ")")

st.markdown("### Dataset-Specific Metrics")
# Assumes a local checkout of the `datasets` repo one directory up, whose
# metrics/ folders share names with their datasets (e.g. 'squad').
if dataset_name in listdir('../datasets/metrics/'):
    st.markdown("Great news! Your dataset has a dedicated metric for it! You can use it like this:")
    code = f'''from datasets import load_metric
metric = load_metric("{dataset_name}", "{dataset_config}")'''
    st.code(code, language='python')
    dedicated_metric = True
else:
    st.markdown("Your dataset doesn't have a dedicated metric, but that's ok!")
    dedicated_metric = False

st.markdown("### Task-Specific Metrics")

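# task_templates is only populated for some datasets; the except below falls
# back to generic advice when it is missing or empty.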
try:
    task = dataset_builder.info.task_templates[0].task
    st.markdown("The task associated with it is: " + task)
    if task == 'automatic-speech-recognition':
        st.markdown('Automatic Speech Recognition has some dedicated metrics such as:')
        st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)')
        st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)')
    else:
        st.markdown("The task for your dataset doesn't have any dedicated metrics, but you can still use general ones!")
except Exception:
    st.markdown("The task for your dataset doesn't have any dedicated metrics, but you can still use general ones!")


#print(dataset_builder.info.task_templates)
#print(dataset_builder.info.features)

#st.markdown("### General Metrics")

#dataset = load_dataset(dataset_name, dataset_config, dataset_split)
#print(dataset_name, dataset_config, dataset_split)
#print(labels.head())

try:
    num_classes = dataset_builder.info.features['label'].num_classes
    dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
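    # datasets_sql's query() runs DuckDB SQL over datasets in the caller's
    # scope; ORDER BY keeps the counts aligned with the label names assigned
    # below, and DuckDB names the COUNT(*) column "count_star()".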
    labels = query("SELECT COUNT(*) FROM dataset GROUP BY label ORDER BY label").to_pandas()
    labels = labels.rename(columns={"count_star()": "count"})
    labels.index = dataset_builder.info.features['label'].names
    st.markdown("### Labelled Metrics")
    st.markdown("Your dataset has " + str(num_classes) + " labels: " + ', '.join(dataset_builder.info.features['label'].names))
    #TODO : figure out how to make a label plot
    st.plotly_chart(px.pie(labels, values="count", names=labels.index, width=800, height=400))
    total = sum(c for c in labels['count'])
    proportion = [c/total for c in labels['count']]
    #proportion = [0.85, 0.15]
    stdev_dataset = statistics.stdev(proportion)
    if stdev_dataset <= balanced_stdev:
        st.markdown("Since your dataset is well-balanced, you can look at using:")
        st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)')
        accuracy_code = '''from datasets import load_metric
metric = load_metric("accuracy")'''
        st.code(accuracy_code, language='python')

    else:
        st.markdown("Since your dataset is not well-balanced, you can look at using:")
        st.markdown('[F1 Score](https://huggingface.co/metrics/f1)')
        f1_code = '''from datasets import load_metric
metric = load_metric("f1")'''
        st.code(f1_code, language='python')
        st.markdown('F1 takes into account both precision and recall, which works well for evaluating model performance on minority classes.')
except Exception:
    st.markdown("### Unsupervised Metrics")
    st.markdown("Since your dataset doesn't have any labels, the metrics that you can use for evaluation are:")
    st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)')
    perplexity_code = '''from datasets import load_metric
metric = load_metric("perplexity")'''
    st.code(perplexity_code, language='python')
    st.markdown('If you choose a model that was trained on **' + dataset_name + '** and use it to compute perplexity on text generated by your model, this can help determine how similar the two are.')
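A minimal usage sketch (an assumption for illustration, not part of this commit) of the metrics API the app recommends; the predictions/references values are placeholders:

    from datasets import load_metric

    metric = load_metric("accuracy")
    results = metric.compute(predictions=[0, 1, 1], references=[0, 1, 0])
    print(results)  # {'accuracy': 0.666...}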
robot.png
ADDED