Sasha committed
Commit · 1f09890
1 Parent(s): d720be7
Initial version of the Evaluation Buddy -- currently most things are hardcoded (e.g. the dataset list), but the goal is to make it all compatible with the Hub!
app.py
ADDED
@@ -0,0 +1,128 @@
import streamlit as st
from datasets import load_dataset_builder
from datasets import get_dataset_config_names
from os import listdir
from datasets import load_dataset, Dataset
from datasets_sql import query
import plotly.express as px
import numpy as np
import statistics

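# Streamlit requires set_page_config to be the first Streamlit command on the page.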
st.set_page_config(
    page_title="Evaluation Buddy",
    page_icon="./robot.png",
    layout="wide",
)

st.title("Hugging Face Evaluation Buddy")

# Hardcoded for now (see commit message); the goal is to pull this list from the Hub.
top_datasets = ['glue', 'super_glue', 'wikitext', 'imdb', 'squad', 'squad_es',
                'paws', 'librispeech_asr', 'wmt16', 'xnli', 'snli', 'ag_news',
                'anli', 'amazon_polarity', 'squad_v2', 'conll2003', 'red_caps',
                'common_voice', 'stsb_multi_mt', 'trec', 'tweet_eval', 'cosmos_qa',
                'sick', 'xsum', 'wikiann', 'yelp_polarity', 'hellaswag', 'piqa',
                'race', 'winogrande']

# Task types of interest (not referenced elsewhere in this file yet).
tasks = ['text-classification', 'question-answering-extractive', 'automatic-speech-recognition']

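# A hedged sketch (an assumption, not part of this commit) of how the hardcoded
# list above could eventually be fetched from the Hub via huggingface_hub:
#
#   from huggingface_hub import list_datasets
#   top_datasets = [d.id for d in list_datasets(sort="downloads", direction=-1, limit=30)]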
with st.sidebar.expander("Datasets", expanded=True):
    dataset_name = st.selectbox(
        "Choose a dataset to evaluate on:",
        sorted(top_datasets))
    configs = get_dataset_config_names(dataset_name)
    dataset_config = st.selectbox(
        "Choose a configuration of your dataset:",
        configs)
    dataset_builder = load_dataset_builder(dataset_name, dataset_config)
    splits = [s for s in dataset_builder.info.splits]
    dataset_split = st.selectbox(
        "Choose a dataset split:",
        splits)
    balanced_stdev = st.slider("Choose a standard deviation threshold for determining whether a dataset is balanced or not:", 0.00, 1.00, 0.20)
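# The slider value is compared against the standard deviation of the label
# proportions computed below: e.g. proportions [0.5, 0.5] give a stdev of 0.0
# (balanced), while [0.85, 0.15] give ~0.49 (imbalanced at the 0.20 default).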

st.markdown("## Here is some information about your dataset:")

st.markdown("### Description")

st.markdown(dataset_builder.info.description)
st.markdown("For more information about this dataset, check out [its website](https://huggingface.co/datasets/" + dataset_name + ")")

st.markdown("### Dataset-Specific Metrics")
# Assumes a local checkout of the `datasets` repo one directory up, whose
# metrics/ folders share names with their datasets (e.g. 'squad').
if dataset_name in listdir('../datasets/metrics/'):
    st.markdown("Great news! Your dataset has a dedicated metric for it! You can use it like this:")
    code = f'''from datasets import load_metric
metric = load_metric("{dataset_name}", "{dataset_config}")'''
    st.code(code, language='python')
    dedicated_metric = True
else:
    st.markdown("Your dataset doesn't have a dedicated metric, but that's ok!")
    dedicated_metric = False

st.markdown("### Task-Specific Metrics")

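# task_templates is only populated for some datasets; the except below falls
# back to generic advice when it is missing or empty.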
try:
    task = dataset_builder.info.task_templates[0].task
    st.markdown("The task associated with it is: " + task)
    if task == 'automatic-speech-recognition':
        st.markdown('Automatic Speech Recognition has some dedicated metrics such as:')
        st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)')
        st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)')
    else:
        st.markdown("The task for your dataset doesn't have any dedicated metrics, but you can still use general ones!")
except Exception:
    st.markdown("The task for your dataset doesn't have any dedicated metrics, but you can still use general ones!")


#print(dataset_builder.info.task_templates)
#print(dataset_builder.info.features)

#st.markdown("### General Metrics")

#dataset = load_dataset(dataset_name, dataset_config, dataset_split)
#print(dataset_name, dataset_config, dataset_split)
#print(labels.head())

try:
    num_classes = dataset_builder.info.features['label'].num_classes
    dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
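    # datasets_sql's query() runs DuckDB SQL over datasets in the caller's
    # scope; ORDER BY keeps the counts aligned with the label names assigned
    # below, and DuckDB names the COUNT(*) column "count_star()".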
    labels = query("SELECT COUNT(*) FROM dataset GROUP BY label ORDER BY label").to_pandas()
    labels = labels.rename(columns={"count_star()": "count"})
    labels.index = dataset_builder.info.features['label'].names
    st.markdown("### Labelled Metrics")
    st.markdown("Your dataset has " + str(num_classes) + " labels: " + ', '.join(dataset_builder.info.features['label'].names))
    #TODO : figure out how to make a label plot
    st.plotly_chart(px.pie(labels, values="count", names=labels.index, width=800, height=400))
    total = sum(c for c in labels['count'])
    proportion = [c/total for c in labels['count']]
    #proportion = [0.85, 0.15]
    stdev_dataset = statistics.stdev(proportion)
    if stdev_dataset <= balanced_stdev:
        st.markdown("Since your dataset is well-balanced, you can look at using:")
        st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)')
        accuracy_code = '''from datasets import load_metric
metric = load_metric("accuracy")'''
        st.code(accuracy_code, language='python')

    else:
        st.markdown("Since your dataset is not well-balanced, you can look at using:")
        st.markdown('[F1 Score](https://huggingface.co/metrics/f1)')
        f1_code = '''from datasets import load_metric
metric = load_metric("f1")'''
        st.code(f1_code, language='python')
        st.markdown('F1 takes into account both precision and recall, which works well for evaluating model performance on minority classes.')
except Exception:
    st.markdown("### Unsupervised Metrics")
    st.markdown("Since your dataset doesn't have any labels, the metrics that you can use for evaluation are:")
    st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)')
    perplexity_code = '''from datasets import load_metric
metric = load_metric("perplexity")'''
    st.code(perplexity_code, language='python')
    st.markdown('If you choose a model that was trained on **' + dataset_name + '** and use it to compute perplexity on text generated by your model, this can help determine how similar the two are.')
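A minimal usage sketch (an assumption for illustration, not part of this commit) of the metrics API the app recommends; the predictions/references values are placeholders:

    from datasets import load_metric

    metric = load_metric("accuracy")
    results = metric.compute(predictions=[0, 1, 1], references=[0, 1, 0])
    print(results)  # {'accuracy': 0.666...}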
robot.png
ADDED