Spaces:
Sleeping
Sleeping
| import os | |
| import gradio as gr | |
| import pandas as pd | |
| import time | |
| import threading | |
| from huggingface_hub import HfApi | |
| from humanize import naturalsize | |
# Hugging Face Hub client, created once when the module is imported.
api = HfApi()

# Access token taken from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.environ.get('HF_TOKEN')
def clickable(x):
    """Return an HTML anchor that links ``x`` to its huggingface.co page.

    ``x`` is interpolated as-is into both the URL path and the link text;
    the link opens in a new tab.
    """
    link_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="https://huggingface.co/{x}" style="{link_style}">{x}</a>'
def apply_headers(df: pd.DataFrame, headers) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels are replaced by ``headers``.

    The frame passed in is left untouched; only the returned copy carries
    the new labels.
    """
    relabelled = df.copy()
    relabelled.columns = headers
    return relabelled
def search(search_text, table=None):
    """Filter the leaderboard to authors whose name contains ``search_text``.

    Args:
        search_text: Text typed into the search box. Empty, ``None`` or
            whitespace-only input returns the table unfiltered.
        table: Frame to filter. Defaults to the module-level ``df``; it must
            have a '👤 Author' column.

    Returns:
        The rows whose visible author text contains the query
        (case-insensitive, literal match).
    """
    source = df if table is None else table

    query = (search_text or "").strip()
    if not query:
        return source

    # The author cells hold an HTML anchor. Match only the visible text,
    # otherwise queries such as "a", "href" or "color" hit the markup and
    # return every row.
    visible_names = source['👤 Author'].str.replace(r'<[^>]*>', '', regex=True)

    # regex=False: treat the user's input literally so characters such as
    # "(" or "." neither raise nor act as wildcards.
    matches = visible_names.str.contains(query, case=False, na=False, regex=False)
    return source[matches]
# Load the per-author figures. The 'models', 'datasets' and 'spaces' columns
# are summed and then formatted with naturalsize, i.e. treated as byte counts.
df = pd.read_csv("author_data_hf_merged.csv")

# Snapshot of the raw data taken before any formatting is applied
# (not referenced again in the code visible here).
df_author_copy = df.copy()

# Turn every author name into a link to its Hugging Face page.
df["author"] = df["author"].apply(clickable)

# Rank authors by their combined usage, largest first.
df['Total Usage'] = df[['models', 'datasets', 'spaces']].sum(axis=1)
df = df.sort_values(by='Total Usage', ascending=False)

# Grand total over all authors: the sum of the per-row totals, which avoids
# building three Python lists and concatenating them. Missing values are
# skipped here, consistent with how the per-row totals are computed.
sum_all_author = naturalsize(df['Total Usage'].sum())

# Replace the raw numbers with human-readable sizes. This must happen after
# sorting and totalling, since the columns become strings from here on.
naturalsize_columns = ['Total Usage', 'models', 'datasets', 'spaces']
df[naturalsize_columns] = df[naturalsize_columns].map(naturalsize)

# 1-based rank in the sorted order.
df['Serial Number'] = range(1, len(df) + 1)

# Keep only the displayed columns, in display order, then apply the
# user-facing headers.
df = df[['Serial Number', "author", "Total Usage", "models", "datasets", "spaces"]]
df = apply_headers(df, ["🔢 Serial Number", "👤 Author", "⚡️ Total Usage", "🏛️ Models", "📊 Datasets", "🚀 Spaces"])
# Markdown shown under the page heading. It is an f-string so the
# human-readable grand total (sum_all_author) is interpolated once, at import.
desc = f"""
🎯 The Leaderboard aims to track authors data usage in 🤗 Huggingface.
## 📄 Information
🛠️ This leaderboard consists of 125k authors scraped from [🤗 Huggingface Leaderboard](https://huggingface.co/spaces/Weyaxi/huggingface-leaderboard).
These 125k authors have been selected based on their [🤗 Huggingface Leaderboard](https://huggingface.co/spaces/Weyaxi/huggingface-leaderboard) positions:
- 🤖 Top 60k authors in the models category
- 📊 Top 60k authors in the datasets category
- 🚀 Top 50k authors in the spaces category
## 📒 Notes
Note that these numbers may not be entirely accurate due to the following reasons:
- I only calculated the data usage from the main branch and did not include deleted files that cannot be directly seen.
- There may be large datasets/models to which I don't have access (either private or gated).
# 📶 Total Data Usage From All Authors
According to this leaderboard, there is a total of {sum_all_author} of data on this platform.
"""
| # Write note maybe? | |
# Centered HTML heading for the page. The UI code visible in this file builds
# its heading inline and does not reference this constant.
title = """
<div style="text-align:center">
<h1 id="space-title">💾 Data Leaderboard 💾</h1>
</div>
"""
# Assemble the page: heading, description, search box and the leaderboard table.
# NOTE(review): the original indentation was lost; the table and the submit
# handler are placed directly under gr.Blocks (outside the Column) — confirm.
with gr.Blocks() as demo:
    gr.Markdown("""<h1 align="center" id="space-title">💾 Data Leaderboard 💾</h1>""")
    gr.Markdown(desc)

    with gr.Column(min_width=320):
        search_bar = gr.Textbox(placeholder="🔍 Search for a author", show_label=False)

    # One datatype per displayed column: rank is numeric, the author cell is
    # rendered markup, and the four usage columns hold formatted size strings
    # (e.g. "1.2 GB"), so they are declared as text rather than numbers.
    gr_followers = gr.Dataframe(
        df,
        interactive=False,
        datatype=["number", "markdown", "str", "str", "str", "str"],
    )

    # Pressing Enter in the search box replaces the table with the filtered rows.
    search_bar.submit(fn=search, inputs=search_bar, outputs=gr_followers)

demo.launch()