"""Gradio front-end around the ``python run.py`` command-line tool.

The page title and emoji are read from the YAML front matter of ``README.md``.
The command's ``--help`` output is captured at import time to (a) show it on a
"Help" page and (b) detect whether the command supports a dry-run flag.
"""
import re
import subprocess

import gradio as gr
import requests
import yaml
from huggingface_hub import HfApi

CMD = ["python", "run.py"]

# The README front matter contains an emoji, so decode explicitly as UTF-8
# instead of relying on the platform default encoding.
with open("README.md", encoding="utf-8") as f:
    METADATA = yaml.safe_load(f.read().split("---\n")[1])
TITLE = METADATA["title"]
EMOJI = METADATA["emoji"]

# Best effort: if the command cannot be run, the app simply has no help page
# and no dry-run button.
try:
    process = subprocess.run(CMD + ["--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    HELP = not process.returncode and (process.stdout or process.stderr).decode()
except Exception:
    HELP = False

# Either False, or the exact flag spelling found in the help text
# ("--dry-run" or "--dry_run"), which is later appended to the command line.
DRY_RUN = bool(HELP) and bool(m := re.search(r"--dry(-|_)run", HELP)) and m.group(0)


def update_pbars(pbars: dict[str, float], line: str):
    """Update ``pbars`` in place from one line of progress-bar-like output.

    A line is treated as a progress bar when it contains a percentage and one
    of the characters ``|█▌`` within the 10 characters following the first
    ``%``. Bars that already reached 100% are dropped before the new value is
    recorded.

    Args:
        pbars: Mapping of bar description to completion ratio in ``[0, 1]``.
        line: One line of text output.
    """
    percent_match = re.search(r"\d+(?:\.\d+)?%", line)
    if percent_match and any(c in line.split("%")[1][:10] for c in "|█▌"):
        # Collect the keys first: deleting while iterating over the dict
        # raises "dictionary changed size during iteration".
        finished = [desc for desc, percent in pbars.items() if percent == 1.0]
        for desc in finished:
            del pbars[desc]
        percent = float(percent_match.group(0)[:-1]) / 100
        desc = line[:percent_match.start()].strip() or "Progress"
        pbars[desc] = percent


def dry_run(src, config, split, dst, query):
    """Run the command with the dry-run flag and stream its stdout.

    Yields:
        Component updates: the logs accumulated so far for ``output_markdown``
        and a hidden ``progress_labels``.

    Raises:
        gr.Error: If any of the inputs is empty.
    """
    if not all([src, config, split, dst, query]):
        raise gr.Error("Please fill source, destination and query.")
    args = CMD + ["--src", src, "--config", config, "--split", split, "--dst", dst, "--query", query, DRY_RUN]
    logs = ""
    # The context manager closes the pipe and waits for the child process,
    # so neither the file descriptor nor the process is leaked.
    with subprocess.Popen(args, stdout=subprocess.PIPE) as process:
        for line in iter(process.stdout.readline, b""):
            logs += line.decode(errors="replace")
            yield {output_markdown: logs, progress_labels: gr.Label(visible=False)}


def run(src, config, split, dst, query):
    """Validate the inputs; the actual run is not implemented yet.

    Raises:
        gr.Error: Always (missing input, or "NotImplemented").
    """
    if not all([src, config, split, dst, query]):
        raise gr.Error("Please fill source, destination and query.")
    raise gr.Error("NotImplemented")


READ_FUNCTIONS = ("pl.read_parquet", "pl.read_csv", "pl.read_json")
NUM_TRENDING_DATASETS = 10

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=10):
            gr.Markdown(f"# {TITLE} {EMOJI}")
        with gr.Column():
            gr.LoginButton(scale=0.1)
    with gr.Row():
        with gr.Column():
            with gr.Row():
                # Hidden state: the loading codes of the currently selected dataset.
                loading_codes_json = gr.JSON([], visible=False)
                dataset_dropdown = gr.Dropdown(label="Source Dataset", allow_custom_value=True, scale=10)
                subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
                split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
        with gr.Column(scale=0.1, min_width=60):
            # NOTE(review): the HTML markup here appears to have been lost in the
            # available source (only a newline remains) - restore the intended content.
            gr.HTML("\n")
        with gr.Column():
            dst_dropdown = gr.Dropdown(label="Destination Dataset", allow_custom_value=True)
    query_textarea = gr.TextArea(label="SQL Query", placeholder="SELECT * FROM src;", value="SELECT * FROM src;", container=False, show_label=False)
    with gr.Row():
        run_button = gr.Button("Run", scale=10, variant="primary")
        if DRY_RUN:
            dry_run_button = gr.Button("Dry-Run")
    progress_labels = gr.Label(visible=False, label="Progress")
    output_markdown = gr.Markdown(label="Output logs")
    run_button.click(run, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, dst_dropdown, query_textarea], outputs=[progress_labels, output_markdown])
    if DRY_RUN:
        dry_run_button.click(dry_run, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, dst_dropdown, query_textarea], outputs=[progress_labels, output_markdown])

    def show_subset_dropdown(dataset: str):
        """Look up the subsets (configs) of ``dataset``.

        Returns:
            A ``(dropdown_kwargs, loading_codes)`` pair. ``dropdown_kwargs`` are
            keyword arguments for the subset ``gr.Dropdown``; ``loading_codes``
            is the list taken from the first library entry whose ``function``
            is in ``READ_FUNCTIONS``, or ``[]`` if there is none or if
            ``dataset`` is not of the form ``namespace/name``.
        """
        loading_codes = []
        # Always return the same (kwargs, loading_codes) shape: callers unpack
        # two values, so an early bare ``return []`` would crash them.
        if dataset and "/" in dataset.strip().strip("/"):
            resp = requests.get(
                "https://datasets-server.huggingface.co/compatible-libraries",
                params={"dataset": dataset},  # let requests escape the value
                timeout=3,
            ).json()
            loading_codes = next(
                (lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_FUNCTIONS),
                None,
            ) or []
        subsets = [loading_code["config_name"] for loading_code in loading_codes]
        subset = subsets[0] if subsets else ""
        # The dropdown is only shown when there is an actual choice to make.
        return dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes))), loading_codes

    def show_split_dropdown(subset: str, loading_codes: list[dict]):
        """Build the keyword arguments of the split ``gr.Dropdown`` for ``subset``.

        The splits are taken from the first entry of ``loading_codes`` whose
        ``config_name`` equals ``subset``; with no such entry there are none.
        """
        splits = next(
            (list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset),
            [],
        )
        split = splits[0] if splits else ""
        return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))

    @demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown])
    def _fetch_datasets(request: gr.Request):
        """Populate the dataset list (default dataset first, then trending ones)."""
        dataset = "CohereForAI/Global-MMLU"
        datasets = [dataset] + [
            ds.id
            for ds in HfApi().list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1)
            if ds.id != dataset
        ]
        subsets, loading_codes = show_subset_dropdown(dataset)
        splits = show_split_dropdown(subsets["value"], loading_codes)
        return {
            dataset_dropdown: gr.Dropdown(choices=datasets, value=dataset),
            loading_codes_json: loading_codes,
            subset_dropdown: gr.Dropdown(**subsets),
            split_dropdown: gr.Dropdown(**splits),
        }

    @dataset_dropdown.select(inputs=[dataset_dropdown], outputs=[loading_codes_json, subset_dropdown, split_dropdown])
    def _show_subset_dropdown(dataset: str):
        """Refresh the subset and split dropdowns after a dataset is selected."""
        subsets, loading_codes = show_subset_dropdown(dataset)
        splits = show_split_dropdown(subsets["value"], loading_codes)
        return {
            # Keep the hidden state in sync with the selected dataset; the
            # subset handler below reads it to compute the splits.
            loading_codes_json: loading_codes,
            subset_dropdown: gr.Dropdown(**subsets),
            split_dropdown: gr.Dropdown(**splits),
        }

    @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown])
    def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
        """Refresh the split dropdown after a subset is selected."""
        splits = show_split_dropdown(subset, loading_codes)
        return {
            split_dropdown: gr.Dropdown(**splits),
        }


if HELP:
    with demo.route("Help", "/help"):
        gr.Markdown(f"# Help\n\n```\n{HELP}\n```")

with demo.route("Jobs", "/jobs"):
    gr.Markdown("# Jobs")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")