lhoestq HF staff commited on
Commit
73e0168
·
1 Parent(s): b067411

add run.py and v0 of the app

Browse files
Files changed (5) hide show
  1. Dockerfile +2 -2
  2. README.md +7 -3
  3. app.py +115 -8
  4. requirements.txt +2 -1
  5. run.py +66 -2
Dockerfile CHANGED
@@ -17,10 +17,10 @@ WORKDIR $HOME/app
17
  RUN pip install --no-cache-dir --upgrade pip
18
 
19
  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
20
- COPY --chown=user run.py app.py requirements.txt $HOME/app
21
 
22
  # Install dependencies
23
- RUN pip install gradio
24
  RUN pip install -r requirements.txt
25
 
26
  # Run app
 
17
  RUN pip install --no-cache-dir --upgrade pip
18
 
19
  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
20
+ COPY --chown=user run.py app.py requirements.txt $HOME/app/
21
 
22
  # Install dependencies
23
+ RUN pip install "gradio[oauth]" fire
24
  RUN pip install -r requirements.txt
25
 
26
  # Run app
README.md CHANGED
@@ -1,10 +1,14 @@
1
  ---
2
  title: Run Duckdb
3
- emoji: 🐠
4
- colorFrom: indigo
5
- colorTo: purple
6
  sdk: docker
7
  pinned: false
 
 
 
 
8
  ---
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Run Duckdb
3
+ emoji: 🦆
4
+ colorFrom: yellow
5
+ colorTo: green
6
  sdk: docker
7
  pinned: false
8
+ hf_oauth: true
9
+ hf_oauth_scopes:
10
+ - manage-repos
11
+ - jobs-api
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,20 +1,127 @@
 
1
  import subprocess
2
- import sys
3
 
4
  import gradio as gr
 
 
5
 
 
6
 
7
- def run():
8
- process = subprocess.Popen(["python", "run.py"], stdout=subprocess.PIPE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  logs = ""
10
  for line in iter(process.stdout.readline, b""):
11
  logs += line.decode()
12
- yield logs
 
 
 
 
 
 
 
 
13
 
14
  with gr.Blocks() as demo:
15
- button = gr.Button("Run")
16
- output_textbox = gr.Textbox()
17
- button.click(run, outputs=[output_textbox])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  if __name__ == "__main__":
20
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
import re
import subprocess
import yaml

import gradio as gr
import requests
from huggingface_hub import HfApi

# Command used to launch the DuckDB job script; probed below for its --help
# output and reused by the UI callbacks in this module.
CMD = ["python" ,"run.py"]

# Read the Space title/emoji from the README's YAML front matter so the UI
# header stays in sync with the Space card.
with open("README.md") as f:
    METADATA = yaml.safe_load(f.read().split("---\n")[1])
TITLE = METADATA["title"]
EMOJI = METADATA["emoji"]

# Capture the script's --help text. HELP is falsy when the probe fails or the
# script exits non-zero; it gates the "Help" page below.
try:
    process = subprocess.run(CMD + ["--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    HELP = not process.returncode and (process.stdout or process.stderr).decode()
except Exception:
    HELP = False

# Exact dry-run flag spelling ("--dry-run" or "--dry_run") advertised by the
# help text, or False when the script does not support a dry run.
DRY_RUN = bool(HELP) and bool(m :=re.search("--dry(-|_)run", HELP)) and m.group(0)
23
+
24
def update_pbars(pbars: dict[str, float], line: str):
    """Update ``pbars`` in place from one tqdm-style log line.

    A progress line looks like ``"desc: 42.5%|████▌     | 17/40"``: the text
    before the percentage becomes the bar description (``"Progress"`` when
    empty) and the percentage is stored as a 0..1 float. Bars that already
    reached 100% are dropped before recording the new value. Lines that do
    not look like a progress bar are ignored.
    """
    percent_match = re.search(r"\d+(?:\.\d+)?%", line)
    # Require tqdm bar characters right after the "%" to avoid false positives
    # on arbitrary percentages in the logs.
    if percent_match and any(c in line.split("%")[1][:10] for c in "|█▌"):
        # Pop finished bars from a snapshot: the original popped while
        # iterating pbars.items(), which raises RuntimeError ("dictionary
        # changed size during iteration") as soon as any bar hits 1.0.
        for desc in [desc for desc, percent in pbars.items() if percent == 1.]:
            pbars.pop(desc)
        percent = float(percent_match.group(0)[:-1]) / 100
        desc = line[:percent_match.start()].strip() or "Progress"
        pbars[desc] = percent
30
+
31
def dry_run(src, config, split, dst, query):
    """Run ``run.py`` with the dry-run flag and stream its logs to the UI.

    Generator used as a gradio callback: after each stdout line it yields the
    accumulated logs for the output markdown and hides the progress labels.
    """
    # NOTE(review): the message only mentions source/destination/query but the
    # check also requires config and split — confirm the wording is intended.
    if not all([src, config, split, dst, query]):
        raise gr.Error("Please fill source, destination and query.")
    # DRY_RUN holds the exact flag spelling detected from the script's --help.
    process = subprocess.Popen(CMD + ["--src", src, "--config", config, "--split", split, "--dst", dst, "--query", query, DRY_RUN], stdout=subprocess.PIPE)
    logs = ""
    for line in iter(process.stdout.readline, b""):
        logs += line.decode()
        yield {output_markdown: logs, progress_labels: gr.Label(visible=False)}
39
+
40
def run(src, config, split, dst, query):
    """Placeholder for the real (non-dry-run) job launcher.

    Validates that every field is filled, then signals that the actual run
    path is not implemented yet.
    """
    fields = (src, config, split, dst, query)
    if any(not field for field in fields):
        raise gr.Error("Please fill source, destination and query.")
    raise gr.Error("NotImplemented")
44
+
45
# Read functions (from the datasets-server "compatible-libraries" API) that
# run.py knows how to turn into a DuckDB source view.
READ_FUNCTIONS = ("pl.read_parquet", "pl.read_csv", "pl.read_json")
NUM_TRENDING_DATASETS = 10

with gr.Blocks() as demo:
    # Header row: Space title on the left, OAuth login button on the right.
    with gr.Row():
        with gr.Column(scale=10):
            gr.Markdown(f"# {TITLE} {EMOJI}")
        with gr.Column():
            gr.LoginButton(scale=0.1)
    # Main row: source dataset pickers -> arrow -> destination + SQL query.
    with gr.Row():
        with gr.Column():
            with gr.Row():
                # Hidden state holding the datasets-server "loading codes" of
                # the selected dataset (drives the subset/split choices).
                loading_codes_json = gr.JSON([], visible=False)
                dataset_dropdown = gr.Dropdown(label="Source Dataset", allow_custom_value=True, scale=10)
                # Subset/split stay hidden until there is more than one choice.
                subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
                split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
        with gr.Column(scale=0.1, min_width=60):
            gr.HTML("<div style='font-size: 4em;'>→</div>")
        with gr.Column():
            dst_dropdown = gr.Dropdown(label="Destination Dataset", allow_custom_value=True)
            query_textarea = gr.TextArea(label="SQL Query", placeholder="SELECT * FROM src;", value="SELECT * FROM src;", container=False, show_label=False)
    with gr.Row():
        run_button = gr.Button("Run", scale=10, variant="primary")
        if DRY_RUN:
            # Only offered when run.py advertises a dry-run flag in --help.
            dry_run_button = gr.Button("Dry-Run")
    progress_labels= gr.Label(visible=False, label="Progress")
    output_markdown = gr.Markdown(label="Output logs")
    run_button.click(run, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, dst_dropdown, query_textarea], outputs=[progress_labels, output_markdown])
    if DRY_RUN:
        dry_run_button.click(dry_run, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, dst_dropdown, query_textarea], outputs=[progress_labels, output_markdown])

    def show_subset_dropdown(dataset: str):
        """Return (Dropdown kwargs, loading_codes) for the dataset's subsets."""
        if dataset and "/" not in dataset.strip().strip("/"):
            # NOTE(review): returns a bare list while every caller unpacks two
            # values — likely should be `return dict(...), []`; confirm the
            # intended behavior for ids without a namespace.
            return []
        resp = requests.get(f"https://datasets-server.huggingface.co/compatible-libraries?dataset={dataset}", timeout=3).json()
        # Keep only the loading codes of the supported polars read functions.
        loading_codes = ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_FUNCTIONS] or [[]])[0] or []
        subsets = [loading_code["config_name"] for loading_code in loading_codes]
        subset = (subsets or [""])[0]
        # `key` forces gradio to rebuild the component when choices change.
        return dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes))), loading_codes

    def show_split_dropdown(subset: str, loading_codes: list[dict]):
        """Return Dropdown kwargs for the splits of the given subset."""
        splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
        split = (splits or [""])[0]
        return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))

    @demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown])
    def _fetch_datasets(request: gr.Request):
        # Populate the dataset dropdown: a default dataset first, then the
        # currently trending datasets from the Hub.
        dataset = "CohereForAI/Global-MMLU"
        datasets = [dataset] + [ds.id for ds in HfApi().list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1) if ds.id != dataset]
        subsets, loading_codes = show_subset_dropdown(dataset)
        splits = show_split_dropdown(subsets["value"], loading_codes)
        return {
            dataset_dropdown: gr.Dropdown(choices=datasets, value=dataset),
            loading_codes_json: loading_codes,
            subset_dropdown: gr.Dropdown(**subsets),
            split_dropdown: gr.Dropdown(**splits),
        }

    @dataset_dropdown.select(inputs=[dataset_dropdown], outputs=[subset_dropdown, split_dropdown])
    def _show_subset_dropdown(dataset: str):
        # Refresh subset and split choices when the source dataset changes.
        # NOTE(review): loading_codes_json is not updated here, so
        # _show_split_dropdown below may read stale loading codes — confirm.
        subsets, loading_codes = show_subset_dropdown(dataset)
        splits = show_split_dropdown(subsets["value"], loading_codes)
        return {
            subset_dropdown: gr.Dropdown(**subsets),
            split_dropdown: gr.Dropdown(**splits),
        }

    @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown])
    def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
        # Refresh split choices when the subset changes.
        splits = show_split_dropdown(subset, loading_codes)
        return {
            split_dropdown: gr.Dropdown(**splits),
        }

if HELP:
    # Extra page exposing the raw --help text of run.py.
    with demo.route("Help", "/help"):
        gr.Markdown(f"# Help\n\n```\n{HELP}\n```")

with demo.route("Jobs", "/jobs"):
    gr.Markdown("# Jobs")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  duckdb
2
- huggingface_hub
 
 
1
  duckdb
2
+ huggingface_hub
3
+ tabulate
run.py CHANGED
@@ -1,2 +1,66 @@
1
- print("hello world")
2
- print("done")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import fire

# DuckDB connection settings: letting DuckDB reorder rows enables more
# parallel/streaming execution for the COPY below.
CONFIG = {
    "preserve_insertion_order": False
}

# Query the datasets-server "compatible-libraries" endpoint (read as JSON by
# DuckDB) and pick, among the polars ('pl.*') loading codes of the requested
# config, the hf:// path of the requested split plus the read function name
# with the 'pl.' prefix stripped (e.g. 'read_parquet').
CMD_SRC_KWARGS = """
SELECT ('hf://datasets/{src}/' || lo.arguments['splits']['{split}']) AS path, function
FROM (
SELECT unnest(li.loading_codes) AS lo, li.function[4:] as function
FROM (
SELECT unnest(libraries) as li
FROM read_json('https://datasets-server.huggingface.co/compatible-libraries?dataset={src}')
) WHERE li.function[:3] = 'pl.'
) WHERE lo.config_name='{config}';
""".strip()

# Expose the source data as a view named `src` for the user query.
CMD_SRC = """
CREATE VIEW src AS SELECT * FROM {function}('{path}');
""".strip()


# Write the query result as parquet shards into the local 'tmp' directory
# (one file per thread, bounded row-group sizes) for incremental upload.
CMD_DST = """
COPY ({query}) to 'tmp' (FORMAT PARQUET, ROW_GROUP_SIZE_BYTES '100MB', ROW_GROUPS_PER_FILE 5, PER_THREAD_OUTPUT true);
""".strip()

# Dry-run variants: create the view on 5 rows only, and run the bare query
# instead of COPYing it anywhere.
CMD_SRC_DRY_RUN = CMD_SRC[:-1] + " LIMIT 5;"
CMD_DST_DRY_RUN = "{query};"
29
+
30
def sql(src: str, dst: str, query: str, config: str = "default", split: str = "train", private: bool = False, dry_run: bool = False):
    """Run a DuckDB SQL ``query`` over dataset ``src`` and write the result to dataset repo ``dst``.

    Args:
        src: source dataset id on the Hugging Face Hub (exposed as view ``src``).
        dst: destination dataset repo id for the parquet output.
        query: SQL to execute against the ``src`` view.
        config: config/subset name of the source dataset.
        split: split name of the source dataset.
        private: create the destination repo as private.
        dry_run: print a sample result instead of writing and uploading.

    Raises:
        ValueError: when ``config`` is not a valid config of ``src``.
    """
    import os
    import duckdb
    from contextlib import nullcontext
    from huggingface_hub import CommitScheduler

    class CommitAndCleanScheduler(CommitScheduler):
        # Scheduler that promotes *finished* parquet shards from the DuckDB
        # output dir ('tmp') into the watched upload folder, uploads them,
        # then deletes the local copies of what was uploaded.

        def push_to_hub(self):
            for path in self.folder_path.with_name("tmp").glob(self.allow_patterns):
                with path.open("rb") as f:
                    # A shard is complete once its trailing 4-byte parquet
                    # magic "PAR1" is written; the short-circuit keeps empty
                    # files falsy. NOTE(review): a 1-3 byte file would make
                    # seek() raise — presumably never produced by the parquet
                    # writer; confirm.
                    footer = f.read(4) and f.seek(-4, os.SEEK_END) and f.read(4)
                if footer == b"PAR1":
                    path.rename(self.folder_path / path.name)
            super().push_to_hub()
            # Free disk space for shards that are now on the Hub.
            for path in self.last_uploaded:
                path.unlink(missing_ok=True)

    # Dry runs need no repo/scheduler; otherwise upload new parquet shards
    # from 'dst/' to the Hub every 0.1 min while the query runs.
    with nullcontext() if dry_run else CommitAndCleanScheduler(repo_id=dst, repo_type="dataset", folder_path="dst", path_in_repo="data", allow_patterns="*.parquet", every=0.1, private=private):
        con = duckdb.connect(":memory:", config=CONFIG)
        # Resolve the hf:// path and read function for the requested config/split.
        src_kwargs = con.sql(CMD_SRC_KWARGS.format(src=src, config=config, split=split)).df().to_dict(orient="records")
        if not src_kwargs:
            raise ValueError(f'Invalid --config "{config}" for dataset "{src}", please select a valid dataset config/subset.')
        # Create the `src` view (limited to 5 rows for dry runs).
        con.sql((CMD_SRC_DRY_RUN if dry_run else CMD_SRC).format(**src_kwargs[0]))
        if dry_run:
            print(f"Sample data from '{src}' that would be written to '{dst}':\n")
        else:
            con.sql("PRAGMA enable_progress_bar;")
        # Execute the user query: bare SELECT for dry runs, COPY-to-parquet otherwise.
        result = con.sql((CMD_DST_DRY_RUN if dry_run else CMD_DST).format(query=query.rstrip("\n ;")))
        if dry_run:
            print(result.df().to_markdown())
        else:
            print("done")


if __name__ == '__main__':
    fire.Fire(sql)