Spaces:
Runtime error
Runtime error
add run.py and v0 of the app
Browse files- Dockerfile +2 -2
- README.md +7 -3
- app.py +115 -8
- requirements.txt +2 -1
- run.py +66 -2
Dockerfile
CHANGED
@@ -17,10 +17,10 @@ WORKDIR $HOME/app
|
|
17 |
RUN pip install --no-cache-dir --upgrade pip
|
18 |
|
19 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
20 |
-
COPY --chown=user run.py app.py requirements.txt $HOME/app
|
21 |
|
22 |
# Install dependencies
|
23 |
-
RUN pip install gradio
|
24 |
RUN pip install -r requirements.txt
|
25 |
|
26 |
# Run app
|
|
|
17 |
RUN pip install --no-cache-dir --upgrade pip
|
18 |
|
19 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
20 |
+
COPY --chown=user run.py app.py requirements.txt $HOME/app/
|
21 |
|
22 |
# Install dependencies
|
23 |
+
RUN pip install "gradio[oauth]" fire
|
24 |
RUN pip install -r requirements.txt
|
25 |
|
26 |
# Run app
|
README.md
CHANGED
@@ -1,10 +1,14 @@
|
|
1 |
---
|
2 |
title: Run Duckdb
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
7 |
pinned: false
|
|
|
|
|
|
|
|
|
8 |
---
|
9 |
|
10 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
title: Run Duckdb
|
3 |
+
emoji: 🦆
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: green
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
+
hf_oauth: true
|
9 |
+
hf_oauth_scopes:
|
10 |
+
- manage-repos
|
11 |
+
- jobs-api
|
12 |
---
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -1,20 +1,127 @@
|
|
|
|
1 |
import subprocess
|
2 |
-
import
|
3 |
|
4 |
import gradio as gr
|
|
|
|
|
5 |
|
|
|
6 |
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
logs = ""
|
10 |
for line in iter(process.stdout.readline, b""):
|
11 |
logs += line.decode()
|
12 |
-
yield logs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
with gr.Blocks() as demo:
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
if __name__ == "__main__":
|
20 |
-
demo.launch(server_name="0.0.0.0"
|
|
|
1 |
+
import re
|
2 |
import subprocess
|
3 |
+
import yaml
|
4 |
|
5 |
import gradio as gr
|
6 |
+
import requests
|
7 |
+
from huggingface_hub import HfApi
|
8 |
|
9 |
+
# Command line used to invoke the companion CLI script; per-call flags are
# appended to this list.
CMD = ["python", "run.py"]

# The title and emoji shown in the UI come from the YAML front matter of
# README.md, i.e. the text between the first two "---" lines. The encoding is
# explicit because the front matter contains a non-ASCII emoji and the
# platform default encoding is not guaranteed to be UTF-8.
with open("README.md", encoding="utf-8") as f:
    METADATA = yaml.safe_load(f.read().split("---\n")[1])
TITLE = METADATA["title"]
EMOJI = METADATA["emoji"]

# HELP holds the CLI's `--help` text (stdout, falling back to stderr when
# stdout is empty), or False when the command exits non-zero or cannot be run.
try:
    process = subprocess.run(CMD + ["--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    HELP = (process.stdout or process.stderr).decode() if process.returncode == 0 else False
except Exception:
    # Deliberately best-effort: a missing or broken CLI only disables the
    # help page and the dry-run button, it must not prevent the app from starting.
    HELP = False

# DRY_RUN is the exact dry-run flag spelling found in the help text
# ("--dry-run" or "--dry_run"), or False when the CLI does not advertise one.
_dry_run_match = re.search(r"--dry(-|_)run", HELP) if HELP else None
DRY_RUN = _dry_run_match.group(0) if _dry_run_match else False
|
23 |
+
|
24 |
+
def update_pbars(pbars: dict[str, float], line: str):
    """Update ``pbars`` in place from one line of progress-bar output.

    A line is treated as a progress bar when it contains a percentage
    (``42%`` or ``42.5%``) and one of the bar characters ``|``, ``█`` or ``▌``
    appears within the ten characters following the first ``%`` sign.
    Any other line leaves ``pbars`` untouched.

    Args:
        pbars: Maps a bar description to its completion ratio (1.0 == 100%).
        line: A single line of process output.
    """
    percent_match = re.search(r"\d+(?:\.\d+)?%", line)
    if not percent_match:
        return
    # The bar itself is expected right after the first "%" sign.
    after_percent = line.split("%")[1][:10]
    if not any(c in after_percent for c in "|█▌"):
        return
    # Drop bars that already reached 100%. The finished keys are collected
    # first because a dict cannot change size while it is being iterated.
    finished = [desc for desc, percent in pbars.items() if percent == 1.0]
    for desc in finished:
        del pbars[desc]
    percent = float(percent_match.group(0)[:-1]) / 100
    # Text before the percentage names the bar; unnamed bars share one entry.
    desc = line[:percent_match.start()].strip() or "Progress"
    pbars[desc] = percent
|
30 |
+
|
31 |
+
def dry_run(src, config, split, dst, query):
    """Run the CLI with the dry-run flag and stream its output to the UI.

    This is a generator: after each line printed by the subprocess it yields
    an update dict that sets ``output_markdown`` to the logs accumulated so
    far and keeps ``progress_labels`` hidden.

    Args:
        src: Source dataset id, passed as ``--src``.
        config: Source subset, passed as ``--config``.
        split: Source split, passed as ``--split``.
        dst: Destination dataset id, passed as ``--dst``.
        query: SQL query, passed as ``--query``.

    Raises:
        gr.Error: If any of the five inputs is empty.
    """
    if not all([src, config, split, dst, query]):
        raise gr.Error("Please fill source, destination and query.")
    args = CMD + ["--src", src, "--config", config, "--split", split, "--dst", dst, "--query", query, DRY_RUN]
    # stderr is merged into stdout so a failing CLI run shows its error in the
    # UI instead of producing empty output. The context manager closes the
    # pipe and waits for the child, so no zombie process or open file
    # descriptor is left behind, even if the generator is abandoned early.
    with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
        logs = ""
        for line in iter(process.stdout.readline, b""):
            # A stray undecodable byte must not abort the log stream.
            logs += line.decode(errors="replace")
            yield {output_markdown: logs, progress_labels: gr.Label(visible=False)}
|
39 |
+
|
40 |
+
def run(src, config, split, dst, query):
    """Placeholder for the real run action.

    Validates that every input is filled, then always raises because the
    action is not implemented yet.

    Raises:
        gr.Error: If any input is empty, otherwise with "NotImplemented".
    """
    required_inputs = (src, config, split, dst, query)
    if any(not value for value in required_inputs):
        raise gr.Error("Please fill source, destination and query.")
    raise gr.Error("NotImplemented")
|
44 |
+
|
45 |
+
READ_FUNCTIONS = ("pl.read_parquet", "pl.read_csv", "pl.read_json")
|
46 |
+
NUM_TRENDING_DATASETS = 10
|
47 |
|
48 |
with gr.Blocks() as demo:
|
49 |
+
with gr.Row():
|
50 |
+
with gr.Column(scale=10):
|
51 |
+
gr.Markdown(f"# {TITLE} {EMOJI}")
|
52 |
+
with gr.Column():
|
53 |
+
gr.LoginButton(scale=0.1)
|
54 |
+
with gr.Row():
|
55 |
+
with gr.Column():
|
56 |
+
with gr.Row():
|
57 |
+
loading_codes_json = gr.JSON([], visible=False)
|
58 |
+
dataset_dropdown = gr.Dropdown(label="Source Dataset", allow_custom_value=True, scale=10)
|
59 |
+
subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
|
60 |
+
split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
|
61 |
+
with gr.Column(scale=0.1, min_width=60):
|
62 |
+
gr.HTML("<div style='font-size: 4em;'>→</div>")
|
63 |
+
with gr.Column():
|
64 |
+
dst_dropdown = gr.Dropdown(label="Destination Dataset", allow_custom_value=True)
|
65 |
+
query_textarea = gr.TextArea(label="SQL Query", placeholder="SELECT * FROM src;", value="SELECT * FROM src;", container=False, show_label=False)
|
66 |
+
with gr.Row():
|
67 |
+
run_button = gr.Button("Run", scale=10, variant="primary")
|
68 |
+
if DRY_RUN:
|
69 |
+
dry_run_button = gr.Button("Dry-Run")
|
70 |
+
progress_labels= gr.Label(visible=False, label="Progress")
|
71 |
+
output_markdown = gr.Markdown(label="Output logs")
|
72 |
+
run_button.click(run, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, dst_dropdown, query_textarea], outputs=[progress_labels, output_markdown])
|
73 |
+
if DRY_RUN:
|
74 |
+
dry_run_button.click(dry_run, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, dst_dropdown, query_textarea], outputs=[progress_labels, output_markdown])
|
75 |
+
|
76 |
+
def show_subset_dropdown(dataset: str):
    """Look up the subsets (configs) of a dataset.

    Queries the datasets-server ``compatible-libraries`` endpoint and keeps
    the loading codes of the first library whose ``function`` is one of
    ``READ_FUNCTIONS``.

    Args:
        dataset: Dataset id. Only namespaced ids (containing a ``/`` between
            two name parts) are looked up; anything else yields an empty result.

    Returns:
        A ``(dropdown_kwargs, loading_codes)`` tuple. ``dropdown_kwargs`` has
        the keys ``choices``, ``value``, ``visible`` and ``key``;
        ``loading_codes`` is the list of loading-code dicts (possibly empty).
        The tuple shape is the same on every path because callers unpack it.
    """
    loading_codes = []
    if dataset and "/" in dataset.strip().strip("/"):
        # `params=` lets requests URL-encode the dataset id instead of pasting
        # it verbatim into the query string.
        resp = requests.get(
            "https://datasets-server.huggingface.co/compatible-libraries",
            params={"dataset": dataset},
            timeout=3,
        ).json()
        matching = [lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_FUNCTIONS]
        loading_codes = (matching[0] if matching else []) or []
    subsets = [loading_code["config_name"] for loading_code in loading_codes]
    subset = subsets[0] if subsets else ""
    # The dropdown is only worth showing when there is an actual choice to make.
    return dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes))), loading_codes
|
84 |
+
|
85 |
+
def show_split_dropdown(subset: str, loading_codes: list[dict]):
    """Build the split dropdown settings for one subset.

    Args:
        subset: Name of the subset (config) to look up.
        loading_codes: Loading-code dicts, each with a ``config_name`` and an
            ``arguments["splits"]`` mapping keyed by split name.

    Returns:
        A dict with ``choices`` (split names of the first matching loading
        code, or an empty list), ``value`` (first split or ``""``),
        ``visible`` (True only with more than one split) and ``key``.
    """
    splits_per_match = [
        list(entry["arguments"]["splits"])
        for entry in loading_codes
        if entry["config_name"] == subset
    ]
    splits = splits_per_match[0] if splits_per_match else []
    default_split = splits[0] if splits else ""
    return {
        "choices": splits,
        "value": default_split,
        "visible": len(splits) > 1,
        "key": hash(str(loading_codes) + subset),
    }
|
89 |
+
|
90 |
+
@demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown])
def _fetch_datasets(request: gr.Request):
    """Fill the source dataset, subset and split dropdowns on page load.

    The dataset choices are a fixed default dataset followed by the trending
    datasets returned by the Hub (the default is not listed twice). The
    subset and split dropdowns are initialised for the default dataset.
    """
    dataset = "CohereForAI/Global-MMLU"
    trending = HfApi().list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1)
    datasets = [dataset]
    datasets.extend(ds.id for ds in trending if ds.id != dataset)
    subsets, loading_codes = show_subset_dropdown(dataset)
    splits = show_split_dropdown(subsets["value"], loading_codes)
    updates = {}
    updates[dataset_dropdown] = gr.Dropdown(choices=datasets, value=dataset)
    updates[loading_codes_json] = loading_codes
    updates[subset_dropdown] = gr.Dropdown(**subsets)
    updates[split_dropdown] = gr.Dropdown(**splits)
    return updates
|
102 |
+
|
103 |
+
@dataset_dropdown.select(inputs=[dataset_dropdown], outputs=[loading_codes_json, subset_dropdown, split_dropdown])
def _show_subset_dropdown(dataset: str):
    """Refresh the subset and split dropdowns when a source dataset is selected.

    The loading codes of the newly selected dataset are also stored in
    ``loading_codes_json``. The subset-selection handler reads them from
    there, so leaving the previous dataset's codes in place would make it
    resolve splits against the wrong dataset.
    """
    subsets, loading_codes = show_subset_dropdown(dataset)
    splits = show_split_dropdown(subsets["value"], loading_codes)
    return {
        loading_codes_json: loading_codes,
        subset_dropdown: gr.Dropdown(**subsets),
        split_dropdown: gr.Dropdown(**splits),
    }
|
111 |
+
|
112 |
+
@subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown])
def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
    """Refresh the split dropdown when a subset is selected.

    ``dataset`` is received from the event inputs but is not used here; the
    splits are derived from ``subset`` and ``loading_codes`` only.
    """
    split_kwargs = show_split_dropdown(subset, loading_codes)
    return {split_dropdown: gr.Dropdown(**split_kwargs)}
|
118 |
+
|
119 |
+
if HELP:
|
120 |
+
with demo.route("Help", "/help"):
|
121 |
+
gr.Markdown(f"# Help\n\n```\n{HELP}\n```")
|
122 |
+
|
123 |
+
with demo.route("Jobs", "/jobs"):
|
124 |
+
gr.Markdown("# Jobs")
|
125 |
|
126 |
if __name__ == "__main__":
|
127 |
+
demo.launch(server_name="0.0.0.0")
|
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
duckdb
|
2 |
-
huggingface_hub
|
|
|
|
1 |
duckdb
|
2 |
+
huggingface_hub
|
3 |
+
tabulate
|
run.py
CHANGED
@@ -1,2 +1,66 @@
|
|
1 |
-
|
2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import fire
|
2 |
+
|
3 |
+
CONFIG = {
|
4 |
+
"preserve_insertion_order": False
|
5 |
+
}
|
6 |
+
|
7 |
+
CMD_SRC_KWARGS = """
|
8 |
+
SELECT ('hf://datasets/{src}/' || lo.arguments['splits']['{split}']) AS path, function
|
9 |
+
FROM (
|
10 |
+
SELECT unnest(li.loading_codes) AS lo, li.function[4:] as function
|
11 |
+
FROM (
|
12 |
+
SELECT unnest(libraries) as li
|
13 |
+
FROM read_json('https://datasets-server.huggingface.co/compatible-libraries?dataset={src}')
|
14 |
+
) WHERE li.function[:3] = 'pl.'
|
15 |
+
) WHERE lo.config_name='{config}';
|
16 |
+
""".strip()
|
17 |
+
|
18 |
+
CMD_SRC = """
|
19 |
+
CREATE VIEW src AS SELECT * FROM {function}('{path}');
|
20 |
+
""".strip()
|
21 |
+
|
22 |
+
|
23 |
+
CMD_DST = """
|
24 |
+
COPY ({query}) to 'tmp' (FORMAT PARQUET, ROW_GROUP_SIZE_BYTES '100MB', ROW_GROUPS_PER_FILE 5, PER_THREAD_OUTPUT true);
|
25 |
+
""".strip()
|
26 |
+
|
27 |
+
CMD_SRC_DRY_RUN = CMD_SRC[:-1] + " LIMIT 5;"
|
28 |
+
CMD_DST_DRY_RUN = "{query};"
|
29 |
+
|
30 |
+
def sql(src: str, dst: str, query: str, config: str = "default", split: str = "train", private: bool = False, dry_run: bool = False):
    """Run a SQL query over a source dataset and write the result to ``dst``.

    The selected split of ``src`` is exposed to the query as a view named
    ``src``. In a normal run the query result is copied to Parquet files in a
    local ``tmp`` directory while a background scheduler moves finished files
    to the local ``dst`` folder, uploads them and deletes them afterwards.
    In a dry run nothing is uploaded: the view is limited to 5 rows and the
    query result is printed as a Markdown table.

    Args:
        src: Source dataset id.
        dst: Destination dataset repo id.
        query: SQL query to run; trailing newlines, spaces and ``;`` are stripped.
        config: Source subset (config) name.
        split: Source split name.
        private: Passed to the commit scheduler as the ``private`` option.
        dry_run: Print a sample instead of writing and uploading.

    Raises:
        ValueError: If no loading code is found for ``config`` in ``src``.
    """
    import os
    import duckdb
    from contextlib import nullcontext
    from huggingface_hub import CommitScheduler

    def _sql_literal(value) -> str:
        # Double any single quote so the value can sit inside a '...' SQL
        # string literal without terminating it. `str()` covers CLI values
        # that arrive as non-strings (e.g. a purely numeric config name).
        return str(value).replace("'", "''")

    def _is_complete_parquet(path) -> bool:
        # A finished Parquet file ends with the 4-byte magic b"PAR1". The
        # smallest valid file is 12 bytes (magic + footer length + magic), so
        # shorter files are still being written; checking the size first also
        # avoids seeking before the start of a tiny file.
        with path.open("rb") as f:
            if f.seek(0, os.SEEK_END) < 12:
                return False
            f.seek(-4, os.SEEK_END)
            return f.read(4) == b"PAR1"

    class CommitAndCleanScheduler(CommitScheduler):
        """Scheduler that stages finished Parquet files, uploads them, then deletes them."""

        def push_to_hub(self):
            # Only files with a complete footer are moved from the sibling
            # "tmp" directory into the watched folder; the file is closed
            # again before it is renamed.
            staging_dir = self.folder_path.with_name("tmp")
            for path in staging_dir.glob(self.allow_patterns):
                if _is_complete_parquet(path):
                    path.rename(self.folder_path / path.name)
            commit_info = super().push_to_hub()
            # Remove the local copies of the files recorded as uploaded.
            for path in self.last_uploaded:
                path.unlink(missing_ok=True)
            return commit_info

    scheduler = nullcontext() if dry_run else CommitAndCleanScheduler(repo_id=dst, repo_type="dataset", folder_path="dst", path_in_repo="data", allow_patterns="*.parquet", every=0.1, private=private)
    with scheduler:
        con = duckdb.connect(":memory:", config=CONFIG)
        try:
            # Resolve the reader function and file path for the requested subset/split.
            lookup = CMD_SRC_KWARGS.format(src=_sql_literal(src), config=_sql_literal(config), split=_sql_literal(split))
            src_kwargs = con.sql(lookup).df().to_dict(orient="records")
            if not src_kwargs:
                raise ValueError(f'Invalid --config "{config}" for dataset "{src}", please select a valid dataset config/subset.')
            source = src_kwargs[0]
            view_template = CMD_SRC_DRY_RUN if dry_run else CMD_SRC
            con.sql(view_template.format(function=source["function"], path=_sql_literal(source["path"])))
            if dry_run:
                print(f"Sample data from '{src}' that would be written to '{dst}':\n")
            else:
                con.sql("PRAGMA enable_progress_bar;")
            result = con.sql((CMD_DST_DRY_RUN if dry_run else CMD_DST).format(query=query.rstrip("\n ;")))
            if dry_run:
                print(result.df().to_markdown())
            else:
                print("done")
        finally:
            # Release the DuckDB connection even when the query fails.
            con.close()
|
63 |
+
|
64 |
+
|
65 |
+
if __name__ == '__main__':
|
66 |
+
fire.Fire(sql)
|