# """ # app.py # Gradio app to download a dataset from a provided URL and push it to a Hugging Face # datasets repository under the provided username/repo_name. # Requirements: # pip install gradio huggingface-hub requests tqdm # """ import os import tempfile import shutil import requests from urllib.parse import urlparse import zipfile import tarfile from pathlib import Path from huggingface_hub import HfApi import gradio as gr from tqdm import tqdm def filename_from_url(url: str) -> str: parsed = urlparse(url) name = os.path.basename(parsed.path) return name or "downloaded_file" def stream_download(url: str, dest_path: str, chunk_size: int = 32768, logger=None): with requests.get(url, stream=True, allow_redirects=True, timeout=30) as r: r.raise_for_status() total = r.headers.get("content-length") total = int(total) if total and total.isdigit() else None with open(dest_path, "wb") as f: if total: for chunk in tqdm(r.iter_content(chunk_size=chunk_size), total=(total // chunk_size) + 1, unit="chunk"): if chunk: f.write(chunk) if logger: logger.append(f" wrote {len(chunk)} bytes") else: for chunk in r.iter_content(chunk_size=chunk_size): if chunk: f.write(chunk) if logger: logger.append(f" wrote {len(chunk)} bytes") return dest_path def extract_archive_if_needed(filepath: str, extract_to: str, logger=None): lowered = filepath.lower() try: if zipfile.is_zipfile(filepath) or lowered.endswith(".zip"): with zipfile.ZipFile(filepath, "r") as z: z.extractall(extract_to) if logger: logger.append(f"Extracted zip archive to {extract_to}") return True if tarfile.is_tarfile(filepath) or any(lowered.endswith(ext) for ext in [".tar", ".tar.gz", ".tgz", ".tar.bz2", ".tar.xz"]): with tarfile.open(filepath, "r:*") as tar: tar.extractall(path=extract_to) if logger: logger.append(f"Extracted tar archive to {extract_to}") return True except Exception as e: if logger: logger.append(f"Extraction failed: {e}") return False return False def upload_directory_to_hf(api: HfApi, local_dir: str, repo_id: str, token: str, logger=None): local_dir = Path(local_dir) uploaded = 0 for root, dirs, files in os.walk(local_dir): dirs[:] = [d for d in dirs if not d.startswith(".")] for file in files: if file.startswith("."): continue full_path = Path(root) / file rel_path = full_path.relative_to(local_dir).as_posix() try: if logger: logger.append(f"Uploading {rel_path} ...") api.upload_file( path_or_fileobj=str(full_path), path_in_repo=rel_path, repo_id=repo_id, repo_type="dataset", token=token, ) if logger: logger.append(f" uploaded {rel_path}") uploaded += 1 except Exception as e: if logger: logger.append(f" ERROR uploading {rel_path}: {e}") return uploaded def process_and_upload(hf_username: str, repo_name: str, dataset_url: str, hf_token: str, extract_archive: bool): log = [] if not hf_username or not repo_name or not dataset_url: return "ERROR: hf_username, repo_name and dataset_url are required.", "" if not hf_token: # fixed: parenthesize the multi-line string so Python treats it as one expression return ( "ERROR: A Hugging Face token is required to create/upload repositories. " "Create one at https://huggingface.co/settings/tokens with 'repo' and 'upload' scopes." 
), "" repo_id = f"{hf_username}/{repo_name}" hf_api = HfApi() try: log.append(f"Creating (or verifying) dataset repo {repo_id} ...") try: # some versions support exist_ok, some do not — try both ways hf_api.create_repo(repo_id=repo_id, repo_type="dataset", token=hf_token, exist_ok=True) except TypeError: try: hf_api.create_repo(repo_id=repo_id, repo_type="dataset", token=hf_token) except Exception: log.append("Repo may already exist or creation raised an error; continuing.") log.append("Repo ready.") except Exception as e: return f"ERROR creating/validating repo {repo_id}: {e}", "" tmpdir = tempfile.mkdtemp(prefix="hf_dataset_") try: suggested_name = filename_from_url(dataset_url) dest_file = os.path.join(tmpdir, suggested_name) log.append(f"Downloading {dataset_url} -> {dest_file} ... (streaming)") try: stream_download(dataset_url, dest_file, logger=log) log.append("Download finished.") except Exception as e: raise RuntimeError(f"Download failed: {e}") extracted = False if extract_archive: log.append("Checking whether to extract archive...") extracted = extract_archive_if_needed(dest_file, tmpdir, logger=log) uploaded_count = 0 if extracted: log.append("Uploading extracted files...") uploaded_count = upload_directory_to_hf(hf_api, tmpdir, repo_id, hf_token, logger=log) else: log.append(f"Uploading single file {os.path.basename(dest_file)} ...") try: hf_api.upload_file( path_or_fileobj=dest_file, path_in_repo=os.path.basename(dest_file), repo_id=repo_id, repo_type="dataset", token=hf_token, ) uploaded_count = 1 log.append(f"Uploaded {os.path.basename(dest_file)}") except Exception as e: raise RuntimeError(f"Upload failed: {e}") repo_url = f"https://huggingface.co/datasets/{repo_id}" log.append(f"Done. Uploaded {uploaded_count} file(s).") log.append(f"Dataset repo URL: {repo_url}") return "\n".join(log), repo_url except Exception as e: return f"ERROR during processing: {e}\n\nLog so far:\n" + "\n".join(log), "" finally: try: shutil.rmtree(tmpdir) except Exception: pass # ---------- Gradio UI ---------- with gr.Blocks(title="HF Dataset Uploader") as demo: gr.Markdown("# Hugging Face — dataset uploader\n" "Enter your Hugging Face username, desired dataset repo name, a downloadable dataset URL, and your HF token. " "This app will download the URL and save its contents into `https://huggingface.co/datasets//`.\n\n" "**Warning:** Provide a token with `repo`/`upload` permissions. Large uploads may take time.") with gr.Row(): with gr.Column(): hf_username_in = gr.Textbox(label="Hugging Face username", placeholder="your-hf-username", lines=1) repo_name_in = gr.Textbox(label="Dataset repository name", placeholder="my_dataset_repo", lines=1) dataset_url_in = gr.Textbox(label="Downloadable dataset URL (direct link)", placeholder="https://example.com/data.zip", lines=1) hf_token_in = gr.Textbox(label="Hugging Face token (write/upload)", placeholder="hf_xxx", type="password", lines=1) extract_chk = gr.Checkbox(label="Extract archive if ZIP/TAR? 

# ---------- Gradio UI ----------
with gr.Blocks(title="HF Dataset Uploader") as demo:
    gr.Markdown(
        "# Hugging Face — dataset uploader\n"
        "Enter your Hugging Face username, desired dataset repo name, a downloadable dataset URL, and your HF token. "
        "This app will download the URL and save its contents into `https://huggingface.co/datasets/<username>/<repo_name>`.\n\n"
        "**Warning:** Provide a token with `repo`/`upload` permissions. Large uploads may take time."
    )
    with gr.Row():
        with gr.Column():
            hf_username_in = gr.Textbox(label="Hugging Face username", placeholder="your-hf-username", lines=1)
            repo_name_in = gr.Textbox(label="Dataset repository name", placeholder="my_dataset_repo", lines=1)
            dataset_url_in = gr.Textbox(label="Downloadable dataset URL (direct link)",
                                        placeholder="https://example.com/data.zip", lines=1)
            hf_token_in = gr.Textbox(label="Hugging Face token (write/upload)", placeholder="hf_xxx",
                                     type="password", lines=1)
            extract_chk = gr.Checkbox(label="Extract archive if ZIP/TAR (recommended)", value=True)
            submit_btn = gr.Button("Download & Upload")
        with gr.Column():
            output_log = gr.Textbox(label="Status log", interactive=False, lines=18)
            repo_link = gr.Textbox(label="Dataset repo URL (if successful)", interactive=False, lines=1)

    def submit_click(hf_username, repo_name, dataset_url, hf_token, extract_archive):
        log, url = process_and_upload(hf_username.strip(), repo_name.strip(),
                                      dataset_url.strip(), hf_token.strip(), extract_archive)
        return log, url

    submit_btn.click(fn=submit_click,
                     inputs=[hf_username_in, repo_name_in, dataset_url_in, hf_token_in, extract_chk],
                     outputs=[output_log, repo_link])


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", share=False)
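
# For long-running downloads/uploads, Gradio's request queue can help keep the
# connection alive. A sketch, assuming a Gradio version with queue support:
#
#     demo.queue()
#     demo.launch(server_name="0.0.0.0", share=False)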