Update app.py
app.py
CHANGED
@@ -1,25 +1,12 @@
-"""
-app.py
-
-Gradio app to download a dataset from a provided URL and push it to a Hugging Face
-datasets repository under the provided username/repo_name.
-
-Requirements
-
-
-How it works:
-1. User supplies: hf_username, repo_name, dataset_url, hf_token
-2. App creates (or re-uses) a dataset repo on Hugging Face:
-   https://huggingface.co/datasets/{username}/{repo_name}
-3. Downloads the file (streamed), optionally extracts if archive.
-4. Uploads one or more files into the dataset repo.
-5. Returns a status log and repo URL.
-
-Security note:
-- The HF token is required to create/upload to the user's repo. Never share your token.
-- This script performs synchronous downloads/uploads; large datasets may take time and
-  will use local disk space while processing.
-"""
+# """
+# app.py
+
+# Gradio app to download a dataset from a provided URL and push it to a Hugging Face
+# datasets repository under the provided username/repo_name.
+
+# Requirements:
+#     pip install gradio huggingface-hub requests tqdm
+# """
 
 import os
 import tempfile
@@ -33,27 +20,19 @@ from huggingface_hub import HfApi
 import gradio as gr
 from tqdm import tqdm
 
-# ---------- Helper functions ----------
-
 def filename_from_url(url: str) -> str:
-    """Try to determine a filename from a URL; fallback to 'download'."""
     parsed = urlparse(url)
     name = os.path.basename(parsed.path)
-    if not name:
-        return "downloaded_file"
-    return name
+    return name or "downloaded_file"
 
 def stream_download(url: str, dest_path: str, chunk_size: int = 32768, logger=None):
-    """
-    Stream download a URL to dest_path. Returns the final path.
-    """
     with requests.get(url, stream=True, allow_redirects=True, timeout=30) as r:
         r.raise_for_status()
         total = r.headers.get("content-length")
         total = int(total) if total and total.isdigit() else None
         with open(dest_path, "wb") as f:
             if total:
-                for chunk in tqdm(r.iter_content(chunk_size=chunk_size), total=total // chunk_size + 1, unit="chunk"):
+                for chunk in tqdm(r.iter_content(chunk_size=chunk_size), total=(total // chunk_size) + 1, unit="chunk"):
                     if chunk:
                         f.write(chunk)
                         if logger:
@@ -67,10 +46,6 @@ def stream_download(url: str, dest_path: str, chunk_size: int = 32768, logger=None):
     return dest_path
 
 def extract_archive_if_needed(filepath: str, extract_to: str, logger=None):
-    """
-    If filepath is a zip or tar(.*) archive, extract it into extract_to and return True.
-    Otherwise, return False.
-    """
     lowered = filepath.lower()
     try:
         if zipfile.is_zipfile(filepath) or lowered.endswith(".zip"):
@@ -79,7 +54,6 @@ def extract_archive_if_needed(filepath: str, extract_to: str, logger=None):
             if logger:
                 logger.append(f"Extracted zip archive to {extract_to}")
             return True
-        # tar-like archives
         if tarfile.is_tarfile(filepath) or any(lowered.endswith(ext) for ext in [".tar", ".tar.gz", ".tgz", ".tar.bz2", ".tar.xz"]):
             with tarfile.open(filepath, "r:*") as tar:
                 tar.extractall(path=extract_to)
@@ -93,14 +67,9 @@ def extract_archive_if_needed(filepath: str, extract_to: str, logger=None):
     return False
 
 def upload_directory_to_hf(api: HfApi, local_dir: str, repo_id: str, token: str, logger=None):
-    """
-    Walk local_dir and upload files to the HF dataset repo preserving relative paths.
-    Uses HfApi.upload_file for each file. Skips hidden files and .git.
-    """
     local_dir = Path(local_dir)
     uploaded = 0
     for root, dirs, files in os.walk(local_dir):
-        # skip .git and hidden directories
         dirs[:] = [d for d in dirs if not d.startswith(".")]
         for file in files:
             if file.startswith("."):
@@ -108,7 +77,7 @@ def upload_directory_to_hf(api: HfApi, local_dir: str, repo_id: str, token: str, logger=None):
             full_path = Path(root) / file
             rel_path = full_path.relative_to(local_dir).as_posix()
             try:
-                logger
+                if logger: logger.append(f"Uploading {rel_path} ...")
                 api.upload_file(
                     path_or_fileobj=str(full_path),
                     path_in_repo=rel_path,
@@ -116,50 +85,41 @@ def upload_directory_to_hf(api: HfApi, local_dir: str, repo_id: str, token: str, logger=None):
                     repo_type="dataset",
                     token=token,
                 )
-                logger
+                if logger: logger.append(f" uploaded {rel_path}")
                 uploaded += 1
             except Exception as e:
-                logger
+                if logger: logger.append(f" ERROR uploading {rel_path}: {e}")
     return uploaded
 
-# ---------- Core processing function (Gradio) ----------
-
 def process_and_upload(hf_username: str, repo_name: str, dataset_url: str, hf_token: str, extract_archive: bool):
-    """
-    Downloads dataset_url, creates dataset repo under hf_username/repo_name,
-    and uploads either the single file or extracted contents.
-    Returns text log and the final repo URL.
-    """
     log = []
-    # basic validation
     if not hf_username or not repo_name or not dataset_url:
         return "ERROR: hf_username, repo_name and dataset_url are required.", ""
 
     if not hf_token:
-
-
+        # fixed: parenthesize the multi-line string so Python treats it as one expression
+        return (
+            "ERROR: A Hugging Face token is required to create/upload repositories. "
+            "Create one at https://huggingface.co/settings/tokens with 'repo' and 'upload' scopes."
+        ), ""
 
     repo_id = f"{hf_username}/{repo_name}"
     hf_api = HfApi()
 
-    # 1) create repo (dataset repo_type)
     try:
         log.append(f"Creating (or verifying) dataset repo {repo_id} ...")
-        # create_repo has exist_ok parameter in newer versions; if not available, catch exception
         try:
+            # some versions support exist_ok, some do not — try both ways
            hf_api.create_repo(repo_id=repo_id, repo_type="dataset", token=hf_token, exist_ok=True)
         except TypeError:
-            # older huggingface_hub versions may not support exist_ok, so try-except
             try:
                 hf_api.create_repo(repo_id=repo_id, repo_type="dataset", token=hf_token)
             except Exception:
-                # if it already exists, it will raise — that's fine
                 log.append("Repo may already exist or creation raised an error; continuing.")
         log.append("Repo ready.")
     except Exception as e:
         return f"ERROR creating/validating repo {repo_id}: {e}", ""
 
-    # 2) download the dataset_url to a temp file
     tmpdir = tempfile.mkdtemp(prefix="hf_dataset_")
     try:
         suggested_name = filename_from_url(dataset_url)
@@ -171,19 +131,16 @@ def process_and_upload(hf_username: str, repo_name: str, dataset_url: str, hf_token: str, extract_archive: bool):
     except Exception as e:
         raise RuntimeError(f"Download failed: {e}")
 
-    # 3) If extract requested and it is an archive, extract
     extracted = False
     if extract_archive:
         log.append("Checking whether to extract archive...")
         extracted = extract_archive_if_needed(dest_file, tmpdir, logger=log)
 
-    # 4) If extraction happened, upload folder contents. Otherwise upload single file.
     uploaded_count = 0
     if extracted:
         log.append("Uploading extracted files...")
         uploaded_count = upload_directory_to_hf(hf_api, tmpdir, repo_id, hf_token, logger=log)
     else:
-        # upload the single file into repo root, use the same filename
        log.append(f"Uploading single file {os.path.basename(dest_file)} ...")
         try:
             hf_api.upload_file(
@@ -211,7 +168,6 @@ def process_and_upload(hf_username: str, repo_name: str, dataset_url: str, hf_token: str, extract_archive: bool):
             pass
 
 # ---------- Gradio UI ----------
-
 with gr.Blocks(title="HF Dataset Uploader") as demo:
     gr.Markdown("# Hugging Face — dataset uploader\n"
                 "Enter your Hugging Face username, desired dataset repo name, a downloadable dataset URL, and your HF token. "