ragunath-ravi committed on
Commit c95f0b1 · verified · 1 Parent(s): 7977632

Update app.py

Files changed (1)
  1. app.py +18 -62
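The commit touches only app.py. The file's (now commented-out) docstring describes the flow: the user supplies hf_username, repo_name, dataset_url, and hf_token; the app creates or reuses the dataset repo at https://huggingface.co/datasets/{username}/{repo_name}, stream-downloads the file, optionally extracts archives, uploads the contents, and returns a status log plus the repo URL. A minimal sketch of driving that flow without the Gradio UI, using the process_and_upload signature shown in the diff below; the import path and every value here are placeholders, not part of the commit:

```python
# Hypothetical direct call to the app's core function, bypassing the UI.
# All values are placeholders; importing app also builds (but does not launch) the Blocks UI.
from app import process_and_upload

log_text, repo_url = process_and_upload(
    hf_username="your-username",                 # placeholder
    repo_name="my-dataset",                      # placeholder
    dataset_url="https://example.com/data.zip",  # placeholder
    hf_token="hf_xxx",                           # use a real token; never commit it
    extract_archive=True,
)
print(log_text)
print(repo_url)
```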
app.py CHANGED
@@ -1,25 +1,12 @@
- """
- app.py

- Gradio app to download a dataset from a provided URL and push it to a Hugging Face
- datasets repository under the provided username/repo_name.

- Requirements (install with pip):
- pip install gradio huggingface_hub requests tqdm
-
- How it works:
- 1. User supplies: hf_username, repo_name, dataset_url, hf_token
- 2. App creates (or re-uses) a dataset repo on Hugging Face:
- https://huggingface.co/datasets/{username}/{repo_name}
- 3. Downloads the file (streamed), optionally extracts if archive.
- 4. Uploads one or more files into the dataset repo.
- 5. Returns a status log and repo URL.
-
- Security note:
- - The HF token is required to create/upload to the user's repo. Never share your token.
- - This script performs synchronous downloads/uploads; large datasets may take time and
- will use local disk space while processing.
- """

  import os
  import tempfile
@@ -33,27 +20,19 @@ from huggingface_hub import HfApi
  import gradio as gr
  from tqdm import tqdm

- # ---------- Helper functions ----------
-
  def filename_from_url(url: str) -> str:
- """Try to determine a filename from a URL; fallback to 'download'."""
  parsed = urlparse(url)
  name = os.path.basename(parsed.path)
- if not name:
- return "downloaded_file"
- return name

  def stream_download(url: str, dest_path: str, chunk_size: int = 32768, logger=None):
- """
- Stream download a URL to dest_path. Returns the final path.
- """
  with requests.get(url, stream=True, allow_redirects=True, timeout=30) as r:
  r.raise_for_status()
  total = r.headers.get("content-length")
  total = int(total) if total and total.isdigit() else None
  with open(dest_path, "wb") as f:
  if total:
- for chunk in tqdm(r.iter_content(chunk_size=chunk_size), total=total // chunk_size + 1, unit="chunk"):
  if chunk:
  f.write(chunk)
  if logger:
@@ -67,10 +46,6 @@ def stream_download(url: str, dest_path: str, chunk_size: int = 32768, logger=No
  return dest_path

  def extract_archive_if_needed(filepath: str, extract_to: str, logger=None):
- """
- If filepath is a zip or tar(.*) archive, extract it into extract_to and return True.
- Otherwise, return False.
- """
  lowered = filepath.lower()
  try:
  if zipfile.is_zipfile(filepath) or lowered.endswith(".zip"):
@@ -79,7 +54,6 @@ def extract_archive_if_needed(filepath: str, extract_to: str, logger=None):
  if logger:
  logger.append(f"Extracted zip archive to {extract_to}")
  return True
- # tar-like archives
  if tarfile.is_tarfile(filepath) or any(lowered.endswith(ext) for ext in [".tar", ".tar.gz", ".tgz", ".tar.bz2", ".tar.xz"]):
  with tarfile.open(filepath, "r:*") as tar:
  tar.extractall(path=extract_to)
@@ -93,14 +67,9 @@ def extract_archive_if_needed(filepath: str, extract_to: str, logger=None):
  return False

  def upload_directory_to_hf(api: HfApi, local_dir: str, repo_id: str, token: str, logger=None):
- """
- Walk local_dir and upload files to the HF dataset repo preserving relative paths.
- Uses HfApi.upload_file for each file. Skips hidden files and .git.
- """
  local_dir = Path(local_dir)
  uploaded = 0
  for root, dirs, files in os.walk(local_dir):
- # skip .git and hidden directories
  dirs[:] = [d for d in dirs if not d.startswith(".")]
  for file in files:
  if file.startswith("."):
@@ -108,7 +77,7 @@ def upload_directory_to_hf(api: HfApi, local_dir: str, repo_id: str, token: str,
  full_path = Path(root) / file
  rel_path = full_path.relative_to(local_dir).as_posix()
  try:
- logger and logger.append(f"Uploading {rel_path} ...")
  api.upload_file(
  path_or_fileobj=str(full_path),
  path_in_repo=rel_path,
@@ -116,50 +85,41 @@ def upload_directory_to_hf(api: HfApi, local_dir: str, repo_id: str, token: str,
  repo_type="dataset",
  token=token,
  )
- logger and logger.append(f" uploaded {rel_path}")
  uploaded += 1
  except Exception as e:
- logger and logger.append(f" ERROR uploading {rel_path}: {e}")
  return uploaded

- # ---------- Core processing function (Gradio) ----------
-
  def process_and_upload(hf_username: str, repo_name: str, dataset_url: str, hf_token: str, extract_archive: bool):
- """
- Downloads dataset_url, creates dataset repo under hf_username/repo_name,
- and uploads either the single file or extracted contents.
- Returns text log and the final repo URL.
- """
  log = []
- # basic validation
  if not hf_username or not repo_name or not dataset_url:
  return "ERROR: hf_username, repo_name and dataset_url are required.", ""

  if not hf_token:
- return "ERROR: A Hugging Face token is required to create/upload repositories. "
- "Create one at https://huggingface.co/settings/tokens with 'repo' and 'upload' scopes.", ""

  repo_id = f"{hf_username}/{repo_name}"
  hf_api = HfApi()

- # 1) create repo (dataset repo_type)
  try:
  log.append(f"Creating (or verifying) dataset repo {repo_id} ...")
- # create_repo has exist_ok parameter in newer versions; if not available, catch exception
  try:
  hf_api.create_repo(repo_id=repo_id, repo_type="dataset", token=hf_token, exist_ok=True)
  except TypeError:
- # older huggingface_hub versions may not support exist_ok, so try-except
  try:
  hf_api.create_repo(repo_id=repo_id, repo_type="dataset", token=hf_token)
  except Exception:
- # if it already exists, it will raise — that's fine
  log.append("Repo may already exist or creation raised an error; continuing.")
  log.append("Repo ready.")
  except Exception as e:
  return f"ERROR creating/validating repo {repo_id}: {e}", ""

- # 2) download the dataset_url to a temp file
  tmpdir = tempfile.mkdtemp(prefix="hf_dataset_")
  try:
  suggested_name = filename_from_url(dataset_url)
@@ -171,19 +131,16 @@ def process_and_upload(hf_username: str, repo_name: str, dataset_url: str, hf_to
  except Exception as e:
  raise RuntimeError(f"Download failed: {e}")

- # 3) If extract requested and it is an archive, extract
  extracted = False
  if extract_archive:
  log.append("Checking whether to extract archive...")
  extracted = extract_archive_if_needed(dest_file, tmpdir, logger=log)

- # 4) If extraction happened, upload folder contents. Otherwise upload single file.
  uploaded_count = 0
  if extracted:
  log.append("Uploading extracted files...")
  uploaded_count = upload_directory_to_hf(hf_api, tmpdir, repo_id, hf_token, logger=log)
  else:
- # upload the single file into repo root, use the same filename
  log.append(f"Uploading single file {os.path.basename(dest_file)} ...")
  try:
  hf_api.upload_file(
@@ -211,7 +168,6 @@ def process_and_upload(hf_username: str, repo_name: str, dataset_url: str, hf_to
  pass

  # ---------- Gradio UI ----------
-
  with gr.Blocks(title="HF Dataset Uploader") as demo:
  gr.Markdown("# Hugging Face — dataset uploader\n"
  "Enter your Hugging Face username, desired dataset repo name, a downloadable dataset URL, and your HF token. "
 
+ # """
+ # app.py

+ # Gradio app to download a dataset from a provided URL and push it to a Hugging Face
+ # datasets repository under the provided username/repo_name.

+ # Requirements:
+ # pip install gradio huggingface-hub requests tqdm
+ # """

  import os
  import tempfile
  import gradio as gr
  from tqdm import tqdm

  def filename_from_url(url: str) -> str:
  parsed = urlparse(url)
  name = os.path.basename(parsed.path)
+ return name or "downloaded_file"

  def stream_download(url: str, dest_path: str, chunk_size: int = 32768, logger=None):
  with requests.get(url, stream=True, allow_redirects=True, timeout=30) as r:
  r.raise_for_status()
  total = r.headers.get("content-length")
  total = int(total) if total and total.isdigit() else None
  with open(dest_path, "wb") as f:
  if total:
+ for chunk in tqdm(r.iter_content(chunk_size=chunk_size), total=(total // chunk_size) + 1, unit="chunk"):
  if chunk:
  f.write(chunk)
  if logger:
  return dest_path

  def extract_archive_if_needed(filepath: str, extract_to: str, logger=None):
  lowered = filepath.lower()
  try:
  if zipfile.is_zipfile(filepath) or lowered.endswith(".zip"):
  if logger:
  logger.append(f"Extracted zip archive to {extract_to}")
  return True
  if tarfile.is_tarfile(filepath) or any(lowered.endswith(ext) for ext in [".tar", ".tar.gz", ".tgz", ".tar.bz2", ".tar.xz"]):
  with tarfile.open(filepath, "r:*") as tar:
  tar.extractall(path=extract_to)
  return False

  def upload_directory_to_hf(api: HfApi, local_dir: str, repo_id: str, token: str, logger=None):
  local_dir = Path(local_dir)
  uploaded = 0
  for root, dirs, files in os.walk(local_dir):
  dirs[:] = [d for d in dirs if not d.startswith(".")]
  for file in files:
  if file.startswith("."):
  full_path = Path(root) / file
  rel_path = full_path.relative_to(local_dir).as_posix()
  try:
+ if logger: logger.append(f"Uploading {rel_path} ...")
  api.upload_file(
  path_or_fileobj=str(full_path),
  path_in_repo=rel_path,
  repo_type="dataset",
  token=token,
  )
+ if logger: logger.append(f" uploaded {rel_path}")
  uploaded += 1
  except Exception as e:
+ if logger: logger.append(f" ERROR uploading {rel_path}: {e}")
  return uploaded

  def process_and_upload(hf_username: str, repo_name: str, dataset_url: str, hf_token: str, extract_archive: bool):
  log = []
  if not hf_username or not repo_name or not dataset_url:
  return "ERROR: hf_username, repo_name and dataset_url are required.", ""

  if not hf_token:
+ # fixed: parenthesize the multi-line string so Python treats it as one expression
+ return (
+ "ERROR: A Hugging Face token is required to create/upload repositories. "
+ "Create one at https://huggingface.co/settings/tokens with 'repo' and 'upload' scopes."
+ ), ""

  repo_id = f"{hf_username}/{repo_name}"
  hf_api = HfApi()

  try:
  log.append(f"Creating (or verifying) dataset repo {repo_id} ...")
  try:
+ # some versions support exist_ok, some do not — try both ways
  hf_api.create_repo(repo_id=repo_id, repo_type="dataset", token=hf_token, exist_ok=True)
  except TypeError:
  try:
  hf_api.create_repo(repo_id=repo_id, repo_type="dataset", token=hf_token)
  except Exception:
  log.append("Repo may already exist or creation raised an error; continuing.")
  log.append("Repo ready.")
  except Exception as e:
  return f"ERROR creating/validating repo {repo_id}: {e}", ""

  tmpdir = tempfile.mkdtemp(prefix="hf_dataset_")
  try:
  suggested_name = filename_from_url(dataset_url)
  except Exception as e:
  raise RuntimeError(f"Download failed: {e}")

  extracted = False
  if extract_archive:
  log.append("Checking whether to extract archive...")
  extracted = extract_archive_if_needed(dest_file, tmpdir, logger=log)

  uploaded_count = 0
  if extracted:
  log.append("Uploading extracted files...")
  uploaded_count = upload_directory_to_hf(hf_api, tmpdir, repo_id, hf_token, logger=log)
  else:
  log.append(f"Uploading single file {os.path.basename(dest_file)} ...")
  try:
  hf_api.upload_file(
  pass

  # ---------- Gradio UI ----------
  with gr.Blocks(title="HF Dataset Uploader") as demo:
  gr.Markdown("# Hugging Face — dataset uploader\n"
  "Enter your Hugging Face username, desired dataset repo name, a downloadable dataset URL, and your HF token. "