MRiabov committed on
Commit
ebeb96c
·
1 Parent(s): 8ea2eff

Dataset pull script

Browse files
Files changed (1) hide show
  1. gdrive_pull.py +75 -0
gdrive_pull.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ from pydrive2.auth import GoogleAuth
4
+ from pydrive2.drive import GoogleDrive
5
+ from tqdm import tqdm
6
+ from pathlib import Path
7
+
8
+
9
def authenticate(service_account_json):
    """Build a GoogleDrive client authorised through a service-account key.

    Args:
        service_account_json: Path to the service-account JSON key file.

    Returns:
        A ``GoogleDrive`` object wrapping the authenticated ``GoogleAuth``.
    """
    service_config = {
        "client_json_file_path": service_account_json,
        # The key is supplied so PyDrive2 accepts the config even when no
        # user is being impersonated.
        # NOTE(review): the value looks like a redacted placeholder — confirm.
        "client_user_email": "[email protected]",
    }

    auth = GoogleAuth()
    # Switch PyDrive2 to its service-account backend before authorising.
    auth.settings["client_config_backend"] = "service"
    auth.settings["service_config"] = service_config
    auth.ServiceAuth()
    return GoogleDrive(auth)
22
+
23
+
24
def list_files_with_paths(drive, folder_id, prefix=""):
    """Walk a Drive folder depth-first and gather every non-folder entry.

    Args:
        drive: Object exposing ``ListFile(params).GetList()``.
        folder_id: ID of the folder whose contents are listed.
        prefix: Relative path prepended to every title found below the folder.

    Returns:
        A list of ``(file, relative_path)`` tuples in listing order, with the
        contents of each sub-folder placed where that sub-folder was listed.
    """
    folder_mime = "application/vnd.google-apps.folder"

    def _walk(parent_id, parent_path):
        listing = drive.ListFile(
            {
                "q": f"'{parent_id}' in parents and trashed=false",
                "maxResults": 1000,
            }
        ).GetList()
        for entry in listing:
            is_folder = entry["mimeType"] == folder_mime
            title = entry["title"]
            # An empty parent path means the entry sits at the top level.
            entry_path = os.path.join(parent_path, title) if parent_path else title
            if is_folder:
                yield from _walk(entry["id"], entry_path)
            else:
                yield entry, entry_path

    return list(_walk(folder_id, prefix))
38
+
39
+
40
def download_folder(folder_id, dest, service_account_json):
    """Download every file under a Drive folder into a local directory.

    Args:
        folder_id: ID of the Drive folder to download.
        dest: Local directory to write into; created if it does not exist.
        service_account_json: Path to the service-account JSON key file.
    """
    gdrive = authenticate(service_account_json)
    os.makedirs(dest, exist_ok=True)

    print(f"Listing files in folder {folder_id}...")
    entries = list_files_with_paths(gdrive, folder_id)
    print(f"Found {len(entries)} files. Downloading...")

    progress = tqdm(entries, desc="Downloading", unit="file")
    for remote_file, relative in progress:
        target = os.path.join(dest, relative)
        # Recreate the folder layout locally before writing the file.
        parent_dir = os.path.dirname(target)
        os.makedirs(parent_dir, exist_ok=True)
        remote_file.GetContentFile(target)
52
+
53
+
54
def main():
    """Parse the command-line arguments and download the requested folder."""
    cli = argparse.ArgumentParser(
        description="Download a full Google Drive folder using a service account"
    )

    positionals = (
        ("folder_id", "Google Drive folder ID"),
        ("output_dir", "Directory to save files"),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)

    cli.add_argument(
        "--service-account",
        default="service_account.json",
        help="Path to your Google service account JSON key file",
    )

    opts = cli.parse_args()
    download_folder(opts.folder_id, opts.output_dir, opts.service_account)
68
+
69
+
70
if __name__ == "__main__":
    # Equivalent of `mkdir -p dataset/`: create the directory (and any missing
    # parents) and do nothing if it already exists. The keyword is `exist_ok`;
    # the previous `exists_ok` spelling raised TypeError before main() ran.
    path = Path("./dataset")
    path.mkdir(parents=True, exist_ok=True)

    main()