Spaces:
Sleeping
Sleeping
Commit
·
5f7ffc5
1
Parent(s):
de729bd
deal with samples properly
Browse files
app.py
CHANGED
|
@@ -136,14 +136,31 @@ def process_pdfs(
|
|
| 136 |
progress(0, desc="Starting PDF processing")
|
| 137 |
images, message = pdf_to_images(pdf_files, sample_size, images_dir)
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
zip_path = None
|
| 140 |
if create_zip:
|
| 141 |
-
# Create a zip file of the images
|
| 142 |
zip_path = os.path.join(temp_dir, "converted_images.zip")
|
| 143 |
with zipfile.ZipFile(zip_path, "w") as zipf:
|
| 144 |
progress(0, desc="Zipping images")
|
| 145 |
for image in progress.tqdm(images, desc="Zipping images"):
|
| 146 |
-
zipf.write(
|
|
|
|
|
|
|
|
|
|
| 147 |
message += f"\nCreated zip file with {len(images)} images"
|
| 148 |
|
| 149 |
if hf_repo:
|
|
@@ -154,11 +171,12 @@ def process_pdfs(
|
|
| 154 |
repo_type="dataset",
|
| 155 |
private=private_repo,
|
| 156 |
)
|
| 157 |
-
|
| 158 |
-
|
|
|
|
| 159 |
repo_id=hf_repo,
|
| 160 |
repo_type="dataset",
|
| 161 |
-
|
| 162 |
)
|
| 163 |
|
| 164 |
# Determine size category
|
|
|
|
| 136 |
progress(0, desc="Starting PDF processing")
|
| 137 |
images, message = pdf_to_images(pdf_files, sample_size, images_dir)
|
| 138 |
|
| 139 |
+
# Create a new directory for sampled images
|
| 140 |
+
sampled_images_dir = os.path.join(temp_dir, "sampled_images")
|
| 141 |
+
os.makedirs(sampled_images_dir)
|
| 142 |
+
|
| 143 |
+
# Move sampled images to the new directory and update paths
|
| 144 |
+
updated_images = []
|
| 145 |
+
for image in images:
|
| 146 |
+
new_path = os.path.join(sampled_images_dir, os.path.basename(image))
|
| 147 |
+
shutil.move(image, new_path)
|
| 148 |
+
updated_images.append(new_path)
|
| 149 |
+
|
| 150 |
+
# Update the images list with new paths
|
| 151 |
+
images = updated_images
|
| 152 |
+
|
| 153 |
zip_path = None
|
| 154 |
if create_zip:
|
| 155 |
+
# Create a zip file of the sampled images
|
| 156 |
zip_path = os.path.join(temp_dir, "converted_images.zip")
|
| 157 |
with zipfile.ZipFile(zip_path, "w") as zipf:
|
| 158 |
progress(0, desc="Zipping images")
|
| 159 |
for image in progress.tqdm(images, desc="Zipping images"):
|
| 160 |
+
zipf.write(
|
| 161 |
+
os.path.join(sampled_images_dir, os.path.basename(image)),
|
| 162 |
+
os.path.basename(image),
|
| 163 |
+
)
|
| 164 |
message += f"\nCreated zip file with {len(images)} images"
|
| 165 |
|
| 166 |
if hf_repo:
|
|
|
|
| 171 |
repo_type="dataset",
|
| 172 |
private=private_repo,
|
| 173 |
)
|
| 174 |
+
# Upload only the sampled images directory
|
| 175 |
+
hf_api.upload_folder(
|
| 176 |
+
folder_path=sampled_images_dir,
|
| 177 |
repo_id=hf_repo,
|
| 178 |
repo_type="dataset",
|
| 179 |
+
path_in_repo="images",
|
| 180 |
)
|
| 181 |
|
| 182 |
# Determine size category
|