|
import gradio as gr |
|
from datasets import load_dataset |
|
from PIL import Image |
|
import io |
|
import time |
|
import os |
|
from datetime import datetime, timedelta |
|
import json |
|
|
|
# Hugging Face access token, taken from the environment (None when unset).
access_token = os.getenv("HUGGINGFACE_TOKEN")

# Module-level state rebound by load_and_prepare_dataset().
dataset = None  # streaming dataset handle
dataset_size = "Unknown"  # train-split example count, or "Unknown"
last_refresh_time = None  # datetime of the most recent load

# Age at which check_and_refresh_dataset() reloads the dataset.
REFRESH_INTERVAL = timedelta(days=1)
|
|
|
def load_and_prepare_dataset():
    """Open the dataset in streaming mode and refresh the module-level state.

    Rebinds the globals ``dataset``, ``dataset_size`` and
    ``last_refresh_time``. ``dataset_size`` becomes the ``train`` split's
    example count when the dataset metadata reports one, otherwise the
    string ``"Unknown"``.
    """
    global dataset, dataset_size, last_refresh_time

    dataset = load_dataset(
        "taesiri/PhotoshopRequest-DailyDump-January-2025-RandomSample",
        split="train",
        streaming=True,
        token=access_token,
    )

    # ``info.splits`` may be None when no split metadata is available;
    # treat that the same as a missing "train" entry instead of crashing.
    splits = dataset.info.splits or {}
    train_split = splits.get("train")
    dataset_size = train_split.num_examples if train_split else "Unknown"

    # Kept naive on purpose: check_and_refresh_dataset() subtracts this from
    # another naive datetime.now(), and mixing naive/aware values raises.
    last_refresh_time = datetime.now()
|
|
|
def check_and_refresh_dataset():
    """Reload the dataset if it was never loaded or the last load is stale.

    "Stale" means at least ``REFRESH_INTERVAL`` has elapsed since
    ``last_refresh_time``. Does nothing otherwise.
    """
    now = datetime.now()
    still_fresh = (
        last_refresh_time is not None
        and (now - last_refresh_time) < REFRESH_INTERVAL
    )
    if still_fresh:
        return
    load_and_prepare_dataset()
|
|
|
|
|
# Populate the module-level dataset state once at import time.
load_and_prepare_dataset()

# NOTE(review): the statements below immediately replace the dataset loaded
# just above with a different repository ("taesiri/PhotoshopRequest-DailyDump"),
# whereas load_and_prepare_dataset() -- and therefore every later refresh --
# uses the "...-January-2025-RandomSample" repository. Confirm which source
# is intended and drop the other.
dataset = load_dataset(
    "taesiri/PhotoshopRequest-DailyDump",
    split="train",
    streaming=True,
    token=access_token,
)

dataset_info = dataset.info
if dataset_info.splits.get("train"):
    dataset_size = dataset_info.splits["train"].num_examples
else:
    dataset_size = "Unknown"
|
|
|
# Shuffle-buffer size passed to dataset.shuffle(); get_next_samples() also
# reshuffles once this many samples have been drawn from one iterator.
BUFFER_SIZE = 1

# Iterator over the current shuffled stream, and how many samples it has served.
sample_iterator, sample_count = None, 0
|
|
|
def reshuffle_dataset():
    """Start a new pass over the dataset with a fresh shuffle.

    The shuffle is seeded with the current Unix time in whole seconds.
    Rebinds the globals ``sample_iterator`` (to an iterator over the newly
    shuffled dataset) and ``sample_count`` (to 0).
    """
    global sample_iterator, sample_count
    reshuffled = dataset.shuffle(seed=int(time.time()), buffer_size=BUFFER_SIZE)
    sample_iterator, sample_count = iter(reshuffled), 0


# Prime the iterator at import time so the first request has one to read from.
reshuffle_dataset()
|
|
|
def _build_post_markdown(sample):
    """Render one sample's title, self-text and Reddit link as Markdown."""
    post_id = sample["post_id"]
    title = sample["title"]
    reddit_url = f"https://www.reddit.com/r/PhotoshopRequest/comments/{post_id}"

    # The self-text is best-effort: a missing key, a non-dict payload or
    # malformed JSON all fall back to an empty string. Only those lookup and
    # parse errors are caught (ValueError covers json.JSONDecodeError), so
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        selftext = json.loads(sample["json_data"])["post"]["selftext"]
    except (KeyError, IndexError, TypeError, ValueError):
        selftext = ""
        print("No selftext found")

    return f"# {title}\n\n{selftext}\n\n[View post on r/PhotoshopRequest]({reddit_url})"


def get_next_samples(num_samples=5):
    """Fetch ``num_samples`` items and flatten them for the Gradio outputs.

    Args:
        num_samples: How many dataset samples to draw.

    Returns:
        A tuple of ``3 * num_samples`` values in a fixed order: for each
        sample, its Markdown text, its ``source_image`` and its
        ``edited_image``.

    Raises:
        StopIteration: If the current iterator has no more samples.
        KeyError: If a sample lacks ``post_id``, ``title``, ``source_image``
            or ``edited_image``.
    """
    global sample_count

    # Reload the dataset first if the last load is older than the refresh interval.
    check_and_refresh_dataset()

    results = []
    for _ in range(num_samples):
        # Start a new shuffled pass once the current one has served its quota.
        if sample_count >= BUFFER_SIZE:
            reshuffle_dataset()

        sample = next(sample_iterator)
        sample_count += 1
        print(sample)

        results.extend(
            (
                _build_post_markdown(sample),
                sample["source_image"],
                sample["edited_image"],
            )
        )

    return tuple(results)
|
|
|
def update_info():
    """Return a small HTML snippet with dataset stats and last refresh time.

    Reads the module-level ``dataset_size`` and ``last_refresh_time``. The
    refresh time is converted to UTC before formatting so that it matches
    the literal "UTC" suffix; ``"Unknown"`` is shown when no refresh time
    has been recorded.
    """
    # Local import: the file's top-level imports only bring in datetime/timedelta.
    from datetime import timezone

    if last_refresh_time:
        # astimezone() presumes a naive datetime is in the system time zone,
        # so a value produced by datetime.now() is converted correctly.
        refreshed = last_refresh_time.astimezone(timezone.utc).strftime(
            "%Y-%m-%d %H:%M:%S UTC"
        )
    else:
        refreshed = "Unknown"

    return f"""
    <div style="text-align: center;">
        <hr>
        Dataset Size: {dataset_size} items<br>
        Last Refreshed: {refreshed}
    </div>
    """
|
|
|
|
|
# Number of sample slots rendered in the UI. The click handler calls
# get_next_samples() with no inputs, so this must equal that function's
# default ``num_samples`` (5).
NUM_DISPLAYED_SAMPLES = 5

with gr.Blocks() as demo:
    gr.Markdown("# PhotoshopRequest Dataset Sampler")

    gr.Markdown(
        """
        This is a preview of the PhotoshopRequest dataset. Each sample represents a Photoshop editing request post.
        Click the 'Sample New Item' button to retrieve **5 random samples** from the dataset.

        **Layout**: For each sample, you'll see:
        1. A text block with the post info.
        2. A row with two images (source on the left, edited on the right).
        """
    )

    # One (text, source image, edited image) group per slot, collected in the
    # same flat order that get_next_samples() returns its values.
    sample_outputs = []
    for slot in range(1, NUM_DISPLAYED_SAMPLES + 1):
        post_info = gr.Markdown()
        with gr.Row():
            source_image = gr.Image(label=f"Source Image {slot}")
            edited_image = gr.Image(label=f"Edited Image {slot}")
        sample_outputs.extend((post_info, source_image, edited_image))

    sample_button = gr.Button("Sample New Item")
    info_md = gr.Markdown()

    # Fill every slot first, then update the stats footer.
    sample_button.click(
        get_next_samples,
        outputs=sample_outputs,
    ).then(update_info, outputs=[info_md])

if __name__ == "__main__":
    demo.launch()
|
|