taesiri's picture
Update app.py
c384cc2 verified
raw
history blame
5.54 kB
import gradio as gr
from datasets import load_dataset
from PIL import Image
import io
import time
import os
from datetime import datetime, timedelta
import json
# Hugging Face access token, read from the environment (None when unset);
# it is forwarded to every load_dataset() call in this module.
access_token = os.getenv("HUGGINGFACE_TOKEN")

# Module-level state shared by the loader, the sampler and the info panel.
REFRESH_INTERVAL = timedelta(days=1)  # reload the dataset once it is this old
dataset = None  # streaming dataset currently being sampled
dataset_size = "Unknown"  # number of train examples, or "Unknown"
last_refresh_time = None  # datetime of the last load_and_prepare_dataset() run
def load_and_prepare_dataset():
    """(Re)load the streaming dataset and refresh the module-level bookkeeping.

    Side effects: rebinds the globals ``dataset`` (the streaming train
    split), ``dataset_size`` (train example count, or ``"Unknown"`` when the
    metadata does not provide one) and ``last_refresh_time`` (time of this
    load).
    """
    global dataset, dataset_size, last_refresh_time

    dataset = load_dataset(
        "taesiri/PhotoshopRequest-DailyDump-January-2025-RandomSample",
        split="train",
        streaming=True,
        token=access_token,
    )

    # ``info.splits`` is optional in ``datasets`` and may be None when the
    # repo ships no split metadata; fall back to an empty mapping so a missing
    # table degrades to "Unknown" instead of raising AttributeError.
    splits = dataset.info.splits or {}
    train_split = splits.get("train")
    dataset_size = train_split.num_examples if train_split else "Unknown"

    # Kept naive on purpose: check_and_refresh_dataset() subtracts this from
    # another naive datetime.now(), and naive/aware values cannot be mixed.
    last_refresh_time = datetime.now()
def check_and_refresh_dataset():
    """Reload the dataset if it was never loaded or the last load is stale.

    A load counts as stale once at least ``REFRESH_INTERVAL`` has elapsed
    since ``last_refresh_time``. Returns ``None``; the reload itself is
    delegated to ``load_and_prepare_dataset()``.
    """
    if last_refresh_time is not None:
        age = datetime.now() - last_refresh_time
        if age < REFRESH_INTERVAL:
            # Still fresh: nothing to do.
            return
    load_and_prepare_dataset()
# Initial dataset load (this also stamps last_refresh_time).
load_and_prepare_dataset()

# Replace the dataset chosen above with the full daily dump.
# NOTE(review): check_and_refresh_dataset() reloads through
# load_and_prepare_dataset(), which binds the "...RandomSample" repo again once
# REFRESH_INTERVAL has elapsed, so this override only holds until the first
# refresh. Confirm that is intended.
dataset = load_dataset(
    "taesiri/PhotoshopRequest-DailyDump",
    split="train",
    streaming=True,
    token=access_token,
)

# Get dataset info. ``splits`` is optional in ``datasets`` and may be None when
# the repo has no split metadata; guard so the import does not crash on it.
dataset_info = dataset.info
_train_split = (dataset_info.splits or {}).get("train")
dataset_size = _train_split.num_examples if _train_split else "Unknown"

# Shuffle buffer size handed to dataset.shuffle(); get_next_samples() also
# reshuffles once this many samples have been drawn from the current iterator.
BUFFER_SIZE = 1
sample_iterator = None  # iterator over the shuffled stream, set by reshuffle_dataset()
sample_count = 0  # samples drawn since the last reshuffle
def reshuffle_dataset():
    """Start a fresh, newly seeded pass over the shuffled dataset.

    Side effects: rebinds the globals ``sample_iterator`` (a new iterator over
    ``dataset.shuffle(...)``) and ``sample_count`` (reset to 0).
    """
    global sample_iterator, sample_count
    # Use the nanosecond clock rather than whole seconds: get_next_samples()
    # may call this several times within one second, and a repeated seed would
    # be expected to reproduce the same shuffled order (i.e. duplicate
    # samples). Folded into 32 bits to keep the seed a small non-negative int.
    seed = time.time_ns() % (2**32)
    shuffled_dataset = dataset.shuffle(seed=seed, buffer_size=BUFFER_SIZE)
    sample_iterator = iter(shuffled_dataset)
    sample_count = 0
reshuffle_dataset()  # Initial shuffle: creates the first sample_iterator at import time
def get_next_samples(num_samples=5):
    """
    Fetch 'num_samples' items from the dataset and return
    the text + source/edited images for each sample.
    This yields 3 * num_samples outputs in a fixed order.

    For every sample the returned tuple contains, in order: a Markdown string
    (title, selftext and a link to the Reddit post), the sample's
    ``source_image`` and its ``edited_image``.

    Side effects: may refresh the dataset, reshuffles whenever ``sample_count``
    has reached ``BUFFER_SIZE``, increments ``sample_count`` and prints each
    raw sample.
    """
    global sample_count

    check_and_refresh_dataset()

    results = []
    for _ in range(num_samples):
        if sample_count >= BUFFER_SIZE:
            reshuffle_dataset()
        sample = next(sample_iterator)
        sample_count += 1
        print(sample)

        post_id = sample["post_id"]
        title = sample["title"]
        reddit_url = f"https://www.reddit.com/r/PhotoshopRequest/comments/{post_id}"

        # The selftext is optional. Only the failures a missing or malformed
        # payload can produce are caught (absent key, non-string/non-dict
        # value, invalid JSON), so KeyboardInterrupt and SystemExit are no
        # longer swallowed as a bare ``except`` would.
        try:
            # ``or ""`` keeps a JSON null from rendering as the text "None".
            selftext = json.loads(sample["json_data"])["post"]["selftext"] or ""
        except (KeyError, TypeError, ValueError):
            selftext = ""
            print("No selftext found")

        markdown_text = f"# {title}\n\n{selftext}\n\n[View post on r/PhotoshopRequest]({reddit_url})"

        # Append the triple (post_info, source_image, edited_image)
        results.append(markdown_text)
        results.append(sample["source_image"])
        results.append(sample["edited_image"])

    return tuple(results)
def update_info():
    """
    Return a small HTML snippet with dataset stats and last refresh time.

    Reads the globals ``dataset_size`` and ``last_refresh_time``. The refresh
    time is converted to UTC before formatting so that the "UTC" suffix is
    accurate; "Unknown" is shown when no refresh has happened yet.
    """
    # Local import: the module header only imports datetime and timedelta.
    from datetime import timezone

    if last_refresh_time is None:
        refreshed = "Unknown"
    else:
        # astimezone() treats a naive datetime as local time, so this is
        # correct for both naive and timezone-aware timestamps.
        refreshed = last_refresh_time.astimezone(timezone.utc).strftime(
            "%Y-%m-%d %H:%M:%S UTC"
        )

    return f"""
<div style="text-align: center;">
<hr>
Dataset Size: {dataset_size} items<br>
Last Refreshed: {refreshed}
</div>
"""
# Build the Gradio interface
def _build_sample_slot(index):
    """Create the widgets for one sample: a Markdown block plus an image row.

    Must be called inside an active ``gr.Blocks`` context. Returns the triple
    ``(post_info, source_image, edited_image)``.
    """
    info = gr.Markdown()
    with gr.Row():
        source = gr.Image(label=f"Source Image {index}")
        edited = gr.Image(label=f"Edited Image {index}")
    return info, source, edited


with gr.Blocks() as demo:
    gr.Markdown("# PhotoshopRequest Dataset Sampler")
    gr.Markdown(
        """
This is a preview of the PhotoshopRequest dataset. Each sample represents a Photoshop editing request post.
Click the 'Sample New Item' button to retrieve **5 random samples** from the dataset.
**Layout**: For each sample, you'll see:
1. A text block with the post info.
2. A row with two images (source on the left, edited on the right).
"""
    )

    # Five output slots, each a (Markdown, source_image, edited_image) triple.
    post_info1, source_image1, edited_image1 = _build_sample_slot(1)
    post_info2, source_image2, edited_image2 = _build_sample_slot(2)
    post_info3, source_image3, edited_image3 = _build_sample_slot(3)
    post_info4, source_image4, edited_image4 = _build_sample_slot(4)
    post_info5, source_image5, edited_image5 = _build_sample_slot(5)

    sample_button = gr.Button("Sample New Item")
    info_md = gr.Markdown()

    # Output order must mirror the tuple returned by get_next_samples:
    # (text, source, edited) repeated once per sample.
    sample_outputs = [
        post_info1, source_image1, edited_image1,
        post_info2, source_image2, edited_image2,
        post_info3, source_image3, edited_image3,
        post_info4, source_image4, edited_image4,
        post_info5, source_image5, edited_image5,
    ]
    click_event = sample_button.click(get_next_samples, outputs=sample_outputs)
    # After the samples are shown, update the dataset info panel.
    click_event.then(update_info, outputs=[info_md])
# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()