taesiri committed on
Commit
632758a
·
verified ·
1 Parent(s): c384cc2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -180
app.py CHANGED
@@ -1,192 +1,113 @@
1
  import gradio as gr
2
  from datasets import load_dataset
3
- from PIL import Image
4
- import io
5
- import time
6
- import os
7
- from datetime import datetime, timedelta
8
  import json
 
 
 
9
 
 
10
  access_token = os.environ.get("HUGGINGFACE_TOKEN")
11
 
12
- # Global variables
13
- dataset = None
14
- dataset_size = "Unknown"
15
- last_refresh_time = None
16
- REFRESH_INTERVAL = timedelta(hours=24)
17
-
18
- def load_and_prepare_dataset():
19
- global dataset, dataset_size, last_refresh_time
20
-
21
- dataset = load_dataset(
22
- "taesiri/PhotoshopRequest-DailyDump-January-2025-RandomSample",
23
- split="train",
24
- streaming=True,
25
- token=access_token,
26
- )
27
-
28
- # Get dataset info
29
- dataset_info = dataset.info
30
- dataset_size = (
31
- dataset_info.splits["train"].num_examples
32
- if dataset_info.splits.get("train")
33
- else "Unknown"
34
- )
35
-
36
- last_refresh_time = datetime.now()
37
-
38
- def check_and_refresh_dataset():
39
- global last_refresh_time
40
- current_time = datetime.now()
41
- if (
42
- last_refresh_time is None
43
- or (current_time - last_refresh_time) >= REFRESH_INTERVAL
44
- ):
45
- load_and_prepare_dataset()
46
-
47
- # Initial dataset load
48
- load_and_prepare_dataset()
49
-
50
- # (Optional) Load a different dataset for variety, as in your original code:
51
- dataset = load_dataset(
52
- "taesiri/PhotoshopRequest-DailyDump",
53
- split="train",
54
- streaming=True,
55
- token=access_token,
56
- )
57
-
58
- # Get dataset info
59
- dataset_info = dataset.info
60
- dataset_size = (
61
- dataset_info.splits["train"].num_examples
62
- if dataset_info.splits.get("train")
63
- else "Unknown"
64
- )
65
-
66
- BUFFER_SIZE = 1
67
- sample_iterator = None
68
- sample_count = 0
69
-
70
- def reshuffle_dataset():
71
- global sample_iterator, sample_count
72
- seed = int(time.time()) # Convert current time to an integer for randomness
73
- shuffled_dataset = dataset.shuffle(seed=seed, buffer_size=BUFFER_SIZE)
74
- sample_iterator = iter(shuffled_dataset)
75
- sample_count = 0
76
-
77
- reshuffle_dataset() # Initial shuffle
78
-
79
- def get_next_samples(num_samples=5):
80
- """
81
- Fetch 'num_samples' items from the dataset and return
82
- the text + source/edited images for each sample.
83
- This yields 3 * num_samples outputs in a fixed order.
84
- """
85
- check_and_refresh_dataset()
86
-
87
- global sample_count
88
-
89
- results = []
90
- for _ in range(num_samples):
91
- if sample_count >= BUFFER_SIZE:
92
- reshuffle_dataset()
93
-
94
- sample = next(sample_iterator)
95
- sample_count += 1
96
- print(sample)
97
-
98
- post_id = sample["post_id"]
99
- title = sample["title"]
100
- reddit_url = f"https://www.reddit.com/r/PhotoshopRequest/comments/{post_id}"
101
-
102
- selftext = ""
103
- try:
104
- selftext = json.loads(sample["json_data"])["post"]["selftext"]
105
- except:
106
- print("No selftext found")
107
-
108
- markdown_text = f"# {title}\n\n{selftext}\n\n[View post on r/PhotoshopRequest]({reddit_url})"
109
-
110
- # Append the triple (post_info, source_image, edited_image)
111
- results.append(markdown_text)
112
- results.append(sample["source_image"])
113
- results.append(sample["edited_image"])
114
-
115
- return tuple(results)
116
-
117
- def update_info():
118
- """
119
- Return a small HTML snippet with dataset stats and last refresh time.
120
- """
121
- return f"""
122
- <div style="text-align: center;">
123
- <hr>
124
- Dataset Size: {dataset_size} items<br>
125
- Last Refreshed: {last_refresh_time.strftime('%Y-%m-%d %H:%M:%S UTC') if last_refresh_time else 'Unknown'}
126
- </div>
127
- """
128
-
129
- # Build the Gradio interface
130
- with gr.Blocks() as demo:
131
- gr.Markdown("# PhotoshopRequest Dataset Sampler")
132
 
133
- gr.Markdown(
134
- """
135
- This is a preview of the PhotoshopRequest dataset. Each sample represents a Photoshop editing request post.
136
- Click the 'Sample New Item' button to retrieve **5 random samples** from the dataset.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
- **Layout**: For each sample, you'll see:
139
- 1. A text block with the post info.
140
- 2. A row with two images (source on the left, edited on the right).
 
 
 
 
 
 
 
141
  """
142
- )
143
-
144
- # We'll define 5 sets of outputs, each is: (Markdown, source_image, edited_image).
145
-
146
- # Sample 1
147
- post_info1 = gr.Markdown()
148
- with gr.Row():
149
- source_image1 = gr.Image(label="Source Image 1")
150
- edited_image1 = gr.Image(label="Edited Image 1")
151
 
152
- # Sample 2
153
- post_info2 = gr.Markdown()
154
- with gr.Row():
155
- source_image2 = gr.Image(label="Source Image 2")
156
- edited_image2 = gr.Image(label="Edited Image 2")
157
-
158
- # Sample 3
159
- post_info3 = gr.Markdown()
160
- with gr.Row():
161
- source_image3 = gr.Image(label="Source Image 3")
162
- edited_image3 = gr.Image(label="Edited Image 3")
163
-
164
- # Sample 4
165
- post_info4 = gr.Markdown()
166
- with gr.Row():
167
- source_image4 = gr.Image(label="Source Image 4")
168
- edited_image4 = gr.Image(label="Edited Image 4")
169
-
170
- # Sample 5
171
- post_info5 = gr.Markdown()
172
- with gr.Row():
173
- source_image5 = gr.Image(label="Source Image 5")
174
- edited_image5 = gr.Image(label="Edited Image 5")
175
-
176
- sample_button = gr.Button("Sample New Item")
177
- info_md = gr.Markdown()
178
-
179
- # Map the outputs in the same order they are returned by get_next_samples
180
- sample_button.click(
181
- get_next_samples,
182
- outputs=[
183
- post_info1, source_image1, edited_image1,
184
- post_info2, source_image2, edited_image2,
185
- post_info3, source_image3, edited_image3,
186
- post_info4, source_image4, edited_image4,
187
- post_info5, source_image5, edited_image5
188
- ]
189
- ).then(update_info, outputs=[info_md])
 
190
 
191
  if __name__ == "__main__":
192
- demo.launch()
 
 
1
  import gradio as gr
2
  from datasets import load_dataset
 
 
 
 
 
3
  import json
4
+ import random
5
+ from datetime import datetime
6
+ import os
7
 
8
+ # Get access token from environment
9
  access_token = os.environ.get("HUGGINGFACE_TOKEN")
10
 
11
class DatasetViewer:
    """In-memory viewer over the PhotoshopRequest sample dataset.

    The dataset is loaded once at construction time (non-streaming) and
    random rows are served from it, formatted for the Gradio UI.
    """

    def __init__(self):
        # Populated by load_dataset(); indexed by integer position and sized
        # with len() by the methods below.
        self.dataset = None
        self.dataset_size = 0
        self.last_refresh_time = None
        self.load_dataset()

    def load_dataset(self):
        """Load the complete dataset into memory and record when it happened."""
        # Local import: the file only imports `datetime` from the module.
        from datetime import timezone

        # Inside a method body this name resolves to the module-level
        # `datasets.load_dataset`, not to this method.
        self.dataset = load_dataset(
            "taesiri/PhotoshopRequest-DailyDump-January-2025-RandomSample",
            split="train",
            token=access_token,
        )

        self.dataset_size = len(self.dataset)
        # Timezone-aware so the "UTC" label printed by get_info() is truthful
        # (a naive datetime.now() would be local time).
        self.last_refresh_time = datetime.now(timezone.utc)

    def get_next_samples(self, num_samples=5):
        """Return random samples flattened into a tuple for the UI outputs.

        Args:
            num_samples: Number of samples requested.

        Returns:
            A tuple of ``3 * num_samples`` values laid out as
            ``(markdown, source_image, edited_image)`` per sample. When the
            dataset holds fewer rows than requested, the remaining slots are
            filled with ``("", None, None)`` so the length stays fixed.
        """
        wanted = max(0, num_samples)
        count = min(wanted, self.dataset_size)
        indices = random.sample(range(self.dataset_size), count)

        results = []
        for idx in indices:
            sample = self.dataset[idx]

            post_id = sample["post_id"]
            title = sample["title"]
            reddit_url = f"https://www.reddit.com/r/PhotoshopRequest/comments/{post_id}"

            # Best effort: the post body is optional. Only the failures that
            # parsing/lookup can produce are caught, so unrelated errors
            # (and KeyboardInterrupt) are no longer swallowed.
            try:
                selftext = json.loads(sample["json_data"])["post"]["selftext"] or ""
            except (ValueError, KeyError, TypeError):
                selftext = ""
                print(f"No selftext found for post {post_id}")

            markdown_text = f"# {title}\n\n{selftext}\n\n[View post on r/PhotoshopRequest]({reddit_url})"

            # One (post_info, source_image, edited_image) triple per sample.
            results.extend(
                (markdown_text, sample["source_image"], sample["edited_image"])
            )

        # Keep the output count fixed: the UI binds a fixed number of
        # components to this return value.
        for _ in range(wanted - count):
            results.extend(("", None, None))

        return tuple(results)

    def get_info(self):
        """Return an HTML snippet with the dataset size and last refresh time."""
        if self.last_refresh_time is None:
            refreshed = "Unknown"
        else:
            refreshed = self.last_refresh_time.strftime('%Y-%m-%d %H:%M:%S UTC')

        return f"""
        <div style="text-align: center;">
            <hr>
            Dataset Size: {self.dataset_size} items<br>
            Last Refreshed: {refreshed}
        </div>
        """
 
 
 
 
 
 
 
 
 
70
 
71
def create_interface():
    """Build the Gradio Blocks app around a freshly created DatasetViewer.

    Returns:
        The assembled ``gr.Blocks`` object; the caller launches it.
    """
    viewer = DatasetViewer()

    with gr.Blocks() as demo:
        gr.Markdown("# PhotoshopRequest Dataset Viewer")

        gr.Markdown("""
        This is a viewer for the PhotoshopRequest dataset. Each sample shows a Photoshop editing request post.
        Click the 'Show New Samples' button to see **5 random samples** from the dataset.

        **Layout**: For each sample, you'll see:
        1. The post title and description
        2. The source image (left) and edited result (right)
        """)

        # Five (markdown, source, edited) groups, collected in the same flat
        # order that viewer.get_next_samples returns its values.
        outputs = []
        for number in range(1, 6):
            outputs.append(gr.Markdown())
            with gr.Row():
                outputs.append(gr.Image(label=f"Source Image {number}"))
                outputs.append(gr.Image(label=f"Edited Image {number}"))

        sample_button = gr.Button("Show New Samples")
        info_md = gr.Markdown()

        # Fill the samples first, then update the info footer.
        click_event = sample_button.click(viewer.get_next_samples, outputs=outputs)
        click_event.then(viewer.get_info, outputs=[info_md])

    return demo
110
 
111
  if __name__ == "__main__":
112
+ demo = create_interface()
113
+ demo.launch()