zhijianma committed on
Commit
4938d99
·
verified ·
1 Parent(s): 3de39c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -38
app.py CHANGED
@@ -1,12 +1,17 @@
1
- import os
2
- import inspect
3
  import base64
4
- import yaml
5
  import copy
 
 
 
6
  import shutil
 
7
  import gradio as gr
 
 
 
8
  from data_juicer.ops.base_op import OPERATORS
9
  from data_juicer.utils.constant import Fields
 
10
  demo_path = os.path.dirname(os.path.abspath(__file__))
11
  project_path = os.path.dirname(os.path.dirname(demo_path))
12
 
@@ -73,7 +78,7 @@ op_list_desc = {
73
  'selector':extract_op_desc(op_text, '## Selector <a name="selector"/>'),
74
  }
75
 
76
- op_types = ['mapper', 'filter',]# 'deduplicator'] , 'selector']
77
  local_ops_dict = {op_type:[] for op_type in op_types}
78
  multimodal = os.getenv('MULTI_MODAL', True)
79
  multimodal_visible = False
@@ -113,8 +118,8 @@ def show_code(op_name):
113
 
114
  return ''.join(text[0]), yaml.dump(default_params)
115
 
116
- def change_visible(op_name):
117
- text_visible = True
118
  video_visible = False
119
  audio_visible = False
120
  image_visible = False
@@ -124,6 +129,8 @@ def change_visible(op_name):
124
  audio_visible = True
125
  elif 'image' in op_name:
126
  image_visible = True
 
 
127
  return gr.update(visible=text_visible), gr.update(visible=image_visible), gr.update(visible=video_visible), gr.update(visible=audio_visible), gr.update(visible=text_visible), gr.update(visible=image_visible), gr.update(visible=video_visible), gr.update(visible=audio_visible)
128
 
129
 
@@ -178,7 +185,7 @@ def create_tab_layout(op_tab, op_type, run_op, has_stats=False):
178
  op_params = gr.Code(label="Yaml",language='yaml', interactive=True)
179
  run_button = gr.Button(value="🚀Run")
180
  show_code_button = gr.Button(value="🔍Show Code")
181
-
182
  with gr.Column():
183
  with gr.Group('Inputs'):
184
  gr.Markdown(" **Inputs**")
@@ -225,13 +232,13 @@ def create_tab_layout(op_tab, op_type, run_op, has_stats=False):
225
  return outputs
226
 
227
  show_code_button.click(show_code, inputs=[op_selector], outputs=[code, op_params])
228
- show_code_button.click(change_visible, inputs=[op_selector], outputs=outputs[:4] + inputs[:4])
229
  run_button.click(run_func, inputs=inputs, outputs=outputs)
230
- run_button.click(change_visible, inputs=[op_selector], outputs=outputs[:4] + inputs[:4])
231
  op_selector.select(show_code, inputs=[op_selector], outputs=[code, op_params])
232
- op_selector.select(change_visible, inputs=[op_selector], outputs=outputs[:4] + inputs[:4])
233
- op_tab.select(change_visible, inputs=[op_selector], outputs=outputs[:4] + inputs[:4])
234
-
235
 
236
  def create_mapper_tab(op_type, op_tab):
237
  with op_tab:
@@ -262,13 +269,30 @@ def create_filter_tab(op_type, op_tab):
262
 
263
  def create_deduplicator_tab(op_type, op_tab):
264
  with op_tab:
265
- def run_op( input_text, input_image, input_video, input_audio, op_name, op_params):
266
  op_class = OPERATORS.modules[op_name]
267
  op = op_class(**op_params)
268
  sample = encode_sample(input_text, input_image, input_video, input_audio)
269
- output_sample = sample #op.compute_hash(copy.deepcopy(sample))
270
- return decode_sample(output_sample)
271
- create_tab_layout(op_tab, op_type, run_op, has_stats=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
 
274
  def create_tab_double_layout(op_tab, op_type, run_op):
@@ -282,11 +306,12 @@ def create_tab_double_layout(op_tab, op_type, run_op):
282
  op_params = gr.Code(label="Yaml",language='yaml', interactive=True)
283
  run_button = gr.Button(value="🚀Run")
284
  show_code_button = gr.Button(value="🔍Show Code")
285
-
286
  with gr.Column():
287
  with gr.Group('Inputs'):
288
  gr.Markdown(" **Inputs**")
289
  with gr.Row():
 
290
  input_text = gr.TextArea(label="Text",interactive=True,)
291
  input_text2 = gr.TextArea(label="Text",interactive=True,)
292
  input_image = gr.Image(label='Image', type='filepath', visible=multimodal_visible)
@@ -299,42 +324,37 @@ def create_tab_double_layout(op_tab, op_type, run_op):
299
  with gr.Group('Outputs'):
300
  gr.Markdown(" **Outputs**")
301
  with gr.Row():
302
- output_text = gr.TextArea(label="Text",interactive=False,)
303
- output_text2 = gr.TextArea(label="Text",interactive=False,)
304
- output_image = gr.Image(label='Image', type='filepath', visible=multimodal_visible)
305
- output_image2 = gr.Image(label='Image', type='filepath', visible=multimodal_visible)
306
- output_video = gr.Video(label='Video', visible=multimodal_visible)
307
- output_video2 = gr.Video(label='Video', visible=multimodal_visible)
308
- output_audio = gr.Audio(label='Audio', type='filepath', visible=multimodal_visible)
309
- output_audio2 = gr.Audio(label='Audio', type='filepath', visible=multimodal_visible)
310
-
311
  code = gr.Code(label='Source', language='python')
312
  inputs = [input_text, input_image, input_video, input_audio, input_text2, input_image2, input_video2, input_audio2, op_selector, op_params]
313
- outputs = [output_text, output_image, output_video, output_audio, output_text2, output_image2, output_video2, output_audio2]
314
 
315
  def run_func(*args):
316
  try:
317
  try:
318
- op_params = args[-1]
 
319
  params = yaml.safe_load(op_params)
320
  except:
321
  params = {}
322
  if params is None:
323
  params = {}
324
- return run_op(input_text, input_image, input_video, input_audio, op_selector, params)
325
  except Exception as e:
326
  gr.Error(str(e))
 
327
  return outputs
328
 
329
- # show_code_button.click(show_code, inputs=[op_selector], outputs=[code, op_params])
330
- # show_code_button.click(change_visible, inputs=[op_selector], outputs=outputs[:4] + inputs[:4])
331
- # run_button.click(run_func, inputs=inputs, outputs=outputs)
332
- # op_selector.select(show_code, inputs=[op_selector], outputs=[code, op_params])
333
- # op_selector.select(change_visible, inputs=[op_selector], outputs=outputs[:4] + inputs[:4])
334
- show_code_button.click(change_visible, inputs=[op_selector], outputs=outputs[:4] + inputs[:4]).then(show_code, inputs=[op_selector], outputs=[code, op_params])
335
- run_button.click(change_visible, inputs=[op_selector], outputs=outputs[:4] + inputs[:4]).then(run_func, inputs=[op_selector], outputs=[code, op_params])
336
- op_selector.select(change_visible, inputs=[op_selector], outputs=outputs[:4] + inputs[:4]).then(show_code, inputs=[op_selector], outputs=[code, op_params])
337
- op_tab.select(change_visible, inputs=[op_selector], outputs=outputs[:4] + inputs[:4])
338
 
339
  with gr.Blocks(css="./app.css") as demo:
340
  dj_image = os.path.join(project_path, 'docs/imgs/data-juicer.jpg')
 
 
 
1
  import base64
 
2
  import copy
3
+ import inspect
4
+ import json
5
+ import os
6
  import shutil
7
+
8
  import gradio as gr
9
+ import yaml
10
+ from datasets import Dataset
11
+
12
  from data_juicer.ops.base_op import OPERATORS
13
  from data_juicer.utils.constant import Fields
14
+
15
  demo_path = os.path.dirname(os.path.abspath(__file__))
16
  project_path = os.path.dirname(os.path.dirname(demo_path))
17
 
 
78
  'selector':extract_op_desc(op_text, '## Selector <a name="selector"/>'),
79
  }
80
 
81
+ op_types = ['mapper', 'filter', 'deduplicator']
82
  local_ops_dict = {op_type:[] for op_type in op_types}
83
  multimodal = os.getenv('MULTI_MODAL', True)
84
  multimodal_visible = False
 
118
 
119
  return ''.join(text[0]), yaml.dump(default_params)
120
 
121
+ def change_visible(op_name, show_text):
122
+ text_visible = show_text
123
  video_visible = False
124
  audio_visible = False
125
  image_visible = False
 
129
  audio_visible = True
130
  elif 'image' in op_name:
131
  image_visible = True
132
+ elif 'document' in op_name:
133
+ text_visible = True
134
  return gr.update(visible=text_visible), gr.update(visible=image_visible), gr.update(visible=video_visible), gr.update(visible=audio_visible), gr.update(visible=text_visible), gr.update(visible=image_visible), gr.update(visible=video_visible), gr.update(visible=audio_visible)
135
 
136
 
 
185
  op_params = gr.Code(label="Yaml",language='yaml', interactive=True)
186
  run_button = gr.Button(value="🚀Run")
187
  show_code_button = gr.Button(value="🔍Show Code")
188
+ show_text = gr.Checkbox(value=True,visible=False)
189
  with gr.Column():
190
  with gr.Group('Inputs'):
191
  gr.Markdown(" **Inputs**")
 
232
  return outputs
233
 
234
  show_code_button.click(show_code, inputs=[op_selector], outputs=[code, op_params])
235
+ show_code_button.click(change_visible, inputs=[op_selector,show_text], outputs=outputs[:4] + inputs[:4])
236
  run_button.click(run_func, inputs=inputs, outputs=outputs)
237
+ run_button.click(change_visible, inputs=[op_selector,show_text], outputs=outputs[:4] + inputs[:4])
238
  op_selector.select(show_code, inputs=[op_selector], outputs=[code, op_params])
239
+ op_selector.select(change_visible, inputs=[op_selector,show_text], outputs=outputs[:4] + inputs[:4])
240
+ op_tab.select(change_visible, inputs=[op_selector,show_text], outputs=outputs[:4] + inputs[:4])
241
+ op_tab.select(show_code, inputs=[op_selector], outputs=[code, op_params])
242
 
243
  def create_mapper_tab(op_type, op_tab):
244
  with op_tab:
 
269
 
270
  def create_deduplicator_tab(op_type, op_tab):
271
  with op_tab:
272
+ def run_op(input_text, input_image, input_video, input_audio, input_text2, input_image2, input_video2, input_audio2, op_name, op_params):
273
  op_class = OPERATORS.modules[op_name]
274
  op = op_class(**op_params)
275
  sample = encode_sample(input_text, input_image, input_video, input_audio)
276
+ sample2 = encode_sample(input_text2, input_image2, input_video2, input_audio2)
277
+ output_sample = op.compute_hash(copy.deepcopy(sample))
278
+ output_sample2 = op.compute_hash(copy.deepcopy(sample2))
279
+ ds = Dataset.from_list([output_sample, output_sample2])
280
+ hash_values = ds.remove_columns([text_key, image_key, video_key, audio_key]).to_dict()
281
+ ds.cleanup_cache_files()
282
+ for key, values in hash_values.items():
283
+ new_values = []
284
+ for value in values:
285
+ if isinstance(value, list):
286
+ new_values.append([v.hex() for v in value])
287
+ hash_values[key] = new_values or values
288
+ _, dedup_pairs = op.process(ds, show_num=1)
289
+ if dedup_pairs:
290
+ dedup = "Yes"
291
+ else:
292
+ dedup = "No"
293
+
294
+ return json.dumps(hash_values), dedup
295
+ create_tab_double_layout(op_tab, op_type, run_op)
296
 
297
 
298
  def create_tab_double_layout(op_tab, op_type, run_op):
 
306
  op_params = gr.Code(label="Yaml",language='yaml', interactive=True)
307
  run_button = gr.Button(value="🚀Run")
308
  show_code_button = gr.Button(value="🔍Show Code")
309
+ show_text = gr.Checkbox(value=False,visible=False)
310
  with gr.Column():
311
  with gr.Group('Inputs'):
312
  gr.Markdown(" **Inputs**")
313
  with gr.Row():
314
+
315
  input_text = gr.TextArea(label="Text",interactive=True,)
316
  input_text2 = gr.TextArea(label="Text",interactive=True,)
317
  input_image = gr.Image(label='Image', type='filepath', visible=multimodal_visible)
 
324
  with gr.Group('Outputs'):
325
  gr.Markdown(" **Outputs**")
326
  with gr.Row():
327
+ output_deduplicated_pairs = gr.Json(label='Deduplicated pairs')
328
+ output_deduplicated = gr.Text(label='Deduplicate or not?', interactive=False)
329
+
 
 
 
 
 
 
330
  code = gr.Code(label='Source', language='python')
331
  inputs = [input_text, input_image, input_video, input_audio, input_text2, input_image2, input_video2, input_audio2, op_selector, op_params]
332
+ outputs = [output_deduplicated_pairs, output_deduplicated]
333
 
334
  def run_func(*args):
335
  try:
336
  try:
337
+ args = list(args)
338
+ op_params = args.pop()
339
  params = yaml.safe_load(op_params)
340
  except:
341
  params = {}
342
  if params is None:
343
  params = {}
344
+ return run_op(*args, params)
345
  except Exception as e:
346
  gr.Error(str(e))
347
+ print(e)
348
  return outputs
349
 
350
+ show_code_button.click(show_code, inputs=[op_selector], outputs=[code, op_params])
351
+ show_code_button.click(change_visible, inputs=[op_selector, show_text], outputs=inputs[:8])
352
+ run_button.click(run_func, inputs=inputs, outputs=outputs)
353
+ run_button.click(change_visible, inputs=[op_selector,show_text], outputs=inputs[:8])
354
+ op_selector.select(show_code, inputs=[op_selector], outputs=[code, op_params])
355
+ op_selector.select(change_visible, inputs=[op_selector,show_text], outputs=inputs[:8])
356
+ op_tab.select(change_visible, inputs=[op_selector,show_text], outputs= inputs[:8])
357
+ op_tab.select(show_code, inputs=[op_selector], outputs=[code, op_params])
 
358
 
359
  with gr.Blocks(css="./app.css") as demo:
360
  dj_image = os.path.join(project_path, 'docs/imgs/data-juicer.jpg')