Omarrran committed on
Commit
2a8c51e
·
verified ·
1 Parent(s): c322f07

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +141 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import io

import gradio as gr
import pandas as pd
from datasets import load_dataset, Features, Value, Audio, Dataset
from huggingface_hub import HfApi, create_repo
5
+
6
+
7
def filter_dataset(dataset_name, split_name, keywords_text):
    """Filter a streamed Hub dataset by keyword matches in its 'prompt' column.

    Args:
        dataset_name: Hub id of the source dataset (e.g. "user/name").
        split_name: Split to stream (e.g. "train").
        keywords_text: Comma-separated keywords; matching is case-insensitive
            substring search against each example's 'prompt'.

    Returns:
        (pandas.DataFrame, status message). The DataFrame is empty on error
        or when nothing matched.
    """
    # Validate input BEFORE touching the Hub: the previous version loaded the
    # dataset first, wasting a network round-trip when no keywords were given.
    keywords = [kw.strip().lower() for kw in keywords_text.split(',') if kw.strip()]
    if not keywords:
        return pd.DataFrame(), "Error: No keywords provided."

    try:
        # Streaming mode avoids downloading the whole dataset up front.
        dataset = load_dataset(dataset_name, split=split_name, streaming=True)

        def matches(example):
            # .get() guards against rows that lack a 'prompt' field entirely.
            prompt_value = example.get("prompt", "")
            return any(kw in prompt_value.lower() for kw in keywords)

        def audio_array(value):
            # Decoded audio columns arrive as dicts (with an 'array' entry);
            # anything else (None, raw path, ...) is treated as missing.
            return value.get('array') if isinstance(value, dict) else None

        # Materialize only the matching rows into plain records for pandas.
        # (The old code also kept a parallel 'matching_indices' list that
        # merely mirrored len(data_for_df); dropped as redundant.)
        data_for_df = [
            {
                'prompt': example.get('prompt'),
                'chosen': audio_array(example.get('chosen')),
                'rejected': audio_array(example.get('rejected')),
            }
            for example in dataset.filter(matches)
        ]

        if not data_for_df:
            return pd.DataFrame(), "No matching examples found."

        df = pd.DataFrame(data_for_df)
        return df, f"Found {len(data_for_df)} matching examples."

    except Exception as e:
        # Broad catch is deliberate: surface any Hub/streaming failure as a
        # status message in the UI instead of crashing the Gradio app.
        return pd.DataFrame(), f"An error occurred: {e}"
48
+
49
+
50
def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token):
    """Push a filtered subset (a DataFrame serialized as JSON) to the HF Hub.

    Args:
        df_json: JSON string produced by DataFrame.to_json().
        dataset_name: Source dataset id (currently unused; kept for
            signature compatibility with the Gradio wiring).
        split_name: Source split name (currently unused; kept for
            signature compatibility).
        new_dataset_repo_id: Target dataset repo id, e.g. "user/my-subset".
        hf_token: Hugging Face write token; required.

    Returns:
        (status message, dataset URL or None on failure).
    """
    if not hf_token:
        return "Error: Hugging Face Token is required.", None

    try:
        # pandas deprecated passing a literal JSON string to read_json;
        # wrap it in a file-like object instead.
        df = pd.read_json(io.StringIO(df_json))

        if df.empty:
            return "Error: Cannot push an empty dataset", None

        dataset = Dataset.from_pandas(df)

        # Cast to an explicit schema so the uploaded dataset exposes proper
        # Audio columns rather than raw float arrays.
        features = Features({
            'prompt': Value(dtype='string'),
            'chosen': Audio(sampling_rate=16000),    # assumes 16 kHz source audio — TODO confirm
            'rejected': Audio(sampling_rate=16000),  # assumes 16 kHz source audio
        })
        try:
            dataset = dataset.cast(features)
        except Exception as e:
            return f"An error occurred during casting: {e}", None

        # Create the repo if needed; tolerate the "already exists" error so
        # re-pushing to the same repo works.
        try:
            create_repo(new_dataset_repo_id, token=hf_token, repo_type="dataset")
            print(f"Repository '{new_dataset_repo_id}' created.")
        except Exception as e:
            if "Repo already exists" not in str(e):
                return f"Error creating repository: {e}", None

        # BUG FIX: the token was previously not forwarded here, so the upload
        # relied on whatever ambient credentials happened to be configured
        # instead of the token the user entered in the UI.
        dataset.push_to_hub(new_dataset_repo_id, token=hf_token)
        dataset_url = f"https://huggingface.co/datasets/{new_dataset_repo_id}"
        return "Subset dataset uploaded successfully!", dataset_url

    except Exception as e:
        return f"An error occurred during push: {e}", None
95
# --- Gradio Interface ---
# Declarative UI wiring: filter a source dataset by keyword, preview the
# matches, then push the subset to a new Hub repo.
with gr.Blocks() as demo:
    gr.Markdown("# Dataset Filter and Push")

    with gr.Row():
        # NOTE(review): the default "ashraq/esc50" is an environmental-sound
        # dataset; filter_dataset expects 'prompt'/'chosen'/'rejected' columns,
        # so this default likely yields no matches — confirm intended example.
        dataset_name_input = gr.Textbox(label="Source Dataset Name", value="ashraq/esc50")  # Example with chosen/rejected
        split_name_input = gr.Textbox(label="Split Name", value="train")

    keywords_input = gr.Textbox(label="Keywords (comma-separated)", value="dog, cat")

    filter_button = gr.Button("Filter Dataset")

    # Display the filtered data. 'label' is important for presentation.
    filtered_data_output = gr.Dataframe(label="Filtered Data")
    filter_status_output = gr.Textbox(label="Filter Status")

    with gr.Row():
        new_dataset_repo_id_input = gr.Textbox(label="New Dataset Repo ID")
        # type="password" masks the token in the browser.
        hf_token_input = gr.Textbox(label="Hugging Face Token", type="password")

    push_button = gr.Button("Push to Hub")
    push_status_output = gr.Textbox(label="Push Status")
    dataset_url_output = gr.Textbox(label="Dataset URL")  # Display the dataset URL

    # Hidden component to store the filtered dataset (as JSON)
    filtered_data_json = gr.JSON(visible=False)

    # Connect the filter button: run the filter, then serialize the visible
    # DataFrame into the hidden JSON component for the later push step.
    filter_button.click(
        filter_dataset,
        inputs=[dataset_name_input, split_name_input, keywords_input],
        outputs=[filtered_data_output, filter_status_output]
    ).then(  # Use .then() to chain actions
        lambda df: df.to_json(),  # Convert DataFrame to JSON
        inputs=[filtered_data_output],
        outputs=[filtered_data_json]  # Store in the hidden JSON component
    )

    # Connect the push button. dataset_name_input/split_name_input are passed
    # through but unused inside push_to_hub (kept for its signature).
    push_button.click(
        push_to_hub,
        inputs=[filtered_data_json, dataset_name_input, split_name_input, new_dataset_repo_id_input, hf_token_input],
        outputs=[push_status_output, dataset_url_output]
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ datasets
3
+ huggingface_hub
4
+ pandas
5
+ librosa
6
+ soundfile