Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- app.py +141 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from datasets import load_dataset, Features, Value, Audio, Dataset
|
3 |
+
from huggingface_hub import HfApi, create_repo
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
|
7 |
+
def _parse_keywords(keywords_text):
    """Split comma-separated text into a list of lowercase, non-empty keywords."""
    # `or ""` lets a missing/None textbox value behave like an empty string.
    return [part.strip().lower() for part in (keywords_text or "").split(',') if part.strip()]


def _audio_array(example, column):
    """Return the 'array' entry of a dict-valued column, or None otherwise."""
    value = example.get(column)
    return value.get('array', None) if isinstance(value, dict) else None


def filter_dataset(dataset_name, split_name, keywords_text):
    """Filter a dataset by keywords found in its 'prompt' column.

    Args:
        dataset_name: Identifier passed to ``load_dataset``.
        split_name: Split passed to ``load_dataset``.
        keywords_text: Comma-separated keywords; matching is a
            case-insensitive substring test against each example's prompt.

    Returns:
        A ``(DataFrame, status_message)`` tuple. The DataFrame has the columns
        'prompt', 'chosen' and 'rejected' (the latter two hold the 'array'
        entry of the source column when that column is a dict, else None).
        On any failure, or when nothing matches, the DataFrame is empty and
        the message explains why; no exception is propagated.
    """
    try:
        # Validate the keywords first so that an empty query never triggers
        # a dataset load.
        keywords = _parse_keywords(keywords_text)
        if not keywords:
            return pd.DataFrame(), "Error: No keywords provided."

        # Streaming mode: examples are iterated lazily instead of being
        # downloaded up front.
        dataset = load_dataset(dataset_name, split=split_name, streaming=True)

        def filter_func(example):
            prompt_value = example.get("prompt")
            # A missing, None or non-string prompt can never match; skipping it
            # avoids an AttributeError that would abort the whole run.
            if not isinstance(prompt_value, str):
                return False
            prompt_lower = prompt_value.lower()
            return any(keyword in prompt_lower for keyword in keywords)

        data_for_df = [
            {
                'prompt': example.get('prompt', None),
                'chosen': _audio_array(example, 'chosen'),
                'rejected': _audio_array(example, 'rejected'),
            }
            for example in dataset.filter(filter_func)
        ]

        if not data_for_df:
            return pd.DataFrame(), "No matching examples found."

        df = pd.DataFrame(data_for_df)
        return df, f"Found {len(data_for_df)} matching examples."

    except Exception as e:
        # UI boundary: report the failure as a status message rather than
        # raising into the Gradio event handler.
        return pd.DataFrame(), f"An error occurred: {e}"
|
48 |
+
|
49 |
+
|
50 |
+
def _json_to_dataframe(df_json):
    """Rebuild a DataFrame from a JSON string or an already-parsed JSON object.

    Returns an empty DataFrame for None or a blank string.
    """
    from io import StringIO  # local import keeps this block self-contained

    if df_json is None:
        return pd.DataFrame()
    if isinstance(df_json, str):
        if not df_json.strip():
            return pd.DataFrame()
        # Wrap in StringIO: passing a literal JSON string to read_json is
        # deprecated in recent pandas releases.
        return pd.read_json(StringIO(df_json))
    # NOTE(review): the hidden UI component may hand back a parsed dict/list
    # instead of the original string -- confirm against the Gradio version used.
    return pd.DataFrame(df_json)


def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token):
    """Push a filtered DataFrame (serialized as JSON) to the Hugging Face Hub.

    Args:
        df_json: The filtered data, as a JSON string or parsed JSON object.
        dataset_name: Source dataset name (accepted for interface
            compatibility; not used here).
        split_name: Source split name (accepted for interface compatibility;
            not used here).
        new_dataset_repo_id: Target dataset repository id.
        hf_token: Token used to create the repository and upload the data.

    Returns:
        A ``(status_message, dataset_url)`` tuple; ``dataset_url`` is None
        whenever the push did not succeed. No exception is propagated.
    """
    if not hf_token:
        return "Error: Hugging Face Token is required.", None

    if not new_dataset_repo_id or not new_dataset_repo_id.strip():
        return "Error: New Dataset Repo ID is required.", None
    new_dataset_repo_id = new_dataset_repo_id.strip()

    try:
        df = _json_to_dataframe(df_json)

        if df.empty:
            return "Error: Cannot push an empty dataset", None

        # The JSON round trip leaves a non-range index; drop it so it does not
        # become an extra column that the three-column schema below rejects.
        df = df.reset_index(drop=True)
        dataset = Dataset.from_pandas(df, preserve_index=False)

        # Fixed schema so every pushed subset has the same column types.
        # NOTE(review): 'chosen'/'rejected' arrive as bare sample arrays, so
        # this cast may fail; any failure is reported below -- confirm the
        # expected cell format.
        features = Features({
            'prompt': Value(dtype='string', id=None),
            'chosen': Audio(sampling_rate=16000),  # Assuming 16kHz; adjust if needed
            'rejected': Audio(sampling_rate=16000),  # Assuming 16kHz
        })

        try:
            dataset = dataset.cast(features)
        except Exception as e:
            return f"An error occurred during casting: {e}", None

        try:
            # exist_ok=True replaces matching on the exception text to detect
            # an already-existing repository.
            create_repo(new_dataset_repo_id, token=hf_token, repo_type="dataset", exist_ok=True)
            print(f"Repository '{new_dataset_repo_id}' is ready.")
        except Exception as e:
            return f"Error creating repository: {e}", None

        # Pass the token explicitly; otherwise the upload falls back to
        # whatever credentials happen to be stored on the host.
        dataset.push_to_hub(new_dataset_repo_id, token=hf_token)
        dataset_url = f"https://huggingface.co/datasets/{new_dataset_repo_id}"
        return "Subset dataset uploaded successfully!", dataset_url

    except Exception as e:
        # UI boundary: surface the failure as a status message.
        return f"An error occurred during push: {e}", None
|
95 |
+
# --- Gradio Interface ---
# Layout: source selection row, keyword box, filter button with its outputs,
# then the push controls. Components render in the order they are created.
with gr.Blocks() as demo:
    gr.Markdown("# Dataset Filter and Push")

    # Source dataset and split, side by side.
    with gr.Row():
        dataset_name_input = gr.Textbox(
            label="Source Dataset Name",
            value="ashraq/esc50",
        )
        split_name_input = gr.Textbox(
            label="Split Name",
            value="train",
        )

    keywords_input = gr.Textbox(
        label="Keywords (comma-separated)",
        value="dog, cat",
    )

    filter_button = gr.Button("Filter Dataset")

    # Results of the filter step: the matching rows and a status line.
    filtered_data_output = gr.Dataframe(label="Filtered Data")
    filter_status_output = gr.Textbox(label="Filter Status")

    # Destination repository and credentials, side by side.
    with gr.Row():
        new_dataset_repo_id_input = gr.Textbox(label="New Dataset Repo ID")
        hf_token_input = gr.Textbox(
            label="Hugging Face Token",
            type="password",
        )

    push_button = gr.Button("Push to Hub")
    push_status_output = gr.Textbox(label="Push Status")
    dataset_url_output = gr.Textbox(label="Dataset URL")

    # Invisible holder that carries the filtered rows (as JSON) from the
    # filter step to the push step.
    filtered_data_json = gr.JSON(visible=False)

    # Filter click: run the filter, then serialize the displayed frame into
    # the hidden JSON holder.
    filter_event = filter_button.click(
        fn=filter_dataset,
        inputs=[dataset_name_input, split_name_input, keywords_input],
        outputs=[filtered_data_output, filter_status_output],
    )
    filter_event.then(
        fn=lambda df: df.to_json(),
        inputs=[filtered_data_output],
        outputs=[filtered_data_json],
    )

    # Push click: upload whatever the hidden JSON holder currently contains.
    push_button.click(
        fn=push_to_hub,
        inputs=[
            filtered_data_json,
            dataset_name_input,
            split_name_input,
            new_dataset_repo_id_input,
            hf_token_input,
        ],
        outputs=[push_status_output, dataset_url_output],
    )

if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
datasets
|
3 |
+
huggingface_hub
|
4 |
+
pandas
|
5 |
+
librosa
|
6 |
+
soundfile
|