import gradio as gr
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tempfile
from faker import Faker
import random
from huggingface_hub import InferenceClient

# Initialize Faker for the synthetic-data fallback
fake = Faker()
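
# Assumed pip dependencies, inferred from the imports above:
# gradio, requests, beautifulsoup4, pandas, faker, huggingface_hub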

# Extract all visible text nodes from a webpage
def extract_all_text_from_url(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # stripped_strings already yields trimmed, non-empty text nodes
        return list(soup.stripped_strings)
    except Exception as e:
        raise ValueError(f"Error fetching or parsing the URL: {e}")

# Apply common-sense filtering: drop very short or whitespace-only entries
# and deduplicate
def apply_common_sense(text_list):
    filtered = {text for text in text_list if len(text) >= 3 and not text.isspace()}
    return list(filtered)
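
# Note: the set comprehension deduplicates but discards the original page
# order; the later sort by length imposes a new ordering regardless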

# Generate synthetic data via the HF Inference API, with a Faker fallback
def generate_synthetic_data(text_list, num_synthetic, hf_model, hf_api_token):
    synthetic_data = []
    if not text_list:
        text_list = [fake.sentence()]
    if not hf_api_token:
        # No token: shuffle the words of a real extracted entry and append
        # a short Faker sentence
        for _ in range(num_synthetic):
            base_text = random.choice(text_list)
            words = base_text.split()
            random.shuffle(words)
            synthetic_data.append(" ".join(words) + " " + fake.sentence(nb_words=random.randint(3, 10)))
    else:
        # Token provided: ask the HF Inference API for a variation
        client = InferenceClient(model=hf_model, token=hf_api_token)
        for _ in range(num_synthetic):
            base_text = random.choice(text_list)
            try:
                prompt = f"Generate a creative variation of this text: '{base_text}'"
                # text_generation expects max_new_tokens, not max_length
                generated = client.text_generation(prompt, max_new_tokens=50, temperature=0.7)
                synthetic_data.append(generated.strip())
            except Exception:
                # API failure: degrade to a Faker sentence plus a few words
                # sampled from the base text
                words = base_text.split()
                synthetic_data.append(fake.sentence() + " " + " ".join(random.sample(words, min(len(words), 5))))
    return synthetic_data
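
# Rough shape of the fallback path (hypothetical call, no token supplied):
#   generate_synthetic_data(["hello world"], 2, "distilgpt2", None)
#   -> two entries, each a shuffled copy of the base text plus a Faker sentence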

# Sort text entries by increasing length
def sort_text_by_length(text_list):
    return sorted(text_list, key=len)

# Build a single-column DataFrame with a user-defined column name
def create_dataframe(text_list, column_text):
    return pd.DataFrame({column_text: text_list})

# Write the DataFrame to a temporary CSV file and return its path
def download_csv(df):
    with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as tmp:
        df.to_csv(tmp.name, index=False)
    return tmp.name

# Write the DataFrame to a temporary JSON file and return its path
def download_json(df):
    with tempfile.NamedTemporaryFile(delete=False, suffix='.json') as tmp:
        df.to_json(tmp.name, orient='records')
    return tmp.name
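
# Note: delete=False is deliberate; the temp files must outlive these helpers
# so that Gradio's File component can serve them for download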

# Gradio interface
with gr.Blocks() as demo:
    # Header
    gr.Markdown("# Webtaset: Website to Dataset Converter")
    gr.Markdown(
        "Extract all text from a URL, apply common-sense filtering, "
        "optionally generate synthetic data with lightweight HF models, and "
        "download the result as a dataset. Provide your own HF API token to "
        "enable the HF models; without one, a Faker-based fallback is used."
    )
| # Inputs | |
| url = gr.Textbox(label="Enter the URL", placeholder="https://example.com") | |
| column_text = gr.Textbox(label="Column name for text", value="Text") | |
| num_synthetic = gr.Slider(label="Number of synthetic data entries", minimum=0, maximum=1000, step=1, value=0) | |
| hf_model = gr.Dropdown( | |
| label="Hugging Face Model (lightweight)", | |
| choices=["distilgpt2", "facebook/bart-base", "gpt2"], | |
| value="distilgpt2" | |
| ) | |
| hf_api_token = gr.Textbox( | |
| label="Hugging Face API Token (required for HF models)", | |
| type="password", | |
| placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" | |
| ) | |
| # Process button | |
| process_btn = gr.Button("Process") | |
| # Outputs | |
| df_preview = gr.Dataframe(label="Dataset Preview") | |
| state = gr.State() # To store the DataFrame | |
| status = gr.Textbox(label="Status", interactive=False) | |
| download_csv_btn = gr.Button("Download CSV") | |
| download_json_btn = gr.Button("Download JSON") | |
| csv_file = gr.File(label="Download CSV") | |
| json_file = gr.File(label="Download JSON") | |

    # End-to-end pipeline: fetch, filter, augment, sort, and build the dataset
    def process(url, column_text, num_synthetic, hf_model, hf_api_token):
        try:
            num_synthetic = int(num_synthetic)  # sliders can deliver floats
            # Fetch the URL and extract all text
            text_list = extract_all_text_from_url(url)
            # Apply common-sense filtering
            filtered_text = apply_common_sense(text_list)
            # Generate synthetic data if requested
            if num_synthetic > 0:
                synthetic_data = generate_synthetic_data(filtered_text, num_synthetic, hf_model, hf_api_token)
                filtered_text.extend(synthetic_data)
            # Sort by increasing length
            sorted_text = sort_text_by_length(filtered_text)
            # Build the DataFrame with the user-defined column name
            df = create_dataframe(sorted_text, column_text)
            # Return the DataFrame twice: once for the preview, once for state
            method = hf_model if hf_api_token else "Faker"
            return df, df, (
                f"Processing complete. Extracted {len(text_list)} items, "
                f"filtered to {len(filtered_text) - num_synthetic}, "
                f"added {num_synthetic} synthetic entries using {method}."
            )
        except Exception as e:
            return None, None, f"Error: {e}"
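
    # The three return values above map onto outputs=[df_preview, state, status]
    # in the click wiring below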

    # Connect the process button
    process_btn.click(
        fn=process,
        inputs=[url, column_text, num_synthetic, hf_model, hf_api_token],
        outputs=[df_preview, state, status]
    )

    # Download helpers: return a file path, or nothing if no dataset exists yet
    def gen_csv(state):
        if state is None:
            return None
        return download_csv(state)

    def gen_json(state):
        if state is None:
            return None
        return download_json(state)

    # Connect the download buttons
    download_csv_btn.click(fn=gen_csv, inputs=state, outputs=csv_file)
    download_json_btn.click(fn=gen_json, inputs=state, outputs=json_file)

# Launch the app
demo.launch()