import gradio as gr
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tempfile
from faker import Faker
import random
from huggingface_hub import InferenceClient

# Initialize Faker for the synthetic-data fallback
fake = Faker()

# Function to extract ALL text from a webpage
def extract_all_text_from_url(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # stripped_strings already trims whitespace and skips whitespace-only nodes
        return list(soup.stripped_strings)
    except Exception as e:
        raise ValueError(f"Error fetching or parsing the URL: {e}")

# Function to apply common-sense filtering: drop very short entries and deduplicate
def apply_common_sense(text_list):
    filtered = {text for text in text_list if len(text) >= 3 and not text.isspace()}
    return list(filtered)

# Function to generate synthetic data using the HF Inference API, with a Faker fallback
def generate_synthetic_data(text_list, num_synthetic, hf_model, hf_api_token):
    synthetic_data = []
    num_synthetic = int(num_synthetic)  # the slider value may arrive as a float
    if not text_list:
        text_list = [fake.sentence()]
    if not hf_api_token:
        # Fallback to Faker if no token is provided
        for _ in range(num_synthetic):
            base_text = random.choice(text_list)
            words = base_text.split()
            random.shuffle(words)
            synthetic_data.append(" ".join(words) + " " + fake.sentence(nb_words=random.randint(3, 10)))
    else:
        # Use the HF Inference API
        client = InferenceClient(model=hf_model, token=hf_api_token)
        for _ in range(num_synthetic):
            base_text = random.choice(text_list)
            try:
                prompt = f"Generate a creative variation of this text: '{base_text}'"
                generated = client.text_generation(prompt, max_new_tokens=50, temperature=0.7)
                synthetic_data.append(generated.strip())
            except Exception:
                # If the API call fails, fall back to a Faker sentence plus a few words from the base text
                words = base_text.split()
                synthetic_data.append(fake.sentence() + " " + " ".join(random.sample(words, min(len(words), 5))))
    return synthetic_data

# Function to sort text by increasing length
def sort_text_by_length(text_list):
    return sorted(text_list, key=len)

# Function to create a DataFrame with a single text column
def create_dataframe(text_list, column_text):
    return pd.DataFrame({column_text: text_list})

# Function to write the DataFrame to a temporary CSV file and return its path
def download_csv(df):
    with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as tmp:
        df.to_csv(tmp.name, index=False)
        return tmp.name

# Function to write the DataFrame to a temporary JSON file and return its path
def download_json(df):
    with tempfile.NamedTemporaryFile(delete=False, suffix='.json') as tmp:
        df.to_json(tmp.name, orient='records')
        return tmp.name
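# A minimal usage sketch (comments only), showing how the helpers above could be
# chained without the Gradio UI; the URL and column name here are placeholders:
#
#   texts = extract_all_text_from_url("https://example.com")
#   filtered = apply_common_sense(texts)
#   df = create_dataframe(sort_text_by_length(filtered), "Text")
#   print(download_csv(df))  # prints the path of a temporary CSV file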
# Gradio interface
with gr.Blocks() as demo:
    # Header
    gr.Markdown("# Webtaset: Website to Dataset Converter")
    gr.Markdown(
        "Extract all text from a URL, apply common-sense filtering, generate synthetic data "
        "with lightweight HF models, and download the result as a dataset. "
        "Provide your own HF API token for advanced features."
    )

    # Inputs
    url = gr.Textbox(label="Enter the URL", placeholder="https://example.com")
    column_text = gr.Textbox(label="Column name for text", value="Text")
    num_synthetic = gr.Slider(label="Number of synthetic data entries", minimum=0, maximum=1000, step=1, value=0)
    hf_model = gr.Dropdown(
        label="Hugging Face Model (lightweight)",
        choices=["distilgpt2", "facebook/bart-base", "gpt2"],
        value="distilgpt2"
    )
    hf_api_token = gr.Textbox(
        label="Hugging Face API Token (required for HF models)",
        type="password",
        placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    )

    # Process button
    process_btn = gr.Button("Process")

    # Outputs
    df_preview = gr.Dataframe(label="Dataset Preview")
    state = gr.State()  # Stores the processed DataFrame between events
    status = gr.Textbox(label="Status", interactive=False)
    download_csv_btn = gr.Button("Download CSV")
    download_json_btn = gr.Button("Download JSON")
    csv_file = gr.File(label="CSV file")
    json_file = gr.File(label="JSON file")

    # Process function: extract, filter, optionally augment, sort, and build the DataFrame
    def process(url, column_text, num_synthetic, hf_model, hf_api_token):
        try:
            num_synthetic = int(num_synthetic)
            # Extract ALL text from the URL
            text_list = extract_all_text_from_url(url)
            # Apply common-sense filtering
            filtered_text = apply_common_sense(text_list)
            num_filtered = len(filtered_text)
            # Generate synthetic data if requested
            if num_synthetic > 0:
                synthetic_data = generate_synthetic_data(filtered_text, num_synthetic, hf_model, hf_api_token)
                filtered_text.extend(synthetic_data)
            # Sort by increasing length
            sorted_text = sort_text_by_length(filtered_text)
            # Create the DataFrame with the user-defined column name
            df = create_dataframe(sorted_text, column_text)
            # Return the DataFrame for the preview and for the download state
            method = hf_model if hf_api_token else "Faker"
            return df, df, (
                f"Processing complete. Extracted {len(text_list)} items, "
                f"filtered to {num_filtered}, added {num_synthetic} synthetic entries using {method}."
            )
        except Exception as e:
            return None, None, f"Error: {e}"

    # Connect the process button
    process_btn.click(
        fn=process,
        inputs=[url, column_text, num_synthetic, hf_model, hf_api_token],
        outputs=[df_preview, state, status]
    )

    # Download CSV function
    def gen_csv(df):
        if df is None:
            return None
        return download_csv(df)

    # Download JSON function
    def gen_json(df):
        if df is None:
            return None
        return download_json(df)

    # Connect the download buttons
    download_csv_btn.click(fn=gen_csv, inputs=state, outputs=csv_file)
    download_json_btn.click(fn=gen_json, inputs=state, outputs=json_file)

# Launch the app
demo.launch()
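# Hedged setup note: assuming this script is saved as app.py (the filename is not
# specified here), the dependencies implied by the imports above can be installed
# and the app started with something like:
#   pip install gradio requests beautifulsoup4 pandas Faker huggingface_hub
#   python app.py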