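"""Webtaset: Website to Dataset Converter.

Extracts all text from a URL, applies common-sense filtering, optionally
generates synthetic entries via the Hugging Face Inference API (with a
Faker fallback when no token is given), and serves the result as a
CSV/JSON dataset through a Gradio UI.
"""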
import gradio as gr
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tempfile
from faker import Faker
import random
from huggingface_hub import InferenceClient
# Initialize Faker for synthetic data fallback
fake = Faker()
# Function to extract ALL text from a webpage
def extract_all_text_from_url(url):
    try:
        # Time out rather than hang indefinitely on unresponsive hosts
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # stripped_strings already yields whitespace-trimmed, non-empty strings
        return list(soup.stripped_strings)
    except Exception as e:
        raise ValueError(f"Error fetching or parsing the URL: {e}") from e
# Function to apply common-sense filtering
def apply_common_sense(text_list):
    # Keep strings of at least 3 characters and de-duplicate via a set
    filtered = {text for text in text_list if len(text) >= 3 and not text.isspace()}
    return list(filtered)
# Function to generate synthetic data using HF Inference API or Faker fallback
def generate_synthetic_data(text_list, num_synthetic, hf_model, hf_api_token):
    synthetic_data = []
    if not text_list:
        text_list = [fake.sentence()]
    if not hf_api_token:
        # Fall back to Faker if no token is provided: shuffle a sampled
        # text and append a short random sentence
        for _ in range(num_synthetic):
            base_text = random.choice(text_list)
            words = base_text.split()
            random.shuffle(words)
            synthetic_data.append(" ".join(words) + " " + fake.sentence(nb_words=random.randint(3, 10)))
    else:
        # Use the HF Inference API
        client = InferenceClient(model=hf_model, token=hf_api_token)
        for _ in range(num_synthetic):
            base_text = random.choice(text_list)
            try:
                prompt = f"Generate a creative variation of this text: '{base_text}'"
                # text_generation expects max_new_tokens (max_length is not a valid parameter)
                generated = client.text_generation(prompt, max_new_tokens=50, temperature=0.7)
                synthetic_data.append(generated.strip())
            except Exception:
                # On API failure, fall back to a Faker sentence plus a few
                # words sampled from the base text
                words = base_text.split()
                synthetic_data.append(fake.sentence() + " " + " ".join(random.sample(words, min(len(words), 5))))
    return synthetic_data
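# A minimal usage sketch (hypothetical values; "hf_..." stands in for a real
# token): without a token the Faker fallback runs, with one the chosen HF
# model is queried.
#   generate_synthetic_data(["The quick brown fox."], 2, "distilgpt2", "")
#   generate_synthetic_data(["The quick brown fox."], 2, "distilgpt2", "hf_...")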
# Function to sort text by length
def sort_text_by_length(text_list):
    return sorted(text_list, key=len)
# Function to create a DataFrame with only a text column
def create_dataframe(text_list, column_text):
    return pd.DataFrame({column_text: text_list})
# Function to generate a CSV file
def download_csv(df):
    # delete=False keeps the file on disk so Gradio can serve it afterwards
    with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as tmp:
        df.to_csv(tmp.name, index=False)
    return tmp.name
# Function to generate a JSON file
def download_json(df):
    with tempfile.NamedTemporaryFile(delete=False, suffix='.json') as tmp:
        df.to_json(tmp.name, orient='records')
    return tmp.name
# Gradio interface
with gr.Blocks() as demo:
    # Header
    gr.Markdown("# Webtaset: Website to Dataset Converter")
    gr.Markdown("Extract all text from a URL, apply common-sense filtering, generate synthetic data with lightweight HF models, and download the result as a dataset. Provide your own HF API token for model-backed generation.")
    # Inputs
    url = gr.Textbox(label="Enter the URL", placeholder="https://example.com")
    column_text = gr.Textbox(label="Column name for text", value="Text")
    num_synthetic = gr.Slider(label="Number of synthetic data entries", minimum=0, maximum=1000, step=1, value=0)
    hf_model = gr.Dropdown(
        label="Hugging Face Model (lightweight)",
        choices=["distilgpt2", "facebook/bart-base", "gpt2"],
        value="distilgpt2"
    )
    hf_api_token = gr.Textbox(
        label="Hugging Face API Token (required for HF models)",
        type="password",
        placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    )
    # Process button
    process_btn = gr.Button("Process")
    # Outputs
    df_preview = gr.Dataframe(label="Dataset Preview")
    state = gr.State()  # Stores the processed DataFrame
    status = gr.Textbox(label="Status", interactive=False)
    download_csv_btn = gr.Button("Download CSV")
    download_json_btn = gr.Button("Download JSON")
    csv_file = gr.File(label="Download CSV")
    json_file = gr.File(label="Download JSON")
    # Process function
    def process(url, column_text, num_synthetic, hf_model, hf_api_token):
        try:
            # Gradio sliders can deliver floats; coerce before using as a count
            num_synthetic = int(num_synthetic)
            # Steps 1 & 2: fetch the URL and extract all text
            text_list = extract_all_text_from_url(url)
            # Apply common-sense filtering
            filtered_text = apply_common_sense(text_list)
            num_filtered = len(filtered_text)
            # Generate synthetic data if requested
            if num_synthetic > 0:
                synthetic_data = generate_synthetic_data(filtered_text, num_synthetic, hf_model, hf_api_token)
                filtered_text.extend(synthetic_data)
            # Steps 5 & 6: sort by increasing length
            sorted_text = sort_text_by_length(filtered_text)
            # Step 7: create a DataFrame with the user-defined column name
            df = create_dataframe(sorted_text, column_text)
            # Step 8: return for preview and state
            method = hf_model if hf_api_token else "Faker"
            return df, df, f"Processing complete. Extracted {len(text_list)} items, filtered to {num_filtered}, added {num_synthetic} synthetic using {method}."
        except Exception as e:
            return None, None, f"Error: {e}"
    # Connect the process button
    process_btn.click(
        fn=process,
        inputs=[url, column_text, num_synthetic, hf_model, hf_api_token],
        outputs=[df_preview, state, status]
    )
    # Download CSV handler (parameter renamed to avoid shadowing the state component)
    def gen_csv(df):
        if df is None:
            return None
        return download_csv(df)
    # Download JSON handler
    def gen_json(df):
        if df is None:
            return None
        return download_json(df)
    # Connect the download buttons
    download_csv_btn.click(fn=gen_csv, inputs=state, outputs=csv_file)
    download_json_btn.click(fn=gen_json, inputs=state, outputs=json_file)
# Launch the app
if __name__ == "__main__":
    demo.launch()
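# To run locally (assuming gradio, requests, beautifulsoup4, pandas, faker,
# and huggingface_hub are installed): python app.py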