Spaces:

PhaseTechnologies
/

Webtaset

Running

App Files Files Community

Webtaset / app.py

Phase-Technologies

Create app.py

6e6a649 verified 9 months ago

raw

history blame contribute delete

6.16 kB

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup
	import pandas as pd
	import tempfile
	from faker import Faker
	import random
	from huggingface_hub import InferenceClient

	# Initialize Faker for synthetic data fallback
	fake = Faker()

	# Function to extract ALL text from a webpage
	def extract_all_text_from_url(url):
	try:
	response = requests.get(url)
	response.raise_for_status()
	soup = BeautifulSoup(response.text, 'html.parser')
	text_elements = [text.strip() for text in soup.stripped_strings if text.strip()]
	return text_elements
	except Exception as e:
	raise ValueError(f"Error fetching or parsing the URL: {e}")

	# Function to apply common-sense filtering
	def apply_common_sense(text_list):
	filtered = set([text for text in text_list if len(text) >= 3 and not text.isspace()])
	return list(filtered)

	# Function to generate synthetic data using HF Inference API or Faker fallback
	def generate_synthetic_data(text_list, num_synthetic, hf_model, hf_api_token):
	synthetic_data = []
	if not text_list:
	text_list = [fake.sentence()]

	if not hf_api_token:
	# Fallback to Faker if no token provided
	for _ in range(num_synthetic):
	base_text = random.choice(text_list)
	words = base_text.split()
	random.shuffle(words)
	synthetic_data.append(" ".join(words) + " " + fake.sentence(nb_words=random.randint(3, 10)))
	else:
	# Use HF Inference API
	client = InferenceClient(model=hf_model, token=hf_api_token)
	for _ in range(num_synthetic):
	base_text = random.choice(text_list)
	try:
	prompt = f"Generate a creative variation of this text: '{base_text}'"
	generated = client.text_generation(prompt, max_length=50, temperature=0.7)
	synthetic_data.append(generated.strip())
	except Exception as e:
	synthetic_data.append(fake.sentence() + " " + " ".join(random.sample(base_text.split(), min(len(base_text.split()), 5))))

	return synthetic_data

	# Function to sort text by length
	def sort_text_by_length(text_list):
	return sorted(text_list, key=len)

	# Function to create a DataFrame with only a text column
	def create_dataframe(text_list, column_text):
	df = pd.DataFrame({column_text: text_list})
	return df

	# Function to generate a CSV file
	def download_csv(df):
	with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as tmp:
	df.to_csv(tmp.name, index=False)
	return tmp.name

	# Function to generate a JSON file
	def download_json(df):
	with tempfile.NamedTemporaryFile(delete=False, suffix='.json') as tmp:
	df.to_json(tmp.name, orient='records')
	return tmp.name

	# Gradio interface
	with gr.Blocks() as demo:
	# Header
	gr.Markdown("# Webtaset: Website to Dataset Converter")
	gr.Markdown("Extract all text from a URL, apply common-sense filtering, generate synthetic data with lightweight HF models, and download as a dataset. Provide your own HF API token for advanced features.")

	# Inputs
	url = gr.Textbox(label="Enter the URL", placeholder="https://example.com")
	column_text = gr.Textbox(label="Column name for text", value="Text")
	num_synthetic = gr.Slider(label="Number of synthetic data entries", minimum=0, maximum=1000, step=1, value=0)
	hf_model = gr.Dropdown(
	label="Hugging Face Model (lightweight)",
	choices=["distilgpt2", "facebook/bart-base", "gpt2"],
	value="distilgpt2"
	)
	hf_api_token = gr.Textbox(
	label="Hugging Face API Token (required for HF models)",
	type="password",
	placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
	)

	# Process button
	process_btn = gr.Button("Process")

	# Outputs
	df_preview = gr.Dataframe(label="Dataset Preview")
	state = gr.State() # To store the DataFrame
	status = gr.Textbox(label="Status", interactive=False)

	download_csv_btn = gr.Button("Download CSV")
	download_json_btn = gr.Button("Download JSON")
	csv_file = gr.File(label="Download CSV")
	json_file = gr.File(label="Download JSON")

	# Process function
	def process(url, column_text, num_synthetic, hf_model, hf_api_token):
	try:
	# Step 1 & 2: Get URL and extract ALL text
	text_list = extract_all_text_from_url(url)

	# Add common-sense filtering
	filtered_text = apply_common_sense(text_list)

	# Generate synthetic data if requested
	if num_synthetic > 0:
	synthetic_data = generate_synthetic_data(filtered_text, num_synthetic, hf_model, hf_api_token)
	filtered_text.extend(synthetic_data)

	# Step 5 & 6: Sort by increasing size
	sorted_text = sort_text_by_length(filtered_text)

	# Step 7: Create DataFrame with user-defined column name
	df = create_dataframe(sorted_text, column_text)

	# Step 8: Return for preview and state
	method = "Faker" if not hf_api_token else hf_model
	return df, df, f"Processing complete. Extracted {len(text_list)} items, filtered to {len(filtered_text) - num_synthetic}, added {num_synthetic} synthetic using {method}."
	except Exception as e:
	return None, None, f"Error: {e}"

	# Connect process button
	process_btn.click(
	fn=process,
	inputs=[url, column_text, num_synthetic, hf_model, hf_api_token],
	outputs=[df_preview, state, status]
	)

	# Download CSV function
	def gen_csv(state):
	if state is None:
	return None
	return download_csv(state)

	# Download JSON function
	def gen_json(state):
	if state is None:
	return None
	return download_json(state)

	# Connect download buttons
	download_csv_btn.click(fn=gen_csv, inputs=state, outputs=csv_file)
	download_json_btn.click(fn=gen_json, inputs=state, outputs=json_file)

	# Launch the app
	demo.launch()