AI-WebScraper-App / scrape.py
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time


def scrape_website(website):
    print("Connecting to Chrome Browser...")

    # Set up ChromeDriver options
    options = Options()
    options.add_argument("--headless")  # Run in headless mode for deployment
    options.add_argument("--no-sandbox")  # Overcome limited resource problems
    options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems

    # Initialize the driver without a specified service (assumes ChromeDriver is in PATH)
    wd = None
    try:
        wd = webdriver.Chrome(options=options)
        wd.set_window_size(1080, 720)  # Set the window size
        wd.get(website)
        wd.implicitly_wait(10)

        print("Waiting for CAPTCHA to be solved manually (if present)...")
        # Optional waiting loop for manual CAPTCHA solving; capped so a headless
        # run cannot hang indefinitely when no one can solve the CAPTCHA by hand.
        waited = 0
        while "captcha" in wd.page_source.lower() and waited < 60:
            print("CAPTCHA detected, waiting...")
            time.sleep(5)
            waited += 5

        print("CAPTCHA solved or not present. Scraping page content...")
        html = wd.page_source
        return html
    except WebDriverException as e:
        print(f"WebDriverException occurred: {e}")
        return None  # Return None or an empty string based on your requirement
    finally:
        if wd:
            wd.quit()


def extract_body_content(html_content):
    # Pull only the <body> of the page; return an empty string if there is no content.
    if html_content is None:
        return ""
    soup = BeautifulSoup(html_content, "html.parser")
    body_content = soup.body
    return str(body_content) if body_content else ""


def clean_body_content(body_content):
    # Strip <script> and <style> tags, then collapse the remaining text
    # into clean, newline-separated lines.
    soup = BeautifulSoup(body_content, "html.parser")
    for script_or_style in soup(["script", "style"]):
        script_or_style.extract()
    cleaned_content = soup.get_text(separator="\n")
    cleaned_content = "\n".join(
        line.strip() for line in cleaned_content.splitlines() if line.strip()
    )
    return cleaned_content


def split_dom_content(dom_content, max_length=6000):
    # Slice the cleaned text into fixed-size chunks of at most max_length characters.
    return [dom_content[i:i + max_length] for i in range(0, len(dom_content), max_length)]
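

# ------------------------------------------------------------------
# Example usage: a minimal sketch showing how the helpers above chain
# together (scrape -> extract body -> clean -> split). This block is an
# illustration, not part of the deployed app, and the URL below is a
# placeholder assumption; substitute the page you actually want to scrape.
# ------------------------------------------------------------------
if __name__ == "__main__":
    url = "https://example.com"  # placeholder URL, assumed for illustration
    raw_html = scrape_website(url)
    body_html = extract_body_content(raw_html)
    text = clean_body_content(body_html)

    # Split the cleaned text into chunks sized for downstream processing.
    for i, chunk in enumerate(split_dom_content(text, max_length=6000), start=1):
        print(f"--- chunk {i} ({len(chunk)} characters) ---")
        print(chunk[:200])  # preview the first 200 characters of each chunk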