Spaces:

usmanyousaf
/

AI-WebScraper-App

Running

App Files Files Community

AI-WebScraper-App / scrape.py

usmanyousaf

Update scrape.py

bce428a verified 10 months ago

raw

history blame

1.93 kB


	from selenium import webdriver
	from selenium.common.exceptions import WebDriverException
	from selenium import webdriver # type: ignore
	from selenium.webdriver.chrome.service import Service # type: ignore
	from selenium.webdriver.chrome.options import Options # type: ignore
	from bs4 import BeautifulSoup # type: ignore
	import time

	# Filesystem path handed to selenium's Service() in scrape_website to
	# start the browser driver. It is a relative path, so it is resolved
	# against the process's current working directory.
	# NOTE(review): the name says "ChromeDriver" but the value is "./chrome" —
	# confirm this points at the chromedriver executable, not the browser.
	CHROME_DRIVER_PATH = "./chrome"

	def scrape_website(website, captcha_timeout=60.0):
	    """Load *website* in headless Chrome and return its page source.

	    After the page is loaded, the source is polled every five seconds for
	    as long as it contains the substring "captcha" (case-insensitive).
	    The browser runs headless, so nobody can actually interact with the
	    page; the wait is therefore bounded by *captcha_timeout* instead of
	    looping forever on pages that merely embed a CAPTCHA widget.

	    Args:
	        website: URL passed to ``driver.get``.
	        captcha_timeout: Maximum number of seconds to keep waiting while
	            "captcha" is present in the page source. ``None`` waits
	            without limit (the previous behaviour).

	    Returns:
	        The page source as a string. If the timeout elapses while
	        "captcha" is still present, the last page source seen is
	        returned as-is (best effort).

	    Exceptions raised by Selenium are not caught here; once the driver
	    has been created it is always quit, even on error.
	    """
	    print("Connecting to Chrome Browser...")

	    # Setup ChromeDriver service and options
	    service = Service(CHROME_DRIVER_PATH)
	    options = Options()
	    options.add_argument("--headless")  # Run in headless mode for deployment
	    driver = webdriver.Chrome(service=service, options=options)

	    try:
	        driver.get(website)
	        print("Waiting for CAPTCHA to be solved manually (if present)...")

	        # A monotonic clock keeps the deadline immune to wall-clock changes.
	        deadline = None
	        if captcha_timeout is not None:
	            deadline = time.monotonic() + captcha_timeout

	        # Check and return the same snapshot, so the caller never receives
	        # a page that differs from the one that passed the check.
	        html = driver.page_source
	        while "captcha" in html.lower():
	            pause = 5
	            if deadline is not None:
	                remaining = deadline - time.monotonic()
	                if remaining <= 0:
	                    print("CAPTCHA still detected after timeout. Scraping page content anyway...")
	                    return html
	                # Never sleep past the deadline.
	                pause = min(pause, remaining)
	            print("CAPTCHA detected, waiting...")
	            time.sleep(pause)
	            html = driver.page_source

	        print("CAPTCHA solved or not present. Scraping page content...")
	        return html

	    finally:
	        # Runs on every exit path (normal return, timeout return, exception).
	        driver.quit()

	def extract_body_content(html_content):
	    """Return the ``<body>`` element of *html_content* serialized as a string.

	    The markup is parsed with BeautifulSoup's "html.parser"; when the
	    parsed document has no body element, an empty string is returned.
	    """
	    body_tag = BeautifulSoup(html_content, "html.parser").body
	    if not body_tag:
	        return ""
	    return str(body_tag)

	def clean_body_content(body_content):
	    """Reduce HTML markup to its visible text, one non-empty line per row.

	    ``<script>`` and ``<style>`` elements are removed from the parsed tree,
	    the remaining text is extracted with newline separators, and every
	    line is stripped of surrounding whitespace; blank lines are dropped.
	    """
	    parsed = BeautifulSoup(body_content, "html.parser")

	    # Calling a soup object directly is shorthand for find_all().
	    for unwanted in parsed.find_all(["script", "style"]):
	        unwanted.extract()

	    raw_text = parsed.get_text(separator="\n")
	    stripped_lines = (text_line.strip() for text_line in raw_text.splitlines())
	    return "\n".join(kept for kept in stripped_lines if kept)

	def split_dom_content(dom_content, max_length=6000):
	    """Split *dom_content* into consecutive chunks of at most *max_length*.

	    Args:
	        dom_content: A string (or other sliceable sequence) to split.
	        max_length: Maximum size of each chunk; must be at least 1.

	    Returns:
	        A list of slices that, concatenated in order, reproduce
	        *dom_content*. The list is empty when *dom_content* is empty.

	    Raises:
	        ValueError: If *max_length* is less than 1. Without this check a
	            negative value would silently produce an empty list and a
	            zero would surface as an unrelated ``range()`` error.
	    """
	    if max_length < 1:
	        raise ValueError(
	            f"max_length must be a positive integer, got {max_length!r}"
	        )
	    return [
	        dom_content[start:start + max_length]
	        for start in range(0, len(dom_content), max_length)
	    ]